Repository: clxia12/RT-DETRv3
Branch: main
Commit: 349e7d99a506
Files: 393
Total size: 3.9 MB
Directory structure:
RT-DETRv3/
├── .gitignore
├── LICENSE
├── README.md
├── configs/
│ ├── datasets/
│ │ ├── coco_detection.yml
│ │ ├── coco_instance.yml
│ │ ├── culane.yml
│ │ ├── dota.yml
│ │ ├── dota_ms.yml
│ │ ├── lvis_detection.yml
│ │ ├── mcmot.yml
│ │ ├── mot.yml
│ │ ├── objects365_detection.yml
│ │ ├── roadsign_voc.yml
│ │ ├── sniper_coco_detection.yml
│ │ ├── sniper_visdrone_detection.yml
│ │ ├── spine_coco.yml
│ │ ├── visdrone_detection.yml
│ │ ├── voc.yml
│ │ └── wider_face.yml
│ ├── rtdetrv3/
│ │ ├── _base_/
│ │ │ ├── optimizer_6x.yml
│ │ │ ├── rtdetr_reader.yml
│ │ │ └── rtdetrv3_r50vd.yml
│ │ ├── rtdetrv3_r18vd_6x_coco.yml
│ │ ├── rtdetrv3_r18vd_6x_lvis.yml
│ │ ├── rtdetrv3_r34vd_6x_coco.yml
│ │ ├── rtdetrv3_r50vd_6x_coco.yml
│ │ └── rtdetrv3_r50vd_6x_lvis.yml
│ └── runtime.yml
├── dataset/
│ ├── coco/
│ │ └── download_coco.py
│ ├── dota/
│ │ └── .gitignore
│ ├── mot/
│ │ └── gen_labels_MOT.py
│ ├── roadsign_voc/
│ │ ├── download_roadsign_voc.py
│ │ └── label_list.txt
│ ├── spine_coco/
│ │ └── download_spine_coco.py
│ ├── voc/
│ │ ├── create_list.py
│ │ ├── download_voc.py
│ │ └── label_list.txt
│ └── wider_face/
│ └── download_wider_face.sh
├── ppdet/
│ ├── __init__.py
│ ├── core/
│ │ ├── __init__.py
│ │ ├── config/
│ │ │ ├── __init__.py
│ │ │ ├── schema.py
│ │ │ └── yaml_helpers.py
│ │ └── workspace.py
│ ├── data/
│ │ ├── __init__.py
│ │ ├── crop_utils/
│ │ │ ├── __init__.py
│ │ │ ├── annotation_cropper.py
│ │ │ └── chip_box_utils.py
│ │ ├── culane_utils.py
│ │ ├── reader.py
│ │ ├── shm_utils.py
│ │ ├── source/
│ │ │ ├── __init__.py
│ │ │ ├── category.py
│ │ │ ├── coco.py
│ │ │ ├── culane.py
│ │ │ ├── dataset.py
│ │ │ ├── keypoint_coco.py
│ │ │ ├── lvis.py
│ │ │ ├── mot.py
│ │ │ ├── pose3d_cmb.py
│ │ │ ├── sniper_coco.py
│ │ │ ├── voc.py
│ │ │ └── widerface.py
│ │ ├── transform/
│ │ │ ├── __init__.py
│ │ │ ├── atss_assigner.py
│ │ │ ├── autoaugment_utils.py
│ │ │ ├── batch_operators.py
│ │ │ ├── culane_operators.py
│ │ │ ├── gridmask_utils.py
│ │ │ ├── keypoint_operators.py
│ │ │ ├── keypoints_3d_operators.py
│ │ │ ├── mot_operators.py
│ │ │ ├── op_helper.py
│ │ │ ├── operators.py
│ │ │ └── rotated_operators.py
│ │ └── utils.py
│ ├── engine/
│ │ ├── __init__.py
│ │ ├── callbacks.py
│ │ ├── env.py
│ │ ├── export_utils.py
│ │ ├── naive_sync_bn.py
│ │ ├── tracker.py
│ │ ├── trainer.py
│ │ ├── trainer_cot.py
│ │ └── trainer_ssod.py
│ ├── ext_op/
│ │ ├── README.md
│ │ ├── csrc/
│ │ │ ├── matched_rbox_iou/
│ │ │ │ ├── matched_rbox_iou.cc
│ │ │ │ └── matched_rbox_iou.cu
│ │ │ ├── nms_rotated/
│ │ │ │ ├── nms_rotated.cc
│ │ │ │ └── nms_rotated.cu
│ │ │ └── rbox_iou/
│ │ │ ├── rbox_iou.cc
│ │ │ ├── rbox_iou.cu
│ │ │ └── rbox_iou_utils.h
│ │ ├── setup.py
│ │ └── unittest/
│ │ ├── test_matched_rbox_iou.py
│ │ └── test_rbox_iou.py
│ ├── metrics/
│ │ ├── __init__.py
│ │ ├── coco_utils.py
│ │ ├── culane_metrics.py
│ │ ├── fast_cocoeval/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── ext/
│ │ │ │ ├── cocoeval.cc
│ │ │ │ ├── cocoeval.h
│ │ │ │ └── setup.py
│ │ │ └── fast_cocoeval.py
│ │ ├── json_results.py
│ │ ├── keypoint_metrics.py
│ │ ├── lvis_utils.py
│ │ ├── map_utils.py
│ │ ├── mcmot_metrics.py
│ │ ├── metrics.py
│ │ ├── mot_metrics.py
│ │ ├── munkres.py
│ │ ├── pose3d_metrics.py
│ │ └── widerface_utils.py
│ ├── model_zoo/
│ │ ├── .gitignore
│ │ ├── __init__.py
│ │ ├── model_zoo.py
│ │ └── tests/
│ │ ├── __init__.py
│ │ ├── test_get_model.py
│ │ └── test_list_model.py
│ ├── modeling/
│ │ ├── __init__.py
│ │ ├── architectures/
│ │ │ ├── __init__.py
│ │ │ ├── blazeface.py
│ │ │ ├── bytetrack.py
│ │ │ ├── cascade_rcnn.py
│ │ │ ├── centernet.py
│ │ │ ├── centertrack.py
│ │ │ ├── clrnet.py
│ │ │ ├── deepsort.py
│ │ │ ├── detr.py
│ │ │ ├── detr_ssod.py
│ │ │ ├── fairmot.py
│ │ │ ├── faster_rcnn.py
│ │ │ ├── fcos.py
│ │ │ ├── gfl.py
│ │ │ ├── jde.py
│ │ │ ├── keypoint_hrhrnet.py
│ │ │ ├── keypoint_hrnet.py
│ │ │ ├── keypoint_petr.py
│ │ │ ├── keypoint_vitpose.py
│ │ │ ├── mask_rcnn.py
│ │ │ ├── meta_arch.py
│ │ │ ├── multi_stream_detector.py
│ │ │ ├── picodet.py
│ │ │ ├── pose3d_metro.py
│ │ │ ├── ppyoloe.py
│ │ │ ├── queryinst.py
│ │ │ ├── retinanet.py
│ │ │ ├── rtdetrv3.py
│ │ │ ├── s2anet.py
│ │ │ ├── solov2.py
│ │ │ ├── sparse_rcnn.py
│ │ │ ├── ssd.py
│ │ │ ├── tood.py
│ │ │ ├── ttfnet.py
│ │ │ ├── yolo.py
│ │ │ ├── yolof.py
│ │ │ └── yolox.py
│ │ ├── assigners/
│ │ │ ├── __init__.py
│ │ │ ├── atss_assigner.py
│ │ │ ├── clrnet_assigner.py
│ │ │ ├── fcosr_assigner.py
│ │ │ ├── hungarian_assigner.py
│ │ │ ├── max_iou_assigner.py
│ │ │ ├── pose_utils.py
│ │ │ ├── rotated_task_aligned_assigner.py
│ │ │ ├── simota_assigner.py
│ │ │ ├── task_aligned_assigner.py
│ │ │ ├── task_aligned_assigner_cr.py
│ │ │ ├── uniform_assigner.py
│ │ │ └── utils.py
│ │ ├── backbones/
│ │ │ ├── __init__.py
│ │ │ ├── blazenet.py
│ │ │ ├── clrnet_resnet.py
│ │ │ ├── convnext.py
│ │ │ ├── csp_darknet.py
│ │ │ ├── cspresnet.py
│ │ │ ├── darknet.py
│ │ │ ├── dla.py
│ │ │ ├── esnet.py
│ │ │ ├── focalnet.py
│ │ │ ├── ghostnet.py
│ │ │ ├── hardnet.py
│ │ │ ├── hgnet_v2.py
│ │ │ ├── hrnet.py
│ │ │ ├── lcnet.py
│ │ │ ├── lite_hrnet.py
│ │ │ ├── mobilenet_v1.py
│ │ │ ├── mobilenet_v3.py
│ │ │ ├── mobileone.py
│ │ │ ├── name_adapter.py
│ │ │ ├── res2net.py
│ │ │ ├── resnet.py
│ │ │ ├── senet.py
│ │ │ ├── shufflenet_v2.py
│ │ │ ├── swin_transformer.py
│ │ │ ├── trans_encoder.py
│ │ │ ├── transformer_utils.py
│ │ │ ├── vgg.py
│ │ │ ├── vision_transformer.py
│ │ │ ├── vit_mae.py
│ │ │ └── vitpose.py
│ │ ├── bbox_utils.py
│ │ ├── clrnet_utils.py
│ │ ├── cls_utils.py
│ │ ├── heads/
│ │ │ ├── __init__.py
│ │ │ ├── bbox_head.py
│ │ │ ├── cascade_head.py
│ │ │ ├── centernet_head.py
│ │ │ ├── centertrack_head.py
│ │ │ ├── clrnet_head.py
│ │ │ ├── detr_head.py
│ │ │ ├── face_head.py
│ │ │ ├── fcos_head.py
│ │ │ ├── fcosr_head.py
│ │ │ ├── gfl_head.py
│ │ │ ├── keypoint_hrhrnet_head.py
│ │ │ ├── mask_head.py
│ │ │ ├── petr_head.py
│ │ │ ├── pico_head.py
│ │ │ ├── ppyoloe_contrast_head.py
│ │ │ ├── ppyoloe_head.py
│ │ │ ├── ppyoloe_ins_head.py
│ │ │ ├── ppyoloe_r_head.py
│ │ │ ├── retina_head.py
│ │ │ ├── roi_extractor.py
│ │ │ ├── s2anet_head.py
│ │ │ ├── simota_head.py
│ │ │ ├── solov2_head.py
│ │ │ ├── sparse_roi_head.py
│ │ │ ├── sparsercnn_head.py
│ │ │ ├── ssd_head.py
│ │ │ ├── tood_head.py
│ │ │ ├── ttf_head.py
│ │ │ ├── vitpose_head.py
│ │ │ ├── yolo_head.py
│ │ │ └── yolof_head.py
│ │ ├── initializer.py
│ │ ├── keypoint_utils.py
│ │ ├── lane_utils.py
│ │ ├── layers.py
│ │ ├── losses/
│ │ │ ├── __init__.py
│ │ │ ├── clrnet_line_iou_loss.py
│ │ │ ├── clrnet_loss.py
│ │ │ ├── cot_loss.py
│ │ │ ├── ctfocal_loss.py
│ │ │ ├── detr_loss.py
│ │ │ ├── fairmot_loss.py
│ │ │ ├── fcos_loss.py
│ │ │ ├── focal_loss.py
│ │ │ ├── gfocal_loss.py
│ │ │ ├── iou_aware_loss.py
│ │ │ ├── iou_loss.py
│ │ │ ├── jde_loss.py
│ │ │ ├── keypoint_loss.py
│ │ │ ├── pose3d_loss.py
│ │ │ ├── probiou_loss.py
│ │ │ ├── queryinst_loss.py
│ │ │ ├── smooth_l1_loss.py
│ │ │ ├── solov2_loss.py
│ │ │ ├── sparsercnn_loss.py
│ │ │ ├── ssd_loss.py
│ │ │ ├── supcontrast.py
│ │ │ ├── varifocal_loss.py
│ │ │ └── yolo_loss.py
│ │ ├── mot/
│ │ │ ├── __init__.py
│ │ │ ├── matching/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── deepsort_matching.py
│ │ │ │ ├── jde_matching.py
│ │ │ │ └── ocsort_matching.py
│ │ │ ├── motion/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── gmc.py
│ │ │ │ ├── kalman_filter.py
│ │ │ │ └── ocsort_kalman_filter.py
│ │ │ ├── tracker/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base_jde_tracker.py
│ │ │ │ ├── base_sde_tracker.py
│ │ │ │ ├── botsort_tracker.py
│ │ │ │ ├── center_tracker.py
│ │ │ │ ├── deepsort_tracker.py
│ │ │ │ ├── jde_tracker.py
│ │ │ │ └── ocsort_tracker.py
│ │ │ ├── utils.py
│ │ │ └── visualization.py
│ │ ├── necks/
│ │ │ ├── __init__.py
│ │ │ ├── bifpn.py
│ │ │ ├── blazeface_fpn.py
│ │ │ ├── centernet_fpn.py
│ │ │ ├── channel_mapper.py
│ │ │ ├── clrnet_fpn.py
│ │ │ ├── csp_pan.py
│ │ │ ├── custom_pan.py
│ │ │ ├── dilated_encoder.py
│ │ │ ├── es_pan.py
│ │ │ ├── fpn.py
│ │ │ ├── hrfpn.py
│ │ │ ├── lc_pan.py
│ │ │ ├── ttf_fpn.py
│ │ │ └── yolo_fpn.py
│ │ ├── ops.py
│ │ ├── post_process.py
│ │ ├── proposal_generator/
│ │ │ ├── __init__.py
│ │ │ ├── anchor_generator.py
│ │ │ ├── embedding_rpn_head.py
│ │ │ ├── proposal_generator.py
│ │ │ ├── rpn_head.py
│ │ │ ├── target.py
│ │ │ └── target_layer.py
│ │ ├── rbox_utils.py
│ │ ├── reid/
│ │ │ ├── __init__.py
│ │ │ ├── fairmot_embedding_head.py
│ │ │ ├── jde_embedding_head.py
│ │ │ ├── pplcnet_embedding.py
│ │ │ ├── pyramidal_embedding.py
│ │ │ ├── resnet.py
│ │ │ └── resnet_embedding.py
│ │ ├── shape_spec.py
│ │ ├── ssod/
│ │ │ ├── __init__.py
│ │ │ ├── losses.py
│ │ │ └── utils.py
│ │ ├── tests/
│ │ │ ├── __init__.py
│ │ │ ├── test_architectures.py
│ │ │ ├── test_base.py
│ │ │ ├── test_mstest.py
│ │ │ ├── test_ops.py
│ │ │ └── test_yolov3_loss.py
│ │ └── transformers/
│ │ ├── __init__.py
│ │ ├── deformable_transformer.py
│ │ ├── detr_transformer.py
│ │ ├── dino_transformer.py
│ │ ├── ext_op/
│ │ │ ├── README.md
│ │ │ ├── ms_deformable_attn_op.cc
│ │ │ ├── ms_deformable_attn_op.cu
│ │ │ ├── setup_ms_deformable_attn_op.py
│ │ │ └── test_ms_deformable_attn_op.py
│ │ ├── group_detr_transformer.py
│ │ ├── hybrid_encoder.py
│ │ ├── mask_dino_transformer.py
│ │ ├── mask_rtdetr_transformer.py
│ │ ├── matchers.py
│ │ ├── petr_transformer.py
│ │ ├── position_encoding.py
│ │ ├── rtdetr_transformer.py
│ │ ├── rtdetr_transformerv2.py
│ │ ├── rtdetr_transformerv3.py
│ │ └── utils.py
│ ├── optimizer/
│ │ ├── __init__.py
│ │ ├── adamw.py
│ │ ├── ema.py
│ │ ├── optimizer.py
│ │ └── utils.py
│ ├── slim/
│ │ ├── __init__.py
│ │ ├── distill_loss.py
│ │ ├── distill_model.py
│ │ ├── ofa.py
│ │ ├── prune.py
│ │ ├── quant.py
│ │ └── unstructured_prune.py
│ └── utils/
│ ├── __init__.py
│ ├── cam_utils.py
│ ├── check.py
│ ├── checkpoint.py
│ ├── cli.py
│ ├── colormap.py
│ ├── compact.py
│ ├── download.py
│ ├── fuse_utils.py
│ ├── logger.py
│ ├── profiler.py
│ ├── stats.py
│ ├── visualizer.py
│ └── voc_utils.py
├── requirements.txt
├── scripts/
│ ├── build_wheel.sh
│ ├── eval.sh
│ ├── kill.sh
│ └── train.sh
└── tools/
├── anchor_cluster.py
├── box_distribution.py
├── cam_ppdet.py
├── eval.py
├── eval_mot.py
├── export_model.py
├── gen_semi_coco.py
├── infer.py
├── infer_culane.py
├── infer_mot.py
├── post_quant.py
├── slice_image.py
├── sniper_params_stats.py
├── train.py
└── x2coco.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
English | [简体中文](README_cn.md)
## RT-DETRv3: Real-time End-to-End Object Detection with Hierarchical Dense Positive Supervision
:fire::fire:**[WACV 2025 Oral]** The official implementation of the paper "[RT-DETRv3: Real-time End-to-End Object Detection with Hierarchical Dense Positive Supervision](https://arxiv.org/pdf/2409.08475)". \
[[`arXiv`](https://arxiv.org/pdf/2409.08475)]

## Model Zoo on COCO
| Model | Epoch | Backbone | Input shape | $AP^{val}$ | $AP^{val}_{50}$| Params(M) | FLOPs(G) | T4 TensorRT FP16(FPS) | Weight | Config | Log
|:--------------:|:-----:|:----------:| :-------:|:--------------------------:|:---------------------------:|:---------:|:--------:| :---------------------: |:------------------------------------------------------------------------------------:|:-------------------------------------------:|:---|
| RT-DETRv3-R18 | 6x | ResNet-18 | 640 | 48.1 | 66.2 | 20 | 60 | 217 |[baidu 网盘](https://pan.baidu.com/s/1s7lyT6_fHmczoegQZXdX-w?pwd=54jp) [google drive](https://drive.google.com/file/d/1zIDOjn1qDccC3TBsDlGQHOjVrehd26bk/view?usp=drive_link)| [config](./configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml) |
| RT-DETRv3-R34 | 6x | ResNet-34 | 640 | 49.9 | 67.7 | 31 | 92 | 161 | [baidu 网盘](https://pan.baidu.com/s/1VCg6oqNVF9_ZZdmlhUBgSA?pwd=pi32) [google drive](https://drive.google.com/file/d/12-wqAF8i67eqbocaWPK33d4tFkN2wGi2/view?usp=drive_link)| [config](./configs/rtdetrv3/rtdetrv3_r34vd_6x_coco.yml) |
| RT-DETRv3-R50 | 6x | ResNet-50 | 640 | 53.4 | 71.7 | 42 | 136 | 108 | [baidu 网盘](https://pan.baidu.com/s/1DuvrpMIqbU5okoDp16C94g?pwd=wrxy) [google drive](https://drive.google.com/file/d/1wfJE-QgdgqKE0IkiTuoD5HEbZwwZg3sQ/view?usp=drive_link)| [config](./configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml) |
| RT-DETRv3-R101 | 6x | ResNet-101 | 640 | 54.6 | 73.1 | 76 | 259 | 74 | | [config](./configs/rtdetrv3/rtdetrv3_r101vd_6x_coco.yml) |
**Notes:**
- RT-DETRv3 uses 4 GPUs for training.
- RT-DETRv3 was trained on COCO train2017 and evaluated on val2017.
## Model Zoo on LVIS
| Model | Epoch | Backbone | Input shape | AP | $AP_{r}$ | $AP_{c}$ | $AP_{f}$ | Weight | Config | Log
|:--------------:|:-----:|:----------:| :-------:|:--------------------------:|:---------------------------:|:---------:| :---------------------: |:------------------------------------------------------------------------------------:|:-------------------------------------------:|:---|
| RT-DETRv3-R18 | 6x | ResNet-18 | 640 | 26.5 | 12.5 | 24.3 | 35.2 | | [config](./configs/rtdetrv3/rtdetrv3_r18vd_6x_lvis.yml) |
| RT-DETRv3-R50 | 6x | ResNet-50 | 640 | 33.9 | 20.2 | 32.5 | 41.5 | | [config](./configs/rtdetrv3/rtdetrv3_r50vd_6x_lvis.yml) |
## Quick start
Install requirements
```bash
pip install -r requirements.txt
```
Compile the custom multi-scale deformable attention operator (optional)
```bash
cd ./ppdet/modeling/transformers/ext_op/
python setup_ms_deformable_attn_op.py install
```
See [details](./ppdet/modeling/transformers/ext_op/)
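After compiling, a quick sanity check is to try importing the op. The module name `deformable_detr_ops` is an assumption taken from the ext_op tests; adjust it if your build registers a different name.
```python
# Hedged sanity check: module/function names assumed from ext_op's tests.
try:
    from deformable_detr_ops import ms_deformable_attn  # compiled custom op
    print("custom ms_deformable_attn op is available")
except ImportError:
    print("custom op not found; the pure-Paddle fallback will be used")
```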
Data preparation
- Download and extract the COCO 2017 train and val images, together with the annotations, organized as:
```
path/to/coco/
  annotations/  # annotation json files
  train2017/    # train images
  val2017/      # val images
```
- Point [`dataset_dir`](configs/datasets/coco_detection.yml) in the dataset config at your COCO directory.
Training & Evaluation & Testing
- Training on a Single GPU:
```shell
# training on single-GPU
export CUDA_VISIBLE_DEVICES=0
python tools/train.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml --eval
```
- Training on Multiple GPUs:
```shell
# training on multi-GPU
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml --fleet --eval
```
- Evaluation:
```shell
python tools/eval.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml \
-o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetrv3_r18vd_6x_coco.pdparams
```
- Inference:
```shell
python tools/infer.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml \
-o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetrv3_r18vd_6x_coco.pdparams \
--infer_img=./demo/000000570688.jpg
```
## Deploy
1. Export model
```shell
python tools/export_model.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml \
-o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetrv3_r18vd_6x_coco.pdparams trt=True \
--output_dir=output_inference
```
2. Convert to ONNX
- Install [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX) and ONNX
```shell
pip install onnx==1.13.0
pip install paddle2onnx==1.0.5
```
- Convert:
```shell
paddle2onnx --model_dir=./output_inference/rtdetrv3_r18vd_6x_coco/ \
--model_filename model.pdmodel \
--params_filename model.pdiparams \
--opset_version 16 \
--save_file rtdetrv3_r18vd_6x_coco.onnx
```
3. Convert to TensorRT
- TensorRT version >= 8.5.1
- For inference benchmarking, refer to [Benchmark](../benchmark)
```shell
trtexec --onnx=./rtdetrv3_r18vd_6x_coco.onnx \
--workspace=4096 \
--shapes=image:1x3x640x640 \
--saveEngine=rtdetrv3_r18vd_6x_coco.trt \
--avgRuns=100 \
--fp16
```
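Before building the TensorRT engine, it can help to smoke-test the exported ONNX file. A minimal sketch with onnxruntime (input names and dtypes depend on your export flags, so the feed dict is built generically here and is only illustrative):
```python
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("rtdetrv3_r18vd_6x_coco.onnx",
                            providers=["CPUExecutionProvider"])
# Build a dummy feed for every declared input; dynamic dims fall back to 1.
feeds = {}
for inp in sess.get_inputs():
    shape = [d if isinstance(d, int) else 1 for d in inp.shape]
    feeds[inp.name] = np.random.rand(*shape).astype(np.float32)
outputs = sess.run(None, feeds)
print([o.shape for o in outputs])
```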
## Citation
If you find RT-DETRv3 useful in your research, please consider giving a star ⭐ and citing:
```
@article{wang2024rt,
  title={RT-DETRv3: Real-time End-to-End Object Detection with Hierarchical Dense Positive Supervision},
  author={Wang, Shuo and Xia, Chunlong and Lv, Feng and Shi, Yifeng},
  journal={arXiv preprint arXiv:2409.08475},
  year={2024}
}
```
================================================
FILE: configs/datasets/coco_detection.yml
================================================
metric: COCO
num_classes: 80

TrainDataset:
  name: COCODataSet
  image_dir: train2017
  anno_path: annotations/instances_train2017.json
  dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO
  data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']

EvalDataset:
  name: COCODataSet
  image_dir: val2017
  anno_path: annotations/instances_val2017.json
  dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO
  allow_empty: true

TestDataset:
  name: ImageFolder
  anno_path: annotations/instances_val2017.json # also support txt (like VOC's label_list.txt)
  dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path'
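To double-check that `dataset_dir` points where you expect before launching training, a minimal sketch (assuming the repo root is on `PYTHONPATH` so `ppdet` imports):
```python
from ppdet.core.workspace import load_config

# load_config merges this dataset config through the _BASE_ chain
cfg = load_config('configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml')
for key in ('TrainDataset', 'EvalDataset'):
    print(key, cfg[key])
```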
================================================
FILE: configs/datasets/coco_instance.yml
================================================
metric: COCO
num_classes: 80

TrainDataset:
  name: COCODataSet
  image_dir: train2017
  anno_path: annotations/instances_train2017.json
  dataset_dir: dataset/coco
  data_fields: ['image', 'gt_bbox', 'gt_class', 'gt_poly', 'is_crowd']

EvalDataset:
  name: COCODataSet
  image_dir: val2017
  anno_path: annotations/instances_val2017.json
  dataset_dir: dataset/coco

TestDataset:
  name: ImageFolder
  anno_path: annotations/instances_val2017.json # also support txt (like VOC's label_list.txt)
  dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path'
================================================
FILE: configs/datasets/culane.yml
================================================
metric: CULaneMetric
num_classes: 5 # 4 lanes + background

cut_height: &cut_height 270
dataset_dir: &dataset_dir dataset/culane

TrainDataset:
  name: CULaneDataSet
  dataset_dir: *dataset_dir
  list_path: 'list/train_gt.txt'
  split: train
  cut_height: *cut_height

EvalDataset:
  name: CULaneDataSet
  dataset_dir: *dataset_dir
  list_path: 'list/test.txt'
  split: test
  cut_height: *cut_height

TestDataset:
  name: CULaneDataSet
  dataset_dir: *dataset_dir
  list_path: 'list/test.txt'
  split: test
  cut_height: *cut_height
================================================
FILE: configs/datasets/dota.yml
================================================
metric: RBOX
num_classes: 15

TrainDataset:
  !COCODataSet
    image_dir: trainval1024/images
    anno_path: trainval1024/DOTA_trainval1024.json
    dataset_dir: dataset/dota/
    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly']

EvalDataset:
  !COCODataSet
    image_dir: trainval1024/images
    anno_path: trainval1024/DOTA_trainval1024.json
    dataset_dir: dataset/dota/
    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly']

TestDataset:
  !ImageFolder
    anno_path: test1024/DOTA_test1024.json
    dataset_dir: dataset/dota/
================================================
FILE: configs/datasets/dota_ms.yml
================================================
metric: RBOX
num_classes: 15

TrainDataset:
  !COCODataSet
    image_dir: trainval1024/images
    anno_path: trainval1024/DOTA_trainval1024.json
    dataset_dir: dataset/dota_ms/
    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly']

EvalDataset:
  !COCODataSet
    image_dir: trainval1024/images
    anno_path: trainval1024/DOTA_trainval1024.json
    dataset_dir: dataset/dota_ms/
    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly']

TestDataset:
  !ImageFolder
    anno_path: test1024/DOTA_test1024.json
    dataset_dir: dataset/dota_ms/
================================================
FILE: configs/datasets/lvis_detection.yml
================================================
metric: LVIS
num_classes: 1203

TrainDataset:
  name: LVISDataSet
  image_dir: .
  anno_path: annotations/lvis_v1_train.json
  dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO
  data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']

EvalDataset:
  name: LVISDataSet
  image_dir: .
  anno_path: annotations/lvis_v1_val.json
  dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO
  allow_empty: true

TestDataset:
  name: ImageFolder
  anno_path: annotations/lvis_v1_val.json # also support txt (like VOC's label_list.txt)
  dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO # if set, anno_path will be 'dataset_dir/anno_path'
================================================
FILE: configs/datasets/mcmot.yml
================================================
metric: MCMOT
num_classes: 10
# uses the VisDrone2019 MOT dataset with 10 classes by default; modify for your own data.

# for MCMOT training
TrainDataset:
  !MCMOTDataSet
    dataset_dir: dataset/mot
    image_lists: ['visdrone_mcmot.train']
    data_fields: ['image', 'gt_bbox', 'gt_class', 'gt_ide']
    label_list: label_list.txt

# for MCMOT evaluation
# To change the MCMOT evaluation dataset, modify 'data_root'
EvalMOTDataset:
  !MOTImageFolder
    dataset_dir: dataset/mot
    data_root: visdrone_mcmot/images/val
    keep_ori_im: False # set True to save visualization images/video, or when used in DeepSORT

# for MCMOT video inference
TestMOTDataset:
  !MOTImageFolder
    dataset_dir: dataset/mot
    keep_ori_im: True # set True to save visualization images/video
================================================
FILE: configs/datasets/mot.yml
================================================
metric: MOT
num_classes: 1

# for MOT training
TrainDataset:
  !MOTDataSet
    dataset_dir: dataset/mot
    image_lists: ['mot17.train', 'caltech.all', 'cuhksysu.train', 'prw.train', 'citypersons.train', 'eth.train']
    data_fields: ['image', 'gt_bbox', 'gt_class', 'gt_ide']

# for MOT evaluation
# To change the MOT evaluation dataset, modify 'data_root'
EvalMOTDataset:
  !MOTImageFolder
    dataset_dir: dataset/mot
    data_root: MOT16/images/train
    keep_ori_im: False # set True to save visualization images/video, or when used in DeepSORT

# for MOT video inference
TestMOTDataset:
  !MOTImageFolder
    dataset_dir: dataset/mot
    keep_ori_im: True # set True to save visualization images/video
================================================
FILE: configs/datasets/objects365_detection.yml
================================================
metric: COCO
num_classes: 365

TrainDataset:
  !COCODataSet
    image_dir: train
    anno_path: annotations/zhiyuan_objv2_train.json
    dataset_dir: dataset/objects365
    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']

EvalDataset:
  !COCODataSet
    image_dir: val
    anno_path: annotations/zhiyuan_objv2_val.json
    dataset_dir: dataset/objects365
    allow_empty: true

TestDataset:
  !ImageFolder
    anno_path: annotations/zhiyuan_objv2_val.json
    dataset_dir: dataset/objects365/
================================================
FILE: configs/datasets/roadsign_voc.yml
================================================
metric: VOC
map_type: integral
num_classes: 4

TrainDataset:
  name: VOCDataSet
  dataset_dir: dataset/roadsign_voc
  anno_path: train.txt
  label_list: label_list.txt
  data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult']

EvalDataset:
  name: VOCDataSet
  dataset_dir: dataset/roadsign_voc
  anno_path: valid.txt
  label_list: label_list.txt
  data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult']

TestDataset:
  name: ImageFolder
  anno_path: dataset/roadsign_voc/label_list.txt
================================================
FILE: configs/datasets/sniper_coco_detection.yml
================================================
metric: SNIPERCOCO
num_classes: 80

TrainDataset:
  !SniperCOCODataSet
    image_dir: train2017
    anno_path: annotations/instances_train2017.json
    dataset_dir: dataset/coco
    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
    allow_empty: true
    is_trainset: true
    image_target_sizes: [2000, 1000]
    valid_box_ratio_ranges: [[-1, 0.1], [0.08, -1]]
    chip_target_size: 512
    chip_target_stride: 200
    use_neg_chip: false
    max_neg_num_per_im: 8

EvalDataset:
  !SniperCOCODataSet
    image_dir: val2017
    anno_path: annotations/instances_val2017.json
    dataset_dir: dataset/coco
    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
    allow_empty: true
    is_trainset: false
    image_target_sizes: [2000, 1000]
    valid_box_ratio_ranges: [[-1, 0.1], [0.08, -1]]
    chip_target_size: 512
    chip_target_stride: 200
    max_per_img: -1
    nms_thresh: 0.5

TestDataset:
  !SniperCOCODataSet
    image_dir: val2017
    dataset_dir: dataset/coco
    is_trainset: false
    image_target_sizes: [2000, 1000]
    valid_box_ratio_ranges: [[-1, 0.1], [0.08, -1]]
    chip_target_size: 500
    chip_target_stride: 200
    max_per_img: -1
    nms_thresh: 0.5
================================================
FILE: configs/datasets/sniper_visdrone_detection.yml
================================================
metric: SNIPERCOCO
num_classes: 9

TrainDataset:
  !SniperCOCODataSet
    image_dir: train
    anno_path: annotations/train.json
    dataset_dir: dataset/VisDrone2019_coco
    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
    allow_empty: true
    is_trainset: true
    image_target_sizes: [8145, 2742]
    valid_box_ratio_ranges: [[-1, 0.03142857142857144], [0.02333211853008726, -1]]
    chip_target_size: 1536
    chip_target_stride: 1184
    use_neg_chip: false
    max_neg_num_per_im: 8

EvalDataset:
  !SniperCOCODataSet
    image_dir: val
    anno_path: annotations/val.json
    dataset_dir: dataset/VisDrone2019_coco
    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
    allow_empty: true
    is_trainset: false
    image_target_sizes: [8145, 2742]
    valid_box_ratio_ranges: [[-1, 0.03142857142857144], [0.02333211853008726, -1]]
    chip_target_size: 1536
    chip_target_stride: 1184
    max_per_img: -1
    nms_thresh: 0.5

TestDataset:
  !SniperCOCODataSet
    image_dir: val
    dataset_dir: dataset/VisDrone2019_coco
    is_trainset: false
    image_target_sizes: [8145, 2742]
    valid_box_ratio_ranges: [[-1, 0.03142857142857144], [0.02333211853008726, -1]]
    chip_target_size: 1536
    chip_target_stride: 1184
    max_per_img: -1
    nms_thresh: 0.5
================================================
FILE: configs/datasets/spine_coco.yml
================================================
metric: RBOX
num_classes: 9

TrainDataset:
  !COCODataSet
    image_dir: images
    anno_path: annotations/train.json
    dataset_dir: dataset/spine_coco
    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly']

EvalDataset:
  !COCODataSet
    image_dir: images
    anno_path: annotations/valid.json
    dataset_dir: dataset/spine_coco
    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly']

TestDataset:
  !ImageFolder
    anno_path: annotations/valid.json
    dataset_dir: dataset/spine_coco
================================================
FILE: configs/datasets/visdrone_detection.yml
================================================
metric: COCO
num_classes: 10

TrainDataset:
  !COCODataSet
    image_dir: VisDrone2019-DET-train
    anno_path: train.json
    dataset_dir: dataset/visdrone
    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']

EvalDataset:
  !COCODataSet
    image_dir: VisDrone2019-DET-val
    anno_path: val.json
    # image_dir: test_dev
    # anno_path: test_dev.json
    dataset_dir: dataset/visdrone

TestDataset:
  !ImageFolder
    anno_path: val.json
    dataset_dir: dataset/visdrone
================================================
FILE: configs/datasets/voc.yml
================================================
metric: VOC
map_type: 11point
num_classes: 20

TrainDataset:
  name: VOCDataSet
  dataset_dir: dataset/voc
  anno_path: trainval.txt
  label_list: label_list.txt
  data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult']

EvalDataset:
  name: VOCDataSet
  dataset_dir: dataset/voc
  anno_path: test.txt
  label_list: label_list.txt
  data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult']

TestDataset:
  name: ImageFolder
  anno_path: dataset/voc/label_list.txt
================================================
FILE: configs/datasets/wider_face.yml
================================================
metric: WiderFace
num_classes: 1

TrainDataset:
  !WIDERFaceDataSet
    dataset_dir: dataset/wider_face
    anno_path: wider_face_split/wider_face_train_bbx_gt.txt
    image_dir: WIDER_train/images
    data_fields: ['image', 'gt_bbox', 'gt_class']

EvalDataset:
  !WIDERFaceValDataset
    dataset_dir: dataset/wider_face
    image_dir: WIDER_val/images
    anno_path: wider_face_split/wider_face_val_bbx_gt.txt
    gt_mat_path: WIDER_val/ground_truth
    data_fields: ['image', 'gt_bbox', 'gt_class', 'ori_gt_bbox']

TestDataset:
  !ImageFolder
    use_default_label: true
================================================
FILE: configs/rtdetrv3/_base_/optimizer_6x.yml
================================================
epoch: 72

LearningRate:
  base_lr: 0.0004
  schedulers:
  - !PiecewiseDecay
    gamma: 1.0
    milestones: [100]
    use_warmup: true
  - !LinearWarmup
    start_factor: 0.001
    steps: 2000

OptimizerBuilder:
  clip_grad_by_norm: 0.1
  regularizer: false
  optimizer:
    type: AdamW
    weight_decay: 0.0001
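Read literally, this schedule warms up linearly from `start_factor * base_lr` over the first 2000 iterations and then stays flat: `gamma` is 1.0 and the milestone (epoch 100) lies beyond the 72 training epochs. A back-of-the-envelope sketch of that curve (not ppdet's exact scheduler code):
```python
base_lr, start_factor, warmup_steps = 0.0004, 0.001, 2000

def lr_at(step):
    if step < warmup_steps:
        frac = step / warmup_steps
        return base_lr * (start_factor + (1.0 - start_factor) * frac)
    return base_lr  # gamma=1.0 and milestone past epoch 72 => constant

for step in (0, 500, 2000, 100000):
    print(step, lr_at(step))
```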
================================================
FILE: configs/rtdetrv3/_base_/rtdetr_reader.yml
================================================
worker_num: 4

TrainReader:
  sample_transforms:
    - Decode: {}
    - RandomDistort: {prob: 0.8}
    - RandomExpand: {fill_value: [123.675, 116.28, 103.53]}
    - RandomCrop: {prob: 0.8}
    - RandomFlip: {}
  batch_transforms:
    - BatchRandomResize: {target_size: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800], random_size: True, random_interp: True, keep_ratio: False}
    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
    - NormalizeBox: {retain_origin_box: true}
    - BboxXYXY2XYWH: {}
    - Permute: {}
    - PadGT: {only_origin_box: true}
  batch_size: 16
  shuffle: true
  drop_last: true
  collate_batch: false
  use_shared_memory: true

EvalReader:
  sample_transforms:
    - Decode: {}
    - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2}
    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
    - Permute: {}
  batch_size: 16
  shuffle: false
  drop_last: false

TestReader:
  inputs_def:
    image_shape: [3, 640, 640]
  sample_transforms:
    - Decode: {}
    - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2}
    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
    - Permute: {}
  batch_size: 1
  shuffle: false
  drop_last: false
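Note that `BatchRandomResize` draws one size per batch from the list above, and 640 appears three times, so training is biased toward the deployment resolution. Roughly (illustrative only, not ppdet's operator):
```python
import random

target_sizes = [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800]
size = random.choice(target_sizes)  # P(640) = 3/13; every other size 1/13
print("this batch is resized to", (size, size))  # keep_ratio: False => square
```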
================================================
FILE: configs/rtdetrv3/_base_/rtdetrv3_r50vd.yml
================================================
architecture: RTDETRV3
pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams
norm_type: sync_bn
use_ema: True
ema_decay: 0.9999
ema_decay_type: "exponential"
ema_filter_no_grad: True
hidden_dim: 256
use_focal_loss: True
eval_size: [640, 640]

RTDETRV3:
  backbone: ResNet
  neck: HybridEncoder
  transformer: RTDETRTransformerv3
  detr_head: DINOv3Head
  aux_o2m_head: PPYOLOEHead
  post_process: DETRPostProcess

ResNet:
  # index 0 stands for res2
  depth: 50
  variant: d
  norm_type: bn
  freeze_at: 0
  return_idx: [1, 2, 3]
  lr_mult_list: [0.1, 0.1, 0.1, 0.1]
  num_stages: 4
  freeze_stem_only: True

HybridEncoder:
  hidden_dim: 256
  use_encoder_idx: [2]
  num_encoder_layers: 1
  encoder_layer:
    name: TransformerLayer
    d_model: 256
    nhead: 8
    dim_feedforward: 1024
    dropout: 0.
    activation: 'gelu'
  expansion: 1.0

RTDETRTransformerv3:
  num_queries: 300
  position_embed_type: sine
  feat_strides: [8, 16, 32]
  num_levels: 3
  nhead: 8
  num_decoder_layers: 6
  dim_feedforward: 1024
  dropout: 0.0
  activation: relu
  num_denoising: 100
  label_noise_ratio: 0.5
  box_noise_scale: 1.0
  learnt_init_query: False
  num_noises: 0
  num_noise_queries: []
  num_noise_denoising: 100

DINOv3Head:
  o2m: 4
  loss:
    name: DINOv3Loss
    loss_coeff: {class: 1, bbox: 5, giou: 2}
    aux_loss: True
    use_vfl: True
    matcher:
      name: HungarianMatcher
      matcher_coeff: {class: 2, bbox: 5, giou: 2}

PPYOLOEHead:
  fpn_strides: [8, 16, 32]
  grid_cell_scale: 5.0
  grid_cell_offset: 0.5
  static_assigner_epoch: 30
  use_varifocal_loss: True
  loss_weight: {class: 1.0, iou: 2.5, dfl: 0.5}
  static_assigner:
    name: ATSSAssigner
    topk: 9
  assigner:
    name: TaskAlignedAssigner
    topk: 13
    alpha: 1.0
    beta: 6.0
  nms:
    name: MultiClassNMS
    nms_top_k: 1000
    keep_top_k: 300
    score_threshold: 0.01
    nms_threshold: 0.7

DETRPostProcess:
  num_top_queries: 300
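A minimal sketch (assuming `ppdet` from this repo is importable) of how the registry turns the sections above into a model: `load_config` merges the YAML, and `create` instantiates the declared `architecture` class together with its backbone, neck, transformer, and heads.
```python
from ppdet.core.workspace import load_config, create

cfg = load_config('configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml')
model = create(cfg.architecture)  # builds RTDETRV3 from the config registry
print(type(model).__name__)
```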
================================================
FILE: configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml
================================================
_BASE_: [
  '../datasets/coco_detection.yml',
  '../runtime.yml',
  '_base_/optimizer_6x.yml',
  '_base_/rtdetrv3_r50vd.yml',
  '_base_/rtdetr_reader.yml',
]

weights: output/rtdetrv3_r18vd_6x_coco/model_final
find_unused_parameters: True
log_iter: 200
o2m_branch: True
num_queries_o2m: 450
pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet18_vd_pretrained.pdparams

RTDETRV3:
  backbone: ResNet
  neck: HybridEncoder
  transformer: RTDETRTransformerv3
  detr_head: DINOv3Head
  aux_o2m_head: PPYOLOEHead
  post_process: DETRPostProcess

ResNet:
  depth: 18
  variant: d
  return_idx: [1, 2, 3]
  freeze_at: -1
  freeze_norm: false
  norm_decay: 0.

HybridEncoder:
  hidden_dim: 256
  use_encoder_idx: [2]
  num_encoder_layers: 1
  encoder_layer:
    name: TransformerLayer
    d_model: 256
    nhead: 8
    dim_feedforward: 1024
    dropout: 0.
    activation: 'gelu'
  expansion: 0.5
  depth_mult: 1.0

RTDETRTransformerv3:
  eval_idx: -1
  num_decoder_layers: 3
  num_noises: 3
  num_noise_queries: [300, 300, 300]
  num_noise_denoising: 100
  learnt_init_query: False
================================================
FILE: configs/rtdetrv3/rtdetrv3_r18vd_6x_lvis.yml
================================================
_BASE_: [
  '../datasets/lvis_detection.yml',
  '../runtime.yml',
  '_base_/optimizer_6x.yml',
  '_base_/rtdetrv3_r50vd.yml',
  '_base_/rtdetr_reader.yml',
]

weights: output/rtdetrv3_r18vd_6x_lvis/model_final
find_unused_parameters: True
log_iter: 200
o2m_branch: True
num_queries_o2m: 450
pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet18_vd_pretrained.pdparams

RTDETRV3:
  backbone: ResNet
  neck: HybridEncoder
  transformer: RTDETRTransformerv3
  detr_head: DINOv3Head
  aux_o2m_head: PPYOLOEHead
  post_process: DETRPostProcess

ResNet:
  depth: 18
  variant: d
  return_idx: [1, 2, 3]
  freeze_at: -1
  freeze_norm: false
  norm_decay: 0.

HybridEncoder:
  hidden_dim: 256
  use_encoder_idx: [2]
  num_encoder_layers: 1
  encoder_layer:
    name: TransformerLayer
    d_model: 256
    nhead: 8
    dim_feedforward: 1024
    dropout: 0.
    activation: 'gelu'
  expansion: 0.5
  depth_mult: 1.0

RTDETRTransformerv3:
  eval_idx: -1
  num_decoder_layers: 3
  num_noises: 2
  num_noise_queries: [300, 300]
  num_noise_denoising: 100
  learnt_init_query: False
================================================
FILE: configs/rtdetrv3/rtdetrv3_r34vd_6x_coco.yml
================================================
_BASE_: [
  '../datasets/coco_detection.yml',
  '../runtime.yml',
  '_base_/optimizer_6x.yml',
  '_base_/rtdetrv3_r50vd.yml',
  '_base_/rtdetr_reader.yml',
]

weights: output/rtdetrv3_r34vd_6x_coco/model_final
find_unused_parameters: True
log_iter: 200
o2m_branch: True
num_queries_o2m: 450
pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/ResNet34_vd_pretrained.pdparams

RTDETRV3:
  backbone: ResNet
  neck: HybridEncoder
  transformer: RTDETRTransformerv3
  detr_head: DINOv3Head
  aux_o2m_head: PPYOLOEHead
  post_process: DETRPostProcess

ResNet:
  depth: 34
  variant: d
  return_idx: [1, 2, 3]
  freeze_at: -1
  freeze_norm: false
  norm_decay: 0.

HybridEncoder:
  hidden_dim: 256
  use_encoder_idx: [2]
  num_encoder_layers: 1
  encoder_layer:
    name: TransformerLayer
    d_model: 256
    nhead: 8
    dim_feedforward: 1024
    dropout: 0.
    activation: 'gelu'
  expansion: 0.5
  depth_mult: 1.0

RTDETRTransformerv3:
  eval_idx: -1
  num_decoder_layers: 4
  num_noises: 3
  num_noise_queries: [300, 300, 300]
  num_noise_denoising: 100
  learnt_init_query: False
================================================
FILE: configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml
================================================
_BASE_: [
  '../datasets/coco_detection.yml',
  '../runtime.yml',
  '_base_/optimizer_6x.yml',
  '_base_/rtdetrv3_r50vd.yml',
  '_base_/rtdetr_reader.yml',
]

weights: output/rtdetrv3_r50vd_6x_coco/model_final
find_unused_parameters: True
log_iter: 200
o2m_branch: True
num_queries_o2m: 450
pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams

RTDETRTransformerv3:
  eval_idx: -1
  num_decoder_layers: 6
  num_noises: 2
  num_noise_queries: [300, 300]
  num_noise_denoising: 100
  learnt_init_query: False
================================================
FILE: configs/rtdetrv3/rtdetrv3_r50vd_6x_lvis.yml
================================================
_BASE_: [
  '../datasets/lvis_detection.yml',
  '../runtime.yml',
  '_base_/optimizer_6x.yml',
  '_base_/rtdetrv3_r50vd.yml',
  '_base_/rtdetr_reader.yml',
]

weights: output/rtdetrv3_r50vd_6x_lvis/model_final
find_unused_parameters: True
log_iter: 200
snapshot_epoch: 2
o2m_branch: True
num_queries_o2m: 450
pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams

RTDETRTransformerv3:
  eval_idx: -1
  num_decoder_layers: 6
  num_noises: 1
  num_noise_queries: [300]
  num_noise_denoising: 100
  learnt_init_query: False
================================================
FILE: configs/runtime.yml
================================================
use_gpu: true
use_xpu: false
use_mlu: false
use_npu: false
log_iter: 20
save_dir: output
snapshot_epoch: 1
print_flops: false
print_params: false

# Exporting the model
export:
  post_process: True  # whether post-processing is included in the exported network
  nms: True           # whether NMS is included in the exported network
  benchmark: False    # for benchmarking model performance; if True, post-processing and NMS are not exported
  fuse_conv_bn: False
================================================
FILE: dataset/coco/download_coco.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os.path as osp
import logging
# add python path of PaddleDetection to sys.path
parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3)))
if parent_path not in sys.path:
    sys.path.append(parent_path)
from ppdet.utils.download import download_dataset
logging.basicConfig(level=logging.INFO)
download_path = osp.split(osp.realpath(sys.argv[0]))[0]
download_dataset(download_path, 'coco')
================================================
FILE: dataset/dota/.gitignore
================================================
================================================
FILE: dataset/mot/gen_labels_MOT.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import os
import numpy as np
MOT_data = 'MOT16'
# choose a data in ['MOT15', 'MOT16', 'MOT17', 'MOT20']
# or your custom data (prepare it following the 'docs/tutorials/PrepareMOTDataSet.md')
def mkdirs(d):
    if not osp.exists(d):
        os.makedirs(d)

seq_root = './{}/images/train'.format(MOT_data)
label_root = './{}/labels_with_ids/train'.format(MOT_data)
mkdirs(label_root)
seqs = [s for s in os.listdir(seq_root)]

tid_curr = 0
tid_last = -1
for seq in seqs:
    # image size comes from each sequence's seqinfo.ini
    seq_info = open(osp.join(seq_root, seq, 'seqinfo.ini')).read()
    seq_width = int(seq_info[seq_info.find('imWidth=') + 8:
                             seq_info.find('\nimHeight')])
    seq_height = int(seq_info[seq_info.find('imHeight=') + 9:
                              seq_info.find('\nimExt')])

    gt_txt = osp.join(seq_root, seq, 'gt', 'gt.txt')
    gt = np.loadtxt(gt_txt, dtype=np.float64, delimiter=',')

    seq_label_root = osp.join(label_root, seq, 'img1')
    mkdirs(seq_label_root)

    for fid, tid, x, y, w, h, mark, label, _ in gt:
        # keep only valid annotations of class 1 (pedestrian)
        if mark == 0 or label != 1:
            continue
        fid = int(fid)
        tid = int(tid)
        # re-number track ids so they are globally unique across sequences
        if tid != tid_last:
            tid_curr += 1
            tid_last = tid
        # convert top-left corner to box center
        x += w / 2
        y += h / 2
        label_fpath = osp.join(seq_label_root, '{:06d}.txt'.format(fid))
        # one line per box: class track_id cx/W cy/H w/W h/H (all normalized)
        label_str = '0 {:d} {:.6f} {:.6f} {:.6f} {:.6f}\n'.format(
            tid_curr, x / seq_width, y / seq_height, w / seq_width,
            h / seq_height)
        with open(label_fpath, 'a') as f:
            f.write(label_str)
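For reference, a tiny hypothetical example of reading one generated label line back into pixel coordinates (the values below are made up; W and H would come from the sequence's seqinfo.ini):
```python
line = "0 17 0.501042 0.463889 0.062500 0.216667"  # made-up label line
cls_id, tid, cx, cy, w, h = line.split()
W, H = 1920, 1080
x1 = (float(cx) - float(w) / 2) * W  # back to top-left pixel coords
y1 = (float(cy) - float(h) / 2) * H
print(cls_id, tid, round(x1, 1), round(y1, 1), float(w) * W, float(h) * H)
```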
================================================
FILE: dataset/roadsign_voc/download_roadsign_voc.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os.path as osp
import logging
# add python path of PaddleDetection to sys.path
parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3)))
if parent_path not in sys.path:
    sys.path.append(parent_path)
from ppdet.utils.download import download_dataset
logging.basicConfig(level=logging.INFO)
download_path = osp.split(osp.realpath(sys.argv[0]))[0]
download_dataset(download_path, 'roadsign_voc')
================================================
FILE: dataset/roadsign_voc/label_list.txt
================================================
speedlimit
crosswalk
trafficlight
stop
================================================
FILE: dataset/spine_coco/download_spine_coco.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os.path as osp
import logging
# add python path of PaddleDetection to sys.path
parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3)))
if parent_path not in sys.path:
    sys.path.append(parent_path)
from ppdet.utils.download import download_dataset
logging.basicConfig(level=logging.INFO)
download_path = osp.split(osp.realpath(sys.argv[0]))[0]
download_dataset(download_path, 'spine_coco')
================================================
FILE: dataset/voc/create_list.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os.path as osp
import logging
# add python path of PaddleDetection to sys.path
parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3)))
if parent_path not in sys.path:
sys.path.append(parent_path)
from ppdet.utils.download import create_voc_list
logging.basicConfig(level=logging.INFO)
voc_path = osp.split(osp.realpath(sys.argv[0]))[0]
create_voc_list(voc_path)
================================================
FILE: dataset/voc/download_voc.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os.path as osp
import logging
# add python path of PaddleDetection to sys.path
parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3)))
if parent_path not in sys.path:
sys.path.append(parent_path)
from ppdet.utils.download import download_dataset
logging.basicConfig(level=logging.INFO)
download_path = osp.split(osp.realpath(sys.argv[0]))[0]
download_dataset(download_path, 'voc')
================================================
FILE: dataset/voc/label_list.txt
================================================
aeroplane
bicycle
bird
boat
bottle
bus
car
cat
chair
cow
diningtable
dog
horse
motorbike
person
pottedplant
sheep
sofa
train
tvmonitor
================================================
FILE: dataset/wider_face/download_wider_face.sh
================================================
# All rights `PaddleDetection` reserved
# References:
# @inproceedings{yang2016wider,
# Author = {Yang, Shuo and Luo, Ping and Loy, Chen Change and Tang, Xiaoou},
# Booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
# Title = {WIDER FACE: A Face Detection Benchmark},
# Year = {2016}}
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd "$DIR"
# Download the data.
echo "Downloading..."
wget https://dataset.bj.bcebos.com/wider_face/WIDER_train.zip
wget https://dataset.bj.bcebos.com/wider_face/WIDER_val.zip
wget https://dataset.bj.bcebos.com/wider_face/wider_face_split.zip
# Extract the data.
echo "Extracting..."
unzip -q WIDER_train.zip
unzip -q WIDER_val.zip
unzip -q wider_face_split.zip
================================================
FILE: ppdet/__init__.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import (core, data, engine, modeling, model_zoo, optimizer, metrics,
utils, slim)
try:
from .version import full_version as __version__
from .version import commit as __git_commit__
except ImportError:
import sys
    sys.stderr.write("Warning: import ppdet from source directory "
                     "without installing, run 'python setup.py install' to "
                     "install ppdet first\n")
================================================
FILE: ppdet/core/__init__.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import config
================================================
FILE: ppdet/core/config/__init__.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
================================================
FILE: ppdet/core/config/schema.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import inspect
import importlib
import re
try:
from docstring_parser import parse as doc_parse
except Exception:
def doc_parse(*args):
pass
try:
from typeguard import check_type
except Exception:
def check_type(*args):
pass
__all__ = ['SchemaValue', 'SchemaDict', 'SharedConfig', 'extract_schema']
class SchemaValue(object):
def __init__(self, name, doc='', type=None):
super(SchemaValue, self).__init__()
self.name = name
self.doc = doc
self.type = type
def set_default(self, value):
self.default = value
def has_default(self):
return hasattr(self, 'default')
class SchemaDict(dict):
def __init__(self, **kwargs):
super(SchemaDict, self).__init__()
self.schema = {}
self.strict = False
self.doc = ""
self.update(kwargs)
def __setitem__(self, key, value):
# XXX also update regular dict to SchemaDict??
if isinstance(value, dict) and key in self and isinstance(self[key],
SchemaDict):
self[key].update(value)
else:
super(SchemaDict, self).__setitem__(key, value)
def __missing__(self, key):
if self.has_default(key):
return self.schema[key].default
elif key in self.schema:
return self.schema[key]
else:
raise KeyError(key)
def copy(self):
newone = SchemaDict()
newone.__dict__.update(self.__dict__)
newone.update(self)
return newone
def set_schema(self, key, value):
assert isinstance(value, SchemaValue)
self.schema[key] = value
def set_strict(self, strict):
self.strict = strict
def has_default(self, key):
return key in self.schema and self.schema[key].has_default()
def is_default(self, key):
if not self.has_default(key):
return False
if hasattr(self[key], '__dict__'):
return True
else:
return key not in self or self[key] == self.schema[key].default
def find_default_keys(self):
return [
k for k in list(self.keys()) + list(self.schema.keys())
if self.is_default(k)
]
def mandatory(self):
return any([k for k in self.schema.keys() if not self.has_default(k)])
def find_missing_keys(self):
missing = [
k for k in self.schema.keys()
if k not in self and not self.has_default(k)
]
        placeholders = [k for k in self if self[k] in ('<missing>', '<value>')]
return missing + placeholders
def find_extra_keys(self):
return list(set(self.keys()) - set(self.schema.keys()))
def find_mismatch_keys(self):
mismatch_keys = []
for arg in self.schema.values():
if arg.type is not None:
try:
check_type("{}.{}".format(self.name, arg.name),
self[arg.name], arg.type)
except Exception:
mismatch_keys.append(arg.name)
return mismatch_keys
def validate(self):
missing_keys = self.find_missing_keys()
if missing_keys:
raise ValueError("Missing param for class<{}>: {}".format(
self.name, ", ".join(missing_keys)))
extra_keys = self.find_extra_keys()
if extra_keys and self.strict:
raise ValueError("Extraneous param for class<{}>: {}".format(
self.name, ", ".join(extra_keys)))
mismatch_keys = self.find_mismatch_keys()
if mismatch_keys:
raise TypeError("Wrong param type for class<{}>: {}".format(
self.name, ", ".join(mismatch_keys)))
class SharedConfig(object):
"""
Representation class for `__shared__` annotations, which work as follows:
- if `key` is set for the module in config file, its value will take
precedence
- if `key` is not set for the module but present in the config file, its
value will be used
- otherwise, use the provided `default_value` as fallback
Args:
key: config[key] will be injected
default_value: fallback value
"""
def __init__(self, key, default_value=None):
super(SharedConfig, self).__init__()
self.key = key
self.default_value = default_value
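# Resolution sketch (editor's illustration; `MyNeck` is a made-up module and
# running this assumes ppdet and its dependencies are installed):
#
#     from ppdet.core.workspace import register, create, merge_config
#
#     @register
#     class MyNeck(object):
#         __shared__ = ['num_classes']
#         def __init__(self, num_classes=80):
#             self.num_classes = num_classes
#
#     merge_config({'num_classes': 20})
#     neck = create('MyNeck')  # num_classes resolves to 20 (rule 2 above);
#                              # without the merge it falls back to 80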
def extract_schema(cls):
"""
Extract schema from a given class
Args:
cls (type): Class from which to extract.
Returns:
schema (SchemaDict): Extracted schema.
"""
ctor = cls.__init__
# python 2 compatibility
if hasattr(inspect, 'getfullargspec'):
argspec = inspect.getfullargspec(ctor)
annotations = argspec.annotations
has_kwargs = argspec.varkw is not None
    else:
        argspec = inspect.getargspec(ctor)
        # python 2 type hinting workaround, see pep-3107
        # however, since `typeguard` does not support python 2, type checking
        # is still python 3 only for now
        annotations = getattr(ctor, '__annotations__', {})
        has_kwargs = argspec.keywords is not None
names = [arg for arg in argspec.args if arg != 'self']
defaults = argspec.defaults
num_defaults = argspec.defaults is not None and len(argspec.defaults) or 0
num_required = len(names) - num_defaults
docs = cls.__doc__
if docs is None and getattr(cls, '__category__', None) == 'op':
docs = cls.__call__.__doc__
try:
docstring = doc_parse(docs)
except Exception:
docstring = None
if docstring is None:
comments = {}
else:
comments = {}
for p in docstring.params:
match_obj = re.match('^([a-zA-Z_]+[a-zA-Z_0-9]*).*', p.arg_name)
if match_obj is not None:
comments[match_obj.group(1)] = p.description
schema = SchemaDict()
schema.name = cls.__name__
schema.doc = ""
if docs is not None:
start_pos = docs[0] == '\n' and 1 or 0
schema.doc = docs[start_pos:].split("\n")[0].strip()
# XXX handle paddle's weird doc convention
if '**' == schema.doc[:2] and '**' == schema.doc[-2:]:
schema.doc = schema.doc[2:-2].strip()
schema.category = hasattr(cls, '__category__') and getattr(
cls, '__category__') or 'module'
schema.strict = not has_kwargs
schema.pymodule = importlib.import_module(cls.__module__)
schema.inject = getattr(cls, '__inject__', [])
schema.shared = getattr(cls, '__shared__', [])
for idx, name in enumerate(names):
comment = name in comments and comments[name] or name
if name in schema.inject:
type_ = None
else:
type_ = name in annotations and annotations[name] or None
value_schema = SchemaValue(name, comment, type_)
if name in schema.shared:
assert idx >= num_required, "shared config must have default value"
default = defaults[idx - num_required]
value_schema.set_default(SharedConfig(name, default))
elif idx >= num_required:
default = defaults[idx - num_required]
value_schema.set_default(default)
schema.set_schema(name, value_schema)
return schema
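# A minimal self-check of `extract_schema` (editor's sketch; `ToyModule` is a
# made-up class, not part of PaddleDetection):
if __name__ == '__main__':
    class ToyModule(object):
        """A toy module.

        Args:
            width (int): channel width of the module
        """

        def __init__(self, width=64, act='relu'):
            self.width = width
            self.act = act

    toy_schema = extract_schema(ToyModule)
    assert toy_schema.name == 'ToyModule'
    assert toy_schema.schema['width'].default == 64
    assert toy_schema.strict  # no **kwargs in __init__
    print(toy_schema.doc)  # -> 'A toy module.'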
================================================
FILE: ppdet/core/config/yaml_helpers.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import inspect
import yaml
from .schema import SharedConfig
__all__ = ['serializable', 'Callable']
def represent_dictionary_order(self, dict_data):
return self.represent_mapping('tag:yaml.org,2002:map', dict_data.items())
def setup_orderdict():
from collections import OrderedDict
yaml.add_representer(OrderedDict, represent_dictionary_order)
def _make_python_constructor(cls):
def python_constructor(loader, node):
if isinstance(node, yaml.SequenceNode):
args = loader.construct_sequence(node, deep=True)
return cls(*args)
else:
kwargs = loader.construct_mapping(node, deep=True)
try:
return cls(**kwargs)
except Exception as ex:
print("Error when construct {} instance from yaml config".
format(cls.__name__))
raise ex
return python_constructor
def _make_python_representer(cls):
# python 2 compatibility
if hasattr(inspect, 'getfullargspec'):
argspec = inspect.getfullargspec(cls)
else:
argspec = inspect.getfullargspec(cls.__init__)
argnames = [arg for arg in argspec.args if arg != 'self']
def python_representer(dumper, obj):
if argnames:
data = {name: getattr(obj, name) for name in argnames}
else:
data = obj.__dict__
if '_id' in data:
del data['_id']
return dumper.represent_mapping(u'!{}'.format(cls.__name__), data)
return python_representer
def serializable(cls):
"""
Add loader and dumper for given class, which must be
"trivially serializable"
Args:
cls: class to be serialized
Returns: cls
"""
yaml.add_constructor(u'!{}'.format(cls.__name__),
_make_python_constructor(cls))
yaml.add_representer(cls, _make_python_representer(cls))
return cls
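# Round-trip sketch (editor's illustration; `Point` is a made-up class):
#
#     @serializable
#     class Point(object):
#         def __init__(self, x=0, y=0):
#             self.x = x
#             self.y = y
#
#     text = yaml.dump(Point(1, 2))            # tagged as !Point
#     p = yaml.load(text, Loader=yaml.Loader)  # reconstructed via __init__
#     assert (p.x, p.y) == (1, 2)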
yaml.add_representer(SharedConfig,
lambda d, o: d.represent_data(o.default_value))
@serializable
class Callable(object):
"""
Helper to be used in Yaml for creating arbitrary class objects
Args:
full_type (str): the full module path to target function
"""
def __init__(self, full_type, args=[], kwargs={}):
super(Callable, self).__init__()
self.full_type = full_type
self.args = args
self.kwargs = kwargs
def __call__(self):
if '.' in self.full_type:
idx = self.full_type.rfind('.')
module = importlib.import_module(self.full_type[:idx])
func_name = self.full_type[idx + 1:]
else:
try:
module = importlib.import_module('builtins')
except Exception:
module = importlib.import_module('__builtin__')
func_name = self.full_type
func = getattr(module, func_name)
return func(*self.args, **self.kwargs)
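# Quick sanity check for `Callable` (editor's sketch; run with
# `python -m ppdet.core.config.yaml_helpers`):
if __name__ == '__main__':
    make_sqrt = Callable('math.sqrt', args=[16.0])
    assert make_sqrt() == 4.0
    # In YAML, the node `!Callable {full_type: math.sqrt, args: [16.0]}`
    # constructs the same object when the config is loaded.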
================================================
FILE: ppdet/core/workspace.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import importlib
import os
import sys
import yaml
import collections
try:
collectionsAbc = collections.abc
except AttributeError:
collectionsAbc = collections
from .config.schema import SchemaDict, SharedConfig, extract_schema
from .config.yaml_helpers import serializable
__all__ = [
'global_config',
'load_config',
'merge_config',
'get_registered_modules',
'create',
'register',
'serializable',
'dump_value',
]
def dump_value(value):
# XXX this is hackish, but collections.abc is not available in python 2
if hasattr(value, '__dict__') or isinstance(value, (dict, tuple, list)):
value = yaml.dump(value, default_flow_style=True)
value = value.replace('\n', '')
value = value.replace('...', '')
return "'{}'".format(value)
else:
# primitive types
return str(value)
class AttrDict(dict):
"""Single level attribute dict, NOT recursive"""
def __init__(self, **kwargs):
super(AttrDict, self).__init__()
super(AttrDict, self).update(kwargs)
def __getattr__(self, key):
if key in self:
return self[key]
raise AttributeError("object has no attribute '{}'".format(key))
def __setattr__(self, key, value):
self[key] = value
def copy(self):
new_dict = AttrDict()
for k, v in self.items():
new_dict.update({k: v})
return new_dict
global_config = AttrDict()
BASE_KEY = '_BASE_'
# parse and load _BASE_ recursively
def _load_config_with_base(file_path):
with open(file_path) as f:
file_cfg = yaml.load(f, Loader=yaml.Loader)
# NOTE: cfgs outside have higher priority than cfgs in _BASE_
if BASE_KEY in file_cfg:
all_base_cfg = AttrDict()
base_ymls = list(file_cfg[BASE_KEY])
for base_yml in base_ymls:
if base_yml.startswith("~"):
base_yml = os.path.expanduser(base_yml)
if not base_yml.startswith('/'):
base_yml = os.path.join(os.path.dirname(file_path), base_yml)
            base_cfg = _load_config_with_base(base_yml)
            all_base_cfg = merge_config(base_cfg, all_base_cfg)
del file_cfg[BASE_KEY]
return merge_config(file_cfg, all_base_cfg)
return file_cfg
def load_config(file_path):
"""
Load config from file.
Args:
file_path (str): Path of the config file to be loaded.
Returns: global config
"""
_, ext = os.path.splitext(file_path)
assert ext in ['.yml', '.yaml'], "only support yaml files for now"
# load config from file and merge into global config
cfg = _load_config_with_base(file_path)
cfg['filename'] = os.path.splitext(os.path.split(file_path)[-1])[0]
merge_config(cfg)
return global_config
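# `_BASE_` sketch (editor's illustration with hypothetical files):
#
#     # base.yml            # child.yml
#     epochs: 12            _BASE_: [base.yml]
#     lr: 0.01              lr: 0.001
#
#     cfg = load_config('child.yml')
#     assert cfg.epochs == 12   # inherited from base.yml
#     assert cfg.lr == 0.001    # the outer file overrides _BASE_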
def dict_merge(dct, merge_dct):
""" Recursive dict merge. Inspired by :meth:``dict.update()``, instead of
updating only top-level keys, dict_merge recurses down into dicts nested
to an arbitrary depth, updating keys. The ``merge_dct`` is merged into
``dct``.
Args:
dct: dict onto which the merge is executed
merge_dct: dct merged into dct
Returns: dct
"""
for k, v in merge_dct.items():
if (k in dct and isinstance(dct[k], dict) and
isinstance(merge_dct[k], collectionsAbc.Mapping)):
dict_merge(dct[k], merge_dct[k])
else:
dct[k] = merge_dct[k]
return dct
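# Example (editor's sketch): nested keys are merged rather than replaced:
#
#     dct = {'optimizer': {'lr': 0.01, 'momentum': 0.9}}
#     dict_merge(dct, {'optimizer': {'lr': 0.001}})
#     # -> {'optimizer': {'lr': 0.001, 'momentum': 0.9}}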
def merge_config(config, another_cfg=None):
"""
Merge config into global config or another_cfg.
Args:
config (dict): Config to be merged.
Returns: global config
"""
global global_config
dct = another_cfg or global_config
return dict_merge(dct, config)
def get_registered_modules():
return {k: v for k, v in global_config.items() if isinstance(v, SchemaDict)}
def make_partial(cls):
op_module = importlib.import_module(cls.__op__.__module__)
op = getattr(op_module, cls.__op__.__name__)
cls.__category__ = getattr(cls, '__category__', None) or 'op'
def partial_apply(self, *args, **kwargs):
kwargs_ = self.__dict__.copy()
kwargs_.update(kwargs)
return op(*args, **kwargs_)
if getattr(cls, '__append_doc__', True): # XXX should default to True?
if sys.version_info[0] > 2:
cls.__doc__ = "Wrapper for `{}` OP".format(op.__name__)
cls.__init__.__doc__ = op.__doc__
cls.__call__ = partial_apply
cls.__call__.__doc__ = op.__doc__
else:
# XXX work around for python 2
partial_apply.__doc__ = op.__doc__
cls.__call__ = partial_apply
return cls
def register(cls):
"""
Register a given module class.
Args:
cls (type): Module class to be registered.
Returns: cls
"""
if cls.__name__ in global_config:
raise ValueError("Module class already registered: {}".format(
cls.__name__))
if hasattr(cls, '__op__'):
cls = make_partial(cls)
global_config[cls.__name__] = extract_schema(cls)
return cls
def create(cls_or_name, **kwargs):
"""
Create an instance of given module class.
Args:
cls_or_name (type or str): Class of which to create instance.
Returns: instance of type `cls_or_name`
"""
    assert type(cls_or_name) in [type, str], \
        "should be a class or name of a class"
name = type(cls_or_name) == str and cls_or_name or cls_or_name.__name__
if name in global_config:
if isinstance(global_config[name], SchemaDict):
pass
elif hasattr(global_config[name], "__dict__"):
# support instance return directly
return global_config[name]
else:
raise ValueError("The module {} is not registered".format(name))
else:
raise ValueError("The module {} is not registered".format(name))
config = global_config[name]
cls = getattr(config.pymodule, name)
cls_kwargs = {}
cls_kwargs.update(global_config[name])
    # parse `shared` annotation of registered modules
if getattr(config, 'shared', None):
for k in config.shared:
target_key = config[k]
shared_conf = config.schema[k].default
assert isinstance(shared_conf, SharedConfig)
if target_key is not None and not isinstance(target_key,
SharedConfig):
continue # value is given for the module
elif shared_conf.key in global_config:
# `key` is present in config
cls_kwargs[k] = global_config[shared_conf.key]
else:
cls_kwargs[k] = shared_conf.default_value
    # parse `inject` annotation of registered modules
if getattr(cls, 'from_config', None):
cls_kwargs.update(cls.from_config(config, **kwargs))
if getattr(config, 'inject', None):
for k in config.inject:
target_key = config[k]
# optional dependency
if target_key is None:
continue
if isinstance(target_key, dict) or hasattr(target_key, '__dict__'):
if 'name' not in target_key.keys():
continue
inject_name = str(target_key['name'])
if inject_name not in global_config:
                    raise ValueError(
                        "Missing injection name {} for key {}; check the "
                        "cfg file".format(inject_name, k))
target = global_config[inject_name]
for i, v in target_key.items():
if i == 'name':
continue
target[i] = v
if isinstance(target, SchemaDict):
cls_kwargs[k] = create(inject_name)
elif isinstance(target_key, str):
if target_key not in global_config:
raise ValueError("Missing injection config:", target_key)
target = global_config[target_key]
if isinstance(target, SchemaDict):
cls_kwargs[k] = create(target_key)
elif hasattr(target, '__dict__'): # serialized object
cls_kwargs[k] = target
else:
raise ValueError("Unsupported injection type:", target_key)
# prevent modification of global config values of reference types
# (e.g., list, dict) from within the created module instances
#kwargs = copy.deepcopy(kwargs)
return cls(**cls_kwargs)
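# End-to-end sketch of the registration workflow (editor's illustration;
# `ToyHead` is a made-up module; run with `python -m ppdet.core.workspace`,
# assuming ppdet's dependencies are installed):
if __name__ == '__main__':
    @register
    @serializable
    class ToyHead(object):
        def __init__(self, hidden_dim=256, num_layers=2):
            self.hidden_dim = hidden_dim
            self.num_layers = num_layers

    # values merged into the global config override __init__ defaults
    merge_config({'ToyHead': {'hidden_dim': 128}})
    head = create('ToyHead')
    assert head.hidden_dim == 128 and head.num_layers == 2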
================================================
FILE: ppdet/data/__init__.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import source
from . import transform
from . import reader
from .source import *
from .transform import *
from .reader import *
================================================
FILE: ppdet/data/crop_utils/__init__.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
================================================
FILE: ppdet/data/crop_utils/annotation_cropper.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import math
import random
import numpy as np
from copy import deepcopy
from typing import List, Tuple
from collections import defaultdict
from .chip_box_utils import nms, transform_chip_boxes2image_boxes
from .chip_box_utils import find_chips_to_cover_overlaped_boxes
from .chip_box_utils import transform_chip_box
from .chip_box_utils import intersection_over_box
class AnnoCropper(object):
def __init__(self,
image_target_sizes: List[int],
valid_box_ratio_ranges: List[List[float]],
chip_target_size: int,
chip_target_stride: int,
use_neg_chip: bool=False,
max_neg_num_per_im: int=8,
max_per_img: int=-1,
nms_thresh: int=0.5):
"""
Generate chips by chip_target_size and chip_target_stride.
        These two parameters work like kernel_size and stride in a CNN.
        Each image has a raw size; after resizing, it has a target size,
        with resizing scale = target_size / raw_size.
        The same applies to the chips of the image.
        box_ratio = box_raw_size / image_raw_size = box_target_size / image_target_size
        The 'size' mentioned above is the long-side length of the image, box or chip.
:param image_target_sizes: [2000, 1000]
:param valid_box_ratio_ranges: [[-1, 0.1],[0.08, -1]]
:param chip_target_size: 500
:param chip_target_stride: 200
"""
self.target_sizes = image_target_sizes
self.valid_box_ratio_ranges = valid_box_ratio_ranges
assert len(self.target_sizes) == len(self.valid_box_ratio_ranges)
self.scale_num = len(self.target_sizes)
self.chip_target_size = chip_target_size # is target size
self.chip_target_stride = chip_target_stride # is target stride
self.use_neg_chip = use_neg_chip
self.max_neg_num_per_im = max_neg_num_per_im
self.max_per_img = max_per_img
self.nms_thresh = nms_thresh
def crop_anno_records(self, records: List[dict]):
"""
The main logic:
# foreach record(image):
# foreach scale:
# 1 generate chips by chip size and stride for each scale
# 2 get pos chips
# - validate boxes: current scale; h,w >= 1
# - find pos chips greedily by valid gt boxes in each scale
# - for every valid gt box, find its corresponding pos chips in each scale
# 3 get neg chips
            # - If proposals are given, find neg boxes among them which are not in pos chips
            # - If we got neg boxes in the last step, find neg chips and assign the neg boxes to them, as in step 2
            # 4 sample neg chips if there are too many in one image
            # transform this image's per-scale annotations to chip (pos chips & neg chips) annotations
        :param records: standard coco_records but with an extra key `proposals` (Px4), predicted by the stage-1
            model and possibly containing neg boxes.
:return: new_records, list of dict like
{
'im_file': 'fake_image1.jpg',
'im_id': np.array([1]), # new _global_chip_id as im_id
'h': h, # chip height
'w': w, # chip width
'is_crowd': is_crowd, # Nx1 -> Mx1
'gt_class': gt_class, # Nx1 -> Mx1
'gt_bbox': gt_bbox, # Nx4 -> Mx4, 4 represents [x1,y1,x2,y2]
'gt_poly': gt_poly, # [None]xN -> [None]xM
'chip': [x1, y1, x2, y2] # added
}
Attention:
------------------------------>x
|
| (x1,y1)------
| | |
| | |
| | |
| | |
| | |
| ----------
| (x2,y2)
|
↓
y
If we use [x1, y1, x2, y2] to represent boxes or chips,
(x1,y1) is the left-top point which is in the box,
but (x2,y2) is the right-bottom point which is not in the box.
So x1 in [0, w-1], x2 in [1, w], y1 in [0, h-1], y2 in [1,h].
And you can use x2-x1 to get width, and you can use image[y1:y2, x1:x2] to get the box area.
"""
self.chip_records = []
self._global_chip_id = 1
for r in records:
self._cur_im_pos_chips = [
] # element: (chip, boxes_idx), chip is [x1, y1, x2, y2], boxes_ids is List[int]
self._cur_im_neg_chips = [] # element: (chip, neg_box_num)
for scale_i in range(self.scale_num):
self._get_current_scale_parameters(scale_i, r)
# Cx4
chips = self._create_chips(r['h'], r['w'], self._cur_scale)
# # dict: chipid->[box_id, ...]
pos_chip2boxes_idx = self._get_valid_boxes_and_pos_chips(
r['gt_bbox'], chips)
# dict: chipid->neg_box_num
neg_chip2box_num = self._get_neg_boxes_and_chips(
chips,
list(pos_chip2boxes_idx.keys()), r.get('proposals', None))
self._add_to_cur_im_chips(chips, pos_chip2boxes_idx,
neg_chip2box_num)
cur_image_records = self._trans_all_chips2annotations(r)
self.chip_records.extend(cur_image_records)
return self.chip_records
def _add_to_cur_im_chips(self, chips, pos_chip2boxes_idx, neg_chip2box_num):
for pos_chipid, boxes_idx in pos_chip2boxes_idx.items():
chip = np.array(chips[pos_chipid]) # copy chips slice
self._cur_im_pos_chips.append((chip, boxes_idx))
if neg_chip2box_num is None:
return
for neg_chipid, neg_box_num in neg_chip2box_num.items():
chip = np.array(chips[neg_chipid])
self._cur_im_neg_chips.append((chip, neg_box_num))
def _trans_all_chips2annotations(self, r):
gt_bbox = r['gt_bbox']
im_file = r['im_file']
is_crowd = r['is_crowd']
gt_class = r['gt_class']
# gt_poly = r['gt_poly'] # [None]xN
# remaining keys: im_id, h, w
chip_records = self._trans_pos_chips2annotations(im_file, gt_bbox,
is_crowd, gt_class)
if not self.use_neg_chip:
return chip_records
sampled_neg_chips = self._sample_neg_chips()
neg_chip_records = self._trans_neg_chips2annotations(im_file,
sampled_neg_chips)
chip_records.extend(neg_chip_records)
return chip_records
def _trans_pos_chips2annotations(self, im_file, gt_bbox, is_crowd,
gt_class):
chip_records = []
for chip, boxes_idx in self._cur_im_pos_chips:
chip_bbox, final_boxes_idx = transform_chip_box(gt_bbox, boxes_idx,
chip)
x1, y1, x2, y2 = chip
chip_h = y2 - y1
chip_w = x2 - x1
rec = {
'im_file': im_file,
'im_id': np.array([self._global_chip_id]),
'h': chip_h,
'w': chip_w,
'gt_bbox': chip_bbox,
'is_crowd': is_crowd[final_boxes_idx].copy(),
'gt_class': gt_class[final_boxes_idx].copy(),
# 'gt_poly': [None] * len(final_boxes_idx),
'chip': chip
}
self._global_chip_id += 1
chip_records.append(rec)
return chip_records
def _sample_neg_chips(self):
pos_num = len(self._cur_im_pos_chips)
neg_num = len(self._cur_im_neg_chips)
sample_num = min(pos_num + 2, self.max_neg_num_per_im)
assert sample_num >= 1
if neg_num <= sample_num:
return self._cur_im_neg_chips
candidate_num = int(sample_num * 1.5)
candidate_neg_chips = sorted(
self._cur_im_neg_chips, key=lambda x: -x[1])[:candidate_num]
random.shuffle(candidate_neg_chips)
sampled_neg_chips = candidate_neg_chips[:sample_num]
return sampled_neg_chips
def _trans_neg_chips2annotations(self,
im_file: str,
sampled_neg_chips: List[Tuple]):
chip_records = []
for chip, neg_box_num in sampled_neg_chips:
x1, y1, x2, y2 = chip
chip_h = y2 - y1
chip_w = x2 - x1
rec = {
'im_file': im_file,
'im_id': np.array([self._global_chip_id]),
'h': chip_h,
'w': chip_w,
'gt_bbox': np.zeros(
(0, 4), dtype=np.float32),
'is_crowd': np.zeros(
(0, 1), dtype=np.int32),
'gt_class': np.zeros(
(0, 1), dtype=np.int32),
# 'gt_poly': [],
'chip': chip
}
self._global_chip_id += 1
chip_records.append(rec)
return chip_records
def _get_current_scale_parameters(self, scale_i, r):
im_size = max(r['h'], r['w'])
im_target_size = self.target_sizes[scale_i]
self._cur_im_size, self._cur_im_target_size = im_size, im_target_size
self._cur_scale = self._get_current_scale(im_target_size, im_size)
self._cur_valid_ratio_range = self.valid_box_ratio_ranges[scale_i]
def _get_current_scale(self, im_target_size, im_size):
return im_target_size / im_size
def _create_chips(self, h: int, w: int, scale: float):
"""
Generate chips by chip_target_size and chip_target_stride.
These two parameters just like kernel_size and stride in cnn.
:return: chips, Cx4, xy in raw size dimension
"""
chip_size = self.chip_target_size # omit target for simplicity
stride = self.chip_target_stride
width = int(scale * w)
height = int(scale * h)
min_chip_location_diff = 20 # in target size
assert chip_size >= stride
chip_overlap = chip_size - stride
        if (width - chip_overlap
            ) % stride > min_chip_location_diff:  # remainder not divisible by stride is large, so keep it
            w_steps = max(1, int(math.ceil((width - chip_overlap) / stride)))
        else:  # remainder not divisible by stride is small, so drop it
            w_steps = max(1, int(math.floor((width - chip_overlap) / stride)))
if (height - chip_overlap) % stride > min_chip_location_diff:
h_steps = max(1, int(math.ceil((height - chip_overlap) / stride)))
else:
h_steps = max(1, int(math.floor((height - chip_overlap) / stride)))
chips = list()
for j in range(h_steps):
for i in range(w_steps):
x1 = i * stride
y1 = j * stride
x2 = min(x1 + chip_size, width)
y2 = min(y1 + chip_size, height)
chips.append([x1, y1, x2, y2])
# check chip size
for item in chips:
if item[2] - item[0] > chip_size * 1.1 or item[3] - item[
1] > chip_size * 1.1:
raise ValueError(item)
chips = np.array(chips, dtype=np.float32)
raw_size_chips = chips / scale
return raw_size_chips
def _get_valid_boxes_and_pos_chips(self, gt_bbox, chips):
valid_ratio_range = self._cur_valid_ratio_range
im_size = self._cur_im_size
scale = self._cur_scale
# Nx4 N
valid_boxes, valid_boxes_idx = self._validate_boxes(
valid_ratio_range, im_size, gt_bbox, scale)
# dict: chipid->[box_id, ...]
pos_chip2boxes_idx = self._find_pos_chips(chips, valid_boxes,
valid_boxes_idx)
return pos_chip2boxes_idx
def _validate_boxes(self,
valid_ratio_range: List[float],
im_size: int,
gt_boxes: 'np.array of Nx4',
scale: float):
"""
:return: valid_boxes: Nx4, valid_boxes_idx: N
"""
ws = (gt_boxes[:, 2] - gt_boxes[:, 0]).astype(np.int32)
hs = (gt_boxes[:, 3] - gt_boxes[:, 1]).astype(np.int32)
maxs = np.maximum(ws, hs)
box_ratio = maxs / im_size
mins = np.minimum(ws, hs)
target_mins = mins * scale
low = valid_ratio_range[0] if valid_ratio_range[0] > 0 else 0
high = valid_ratio_range[1] if valid_ratio_range[1] > 0 else np.finfo(
np.float32).max
valid_boxes_idx = np.nonzero((low <= box_ratio) & (box_ratio < high) & (
target_mins >= 2))[0]
valid_boxes = gt_boxes[valid_boxes_idx]
return valid_boxes, valid_boxes_idx
def _find_pos_chips(self,
chips: 'Cx4',
valid_boxes: 'Bx4',
valid_boxes_idx: 'B'):
"""
:return: pos_chip2boxes_idx, dict: chipid->[box_id, ...]
"""
iob = intersection_over_box(chips, valid_boxes) # overlap, CxB
iob_threshold_to_find_chips = 1.
pos_chip_ids, _ = self._find_chips_to_cover_overlaped_boxes(
iob, iob_threshold_to_find_chips)
pos_chip_ids = set(pos_chip_ids)
iob_threshold_to_assign_box = 0.5
pos_chip2boxes_idx = self._assign_boxes_to_pos_chips(
iob, iob_threshold_to_assign_box, pos_chip_ids, valid_boxes_idx)
return pos_chip2boxes_idx
def _find_chips_to_cover_overlaped_boxes(self, iob, overlap_threshold):
return find_chips_to_cover_overlaped_boxes(iob, overlap_threshold)
def _assign_boxes_to_pos_chips(self, iob, overlap_threshold, pos_chip_ids,
valid_boxes_idx):
chip_ids, box_ids = np.nonzero(iob >= overlap_threshold)
pos_chip2boxes_idx = defaultdict(list)
for chip_id, box_id in zip(chip_ids, box_ids):
if chip_id not in pos_chip_ids:
continue
raw_gt_box_idx = valid_boxes_idx[box_id]
pos_chip2boxes_idx[chip_id].append(raw_gt_box_idx)
return pos_chip2boxes_idx
def _get_neg_boxes_and_chips(self,
chips: 'Cx4',
pos_chip_ids: 'D',
proposals: 'Px4'):
"""
:param chips:
:param pos_chip_ids:
:param proposals:
:return: neg_chip2box_num, None or dict: chipid->neg_box_num
"""
if not self.use_neg_chip:
return None
        # train proposals may be None
if proposals is None or len(proposals) < 1:
return None
valid_ratio_range = self._cur_valid_ratio_range
im_size = self._cur_im_size
scale = self._cur_scale
valid_props, _ = self._validate_boxes(valid_ratio_range, im_size,
proposals, scale)
neg_boxes = self._find_neg_boxes(chips, pos_chip_ids, valid_props)
neg_chip2box_num = self._find_neg_chips(chips, pos_chip_ids, neg_boxes)
return neg_chip2box_num
def _find_neg_boxes(self,
chips: 'Cx4',
pos_chip_ids: 'D',
valid_props: 'Px4'):
"""
:return: neg_boxes: Nx4
"""
if len(pos_chip_ids) == 0:
return valid_props
pos_chips = chips[pos_chip_ids]
iob = intersection_over_box(pos_chips, valid_props)
overlap_per_prop = np.max(iob, axis=0)
non_overlap_props_idx = overlap_per_prop < 0.5
neg_boxes = valid_props[non_overlap_props_idx]
return neg_boxes
def _find_neg_chips(self, chips: 'Cx4', pos_chip_ids: 'D',
neg_boxes: 'Nx4'):
"""
:return: neg_chip2box_num, dict: chipid->neg_box_num
"""
neg_chip_ids = np.setdiff1d(np.arange(len(chips)), pos_chip_ids)
neg_chips = chips[neg_chip_ids]
iob = intersection_over_box(neg_chips, neg_boxes)
iob_threshold_to_find_chips = 0.7
chosen_neg_chip_ids, chip_id2overlap_box_num = \
self._find_chips_to_cover_overlaped_boxes(iob, iob_threshold_to_find_chips)
neg_chipid2box_num = {}
for cid in chosen_neg_chip_ids:
box_num = chip_id2overlap_box_num[cid]
raw_chip_id = neg_chip_ids[cid]
neg_chipid2box_num[raw_chip_id] = box_num
return neg_chipid2box_num
def crop_infer_anno_records(self, records: List[dict]):
"""
        transform image records to chip records
:param records:
:return: new_records, list of dict like
{
'im_file': 'fake_image1.jpg',
'im_id': np.array([1]), # new _global_chip_id as im_id
'h': h, # chip height
'w': w, # chip width
'chip': [x1, y1, x2, y2] # added
'ori_im_h': ori_im_h # added, origin image height
'ori_im_w': ori_im_w # added, origin image width
'scale_i': 0 # added,
}
"""
self.chip_records = []
self._global_chip_id = 1 # im_id start from 1
self._global_chip_id2img_id = {}
for r in records:
for scale_i in range(self.scale_num):
self._get_current_scale_parameters(scale_i, r)
# Cx4
chips = self._create_chips(r['h'], r['w'], self._cur_scale)
cur_img_chip_record = self._get_chips_records(r, chips, scale_i)
self.chip_records.extend(cur_img_chip_record)
return self.chip_records
def _get_chips_records(self, rec, chips, scale_i):
cur_img_chip_records = []
ori_im_h = rec["h"]
ori_im_w = rec["w"]
im_file = rec["im_file"]
ori_im_id = rec["im_id"]
for id, chip in enumerate(chips):
chip_rec = {}
x1, y1, x2, y2 = chip
chip_h = y2 - y1
chip_w = x2 - x1
chip_rec["im_file"] = im_file
chip_rec["im_id"] = self._global_chip_id
chip_rec["h"] = chip_h
chip_rec["w"] = chip_w
chip_rec["chip"] = chip
chip_rec["ori_im_h"] = ori_im_h
chip_rec["ori_im_w"] = ori_im_w
chip_rec["scale_i"] = scale_i
self._global_chip_id2img_id[self._global_chip_id] = int(ori_im_id)
self._global_chip_id += 1
cur_img_chip_records.append(chip_rec)
return cur_img_chip_records
def aggregate_chips_detections(self, results, records=None):
"""
# 1. transform chip dets to image dets
# 2. nms boxes per image;
# 3. format output results
:param results:
        :param records:
:return:
"""
results = deepcopy(results)
records = records if records else self.chip_records
img_id2bbox = self._transform_chip2image_bboxes(results, records)
nms_img_id2bbox = self._nms_dets(img_id2bbox)
aggregate_results = self._reformat_results(nms_img_id2bbox)
return aggregate_results
def _transform_chip2image_bboxes(self, results, records):
# 1. Transform chip dets to image dets;
# 2. Filter valid range;
# 3. Reformat and Aggregate chip dets to Get scale_cls_dets
img_id2bbox = defaultdict(list)
for result in results:
bbox_locs = result['bbox']
bbox_nums = result['bbox_num']
if len(bbox_locs) == 1 and bbox_locs[0][
0] == -1: # current batch has no detections
# bbox_locs = array([[-1.]], dtype=float32); bbox_nums = [[1]]
# MultiClassNMS output: If there is no detected boxes for all images, lod will be set to {1} and Out only contains one value which is -1.
continue
im_ids = result['im_id'] # replace with range(len(bbox_nums))
last_bbox_num = 0
for idx, im_id in enumerate(im_ids):
cur_bbox_len = bbox_nums[idx]
bboxes = bbox_locs[last_bbox_num:last_bbox_num + cur_bbox_len]
last_bbox_num += cur_bbox_len
# box: [num_id, score, xmin, ymin, xmax, ymax]
if len(bboxes) == 0: # current image has no detections
continue
chip_rec = records[int(im_id) -
1] # im_id starts from 1, type is np.int64
image_size = max(chip_rec["ori_im_h"], chip_rec["ori_im_w"])
bboxes = transform_chip_boxes2image_boxes(
bboxes, chip_rec["chip"], chip_rec["ori_im_h"],
chip_rec["ori_im_w"])
scale_i = chip_rec["scale_i"]
cur_scale = self._get_current_scale(self.target_sizes[scale_i],
image_size)
_, valid_boxes_idx = self._validate_boxes(
self.valid_box_ratio_ranges[scale_i], image_size,
bboxes[:, 2:], cur_scale)
ori_img_id = self._global_chip_id2img_id[int(im_id)]
img_id2bbox[ori_img_id].append(bboxes[valid_boxes_idx])
return img_id2bbox
def _nms_dets(self, img_id2bbox):
# 1. NMS on each image-class
# 2. Limit number of detections to MAX_PER_IMAGE if requested
max_per_img = self.max_per_img
nms_thresh = self.nms_thresh
for img_id in img_id2bbox:
box = img_id2bbox[
img_id] # list of np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2]
box = np.concatenate(box, axis=0)
nms_dets = nms(box, nms_thresh)
if max_per_img > 0:
if len(nms_dets) > max_per_img:
keep = np.argsort(-nms_dets[:, 1])[:max_per_img]
nms_dets = nms_dets[keep]
img_id2bbox[img_id] = nms_dets
return img_id2bbox
def _reformat_results(self, img_id2bbox):
"""reformat results"""
im_ids = img_id2bbox.keys()
results = []
for img_id in im_ids: # output by original im_id order
if len(img_id2bbox[img_id]) == 0:
bbox = np.array(
[[-1., 0., 0., 0., 0., 0.]]) # edge case: no detections
bbox_num = np.array([0])
else:
# np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2]
bbox = img_id2bbox[img_id]
bbox_num = np.array([len(bbox)])
res = dict(im_id=np.array([[img_id]]), bbox=bbox, bbox_num=bbox_num)
results.append(res)
return results
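# Minimal usage sketch (editor's illustration; the record below is synthetic;
# run with `python -m ppdet.data.crop_utils.annotation_cropper`, assuming
# ppdet's dependencies are installed):
if __name__ == '__main__':
    cropper = AnnoCropper(
        image_target_sizes=[2000, 1000],
        valid_box_ratio_ranges=[[-1, 0.1], [0.08, -1]],
        chip_target_size=500,
        chip_target_stride=200)
    record = {
        'im_file': 'fake_image1.jpg',
        'im_id': np.array([1]),
        'h': 1500,
        'w': 2250,
        'gt_bbox': np.array([[10., 10., 110., 110.]], dtype=np.float32),
        'is_crowd': np.zeros((1, 1), dtype=np.int32),
        'gt_class': np.zeros((1, 1), dtype=np.int32),
    }
    chip_records = cropper.crop_anno_records([record])
    # each chip record carries its own im_id, chip size and shifted gt_bbox
    print(len(chip_records), chip_records[0]['chip'])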
================================================
FILE: ppdet/data/crop_utils/chip_box_utils.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
def bbox_area(boxes):
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
def intersection_over_box(chips, boxes):
"""
intersection area over box area
:param chips: C
:param boxes: B
:return: iob, CxB
"""
M = chips.shape[0]
N = boxes.shape[0]
if M * N == 0:
return np.zeros([M, N], dtype='float32')
box_area = bbox_area(boxes) # B
inter_x2y2 = np.minimum(np.expand_dims(chips, 1)[:, :, 2:],
boxes[:, 2:]) # CxBX2
inter_x1y1 = np.maximum(np.expand_dims(chips, 1)[:, :, :2],
boxes[:, :2]) # CxBx2
inter_wh = inter_x2y2 - inter_x1y1
inter_wh = np.clip(inter_wh, a_min=0, a_max=None)
inter_area = inter_wh[:, :, 0] * inter_wh[:, :, 1] # CxB
iob = inter_area / np.expand_dims(box_area, 0)
return iob
def clip_boxes(boxes, im_shape):
"""
Clip boxes to image boundaries.
:param boxes: [N, 4]
:param im_shape: tuple of 2, [h, w]
:return: [N, 4]
"""
# x1 >= 0
boxes[:, 0] = np.clip(boxes[:, 0], 0, im_shape[1] - 1)
# y1 >= 0
boxes[:, 1] = np.clip(boxes[:, 1], 0, im_shape[0] - 1)
# x2 < im_shape[1]
boxes[:, 2] = np.clip(boxes[:, 2], 1, im_shape[1])
# y2 < im_shape[0]
boxes[:, 3] = np.clip(boxes[:, 3], 1, im_shape[0])
return boxes
def transform_chip_box(gt_bbox: 'Gx4', boxes_idx: 'B', chip: '4'):
boxes_idx = np.array(boxes_idx)
cur_gt_bbox = gt_bbox[boxes_idx].copy() # Bx4
x1, y1, x2, y2 = chip
cur_gt_bbox[:, 0] -= x1
cur_gt_bbox[:, 1] -= y1
cur_gt_bbox[:, 2] -= x1
cur_gt_bbox[:, 3] -= y1
h = y2 - y1
w = x2 - x1
cur_gt_bbox = clip_boxes(cur_gt_bbox, (h, w))
ws = (cur_gt_bbox[:, 2] - cur_gt_bbox[:, 0]).astype(np.int32)
hs = (cur_gt_bbox[:, 3] - cur_gt_bbox[:, 1]).astype(np.int32)
valid_idx = (ws >= 2) & (hs >= 2)
return cur_gt_bbox[valid_idx], boxes_idx[valid_idx]
def find_chips_to_cover_overlaped_boxes(iob, overlap_threshold):
chip_ids, box_ids = np.nonzero(iob >= overlap_threshold)
chip_id2overlap_box_num = np.bincount(chip_ids) # 1d array
chip_id2overlap_box_num = np.pad(
chip_id2overlap_box_num, (0, len(iob) - len(chip_id2overlap_box_num)),
constant_values=0)
chosen_chip_ids = []
while len(box_ids) > 0:
value_counts = np.bincount(chip_ids) # 1d array
max_count_chip_id = np.argmax(value_counts)
assert max_count_chip_id not in chosen_chip_ids
chosen_chip_ids.append(max_count_chip_id)
box_ids_in_cur_chip = box_ids[chip_ids == max_count_chip_id]
ids_not_in_cur_boxes_mask = np.logical_not(
np.isin(box_ids, box_ids_in_cur_chip))
chip_ids = chip_ids[ids_not_in_cur_boxes_mask]
box_ids = box_ids[ids_not_in_cur_boxes_mask]
return chosen_chip_ids, chip_id2overlap_box_num
def transform_chip_boxes2image_boxes(chip_boxes, chip, img_h, img_w):
chip_boxes = np.array(sorted(chip_boxes, key=lambda item: -item[1]))
xmin, ymin, _, _ = chip
# Transform to origin image loc
chip_boxes[:, 2] += xmin
chip_boxes[:, 4] += xmin
chip_boxes[:, 3] += ymin
chip_boxes[:, 5] += ymin
chip_boxes = clip_boxes(chip_boxes, (img_h, img_w))
return chip_boxes
def nms(dets, thresh):
"""Apply classic DPM-style greedy NMS."""
if dets.shape[0] == 0:
return dets[[], :]
scores = dets[:, 1]
x1 = dets[:, 2]
y1 = dets[:, 3]
x2 = dets[:, 4]
y2 = dets[:, 5]
areas = (x2 - x1 + 1) * (y2 - y1 + 1)
order = scores.argsort()[::-1]
ndets = dets.shape[0]
suppressed = np.zeros((ndets), dtype=np.int32)
# nominal indices
# _i, _j
# sorted indices
# i, j
# temp variables for box i's (the box currently under consideration)
# ix1, iy1, ix2, iy2, iarea
# variables for computing overlap with box j (lower scoring box)
# xx1, yy1, xx2, yy2
# w, h
# inter, ovr
for _i in range(ndets):
i = order[_i]
if suppressed[i] == 1:
continue
ix1 = x1[i]
iy1 = y1[i]
ix2 = x2[i]
iy2 = y2[i]
iarea = areas[i]
for _j in range(_i + 1, ndets):
j = order[_j]
if suppressed[j] == 1:
continue
xx1 = max(ix1, x1[j])
yy1 = max(iy1, y1[j])
xx2 = min(ix2, x2[j])
yy2 = min(iy2, y2[j])
w = max(0.0, xx2 - xx1 + 1)
h = max(0.0, yy2 - yy1 + 1)
inter = w * h
ovr = inter / (iarea + areas[j] - inter)
if ovr >= thresh:
suppressed[j] = 1
keep = np.where(suppressed == 0)[0]
dets = dets[keep, :]
return dets
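# Small numeric check (editor's sketch): the single chip fully covers the
# first box, and greedy NMS keeps the higher-scoring of two overlapping dets.
if __name__ == '__main__':
    chips = np.array([[0., 0., 100., 100.]], dtype=np.float32)
    boxes = np.array([[10., 10., 50., 50.], [90., 90., 150., 150.]],
                     dtype=np.float32)
    print(intersection_over_box(chips, boxes))  # -> [[1.0, ~0.028]]

    # dets rows are [label, score, x1, y1, x2, y2]
    dets = np.array([
        [0., 0.9, 10., 10., 50., 50.],
        [0., 0.8, 12., 12., 52., 52.],  # heavy overlap, suppressed
        [0., 0.7, 200., 200., 240., 240.],
    ])
    assert len(nms(dets, thresh=0.5)) == 2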
================================================
FILE: ppdet/data/culane_utils.py
================================================
import math
import numpy as np
from imgaug.augmentables.lines import LineString
from scipy.interpolate import InterpolatedUnivariateSpline
def lane_to_linestrings(lanes):
lines = []
for lane in lanes:
lines.append(LineString(lane))
return lines
def linestrings_to_lanes(lines):
lanes = []
for line in lines:
lanes.append(line.coords)
return lanes
def sample_lane(points, sample_ys, img_w):
# this function expects the points to be sorted
points = np.array(points)
if not np.all(points[1:, 1] < points[:-1, 1]):
        raise Exception('Annotation points have to be sorted')
x, y = points[:, 0], points[:, 1]
# interpolate points inside domain
assert len(points) > 1
interp = InterpolatedUnivariateSpline(
y[::-1], x[::-1], k=min(3, len(points) - 1))
domain_min_y = y.min()
domain_max_y = y.max()
sample_ys_inside_domain = sample_ys[(sample_ys >= domain_min_y) & (
sample_ys <= domain_max_y)]
assert len(sample_ys_inside_domain) > 0
interp_xs = interp(sample_ys_inside_domain)
# extrapolate lane to the bottom of the image with a straight line using the 2 points closest to the bottom
two_closest_points = points[:2]
extrap = np.polyfit(
two_closest_points[:, 1], two_closest_points[:, 0], deg=1)
extrap_ys = sample_ys[sample_ys > domain_max_y]
extrap_xs = np.polyval(extrap, extrap_ys)
all_xs = np.hstack((extrap_xs, interp_xs))
# separate between inside and outside points
inside_mask = (all_xs >= 0) & (all_xs < img_w)
xs_inside_image = all_xs[inside_mask]
xs_outside_image = all_xs[~inside_mask]
return xs_outside_image, xs_inside_image
def filter_lane(lane):
assert lane[-1][1] <= lane[0][1]
filtered_lane = []
used = set()
for p in lane:
if p[1] not in used:
filtered_lane.append(p)
used.add(p[1])
return filtered_lane
def transform_annotation(img_w, img_h, max_lanes, n_offsets, offsets_ys,
n_strips, strip_size, anno):
old_lanes = anno['lanes']
# removing lanes with less than 2 points
old_lanes = filter(lambda x: len(x) > 1, old_lanes)
# sort lane points by Y (bottom to top of the image)
old_lanes = [sorted(lane, key=lambda x: -x[1]) for lane in old_lanes]
# remove points with same Y (keep first occurrence)
old_lanes = [filter_lane(lane) for lane in old_lanes]
# normalize the annotation coordinates
old_lanes = [[[x * img_w / float(img_w), y * img_h / float(img_h)]
for x, y in lane] for lane in old_lanes]
    # create transformed annotations
lanes = np.ones(
(max_lanes, 2 + 1 + 1 + 2 + n_offsets), dtype=np.float32
) * -1e5 # 2 scores, 1 start_y, 1 start_x, 1 theta, 1 length, S+1 coordinates
lanes_endpoints = np.ones((max_lanes, 2))
# lanes are invalid by default
lanes[:, 0] = 1
lanes[:, 1] = 0
for lane_idx, lane in enumerate(old_lanes):
if lane_idx >= max_lanes:
break
try:
xs_outside_image, xs_inside_image = sample_lane(lane, offsets_ys,
img_w)
except AssertionError:
continue
if len(xs_inside_image) <= 1:
continue
all_xs = np.hstack((xs_outside_image, xs_inside_image))
lanes[lane_idx, 0] = 0
lanes[lane_idx, 1] = 1
lanes[lane_idx, 2] = len(xs_outside_image) / n_strips
lanes[lane_idx, 3] = xs_inside_image[0]
thetas = []
for i in range(1, len(xs_inside_image)):
theta = math.atan(
i * strip_size /
(xs_inside_image[i] - xs_inside_image[0] + 1e-5)) / math.pi
theta = theta if theta > 0 else 1 - abs(theta)
thetas.append(theta)
theta_far = sum(thetas) / len(thetas)
# lanes[lane_idx,
# 4] = (theta_closest + theta_far) / 2 # averaged angle
lanes[lane_idx, 4] = theta_far
lanes[lane_idx, 5] = len(xs_inside_image)
lanes[lane_idx, 6:6 + len(all_xs)] = all_xs
lanes_endpoints[lane_idx, 0] = (len(all_xs) - 1) / n_strips
lanes_endpoints[lane_idx, 1] = xs_inside_image[-1]
new_anno = {
'label': lanes,
'old_anno': anno,
'lane_endpoints': lanes_endpoints
}
return new_anno
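# Worked example for `sample_lane` (editor's sketch; assumes scipy and imgaug
# are installed). Lane points run bottom-to-top, i.e. with decreasing y:
if __name__ == '__main__':
    points = [[50., 90.], [60., 60.], [70., 30.]]
    sample_ys = np.arange(100, 0, -10)
    xs_outside, xs_inside = sample_lane(points, sample_ys, img_w=800)
    # ys in [30, 90] are interpolated; y=100 (below the lane's bottom point)
    # is extrapolated from the two bottom-most points; ys above the lane top
    # are dropped. All xs here fall inside the image, so xs_outside is empty.
    print(xs_inside)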
================================================
FILE: ppdet/data/reader.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import os
import traceback
import six
import sys
import numpy as np
import paddle
import paddle.nn.functional as F
from copy import deepcopy
from paddle.io import DataLoader, DistributedBatchSampler
from .utils import default_collate_fn
from ppdet.core.workspace import register
from . import transform
from .shm_utils import _get_shared_memory_size_in_M
from ppdet.utils.logger import setup_logger
logger = setup_logger('reader')
MAIN_PID = os.getpid()
class Compose(object):
def __init__(self, transforms, num_classes=80):
self.transforms = transforms
self.transforms_cls = []
for t in self.transforms:
for k, v in t.items():
op_cls = getattr(transform, k)
f = op_cls(**v)
if hasattr(f, 'num_classes'):
f.num_classes = num_classes
self.transforms_cls.append(f)
def _update_transforms_cls(self, data):
if 'transform_schedulers' in data:
def is_valid(op):
op_name = op.__class__.__name__
for t in data['transform_schedulers']:
for k, v in t.items():
if op_name == k:
# [start_epoch, stop_epoch)
start_epoch = v.get('start_epoch', 0)
if start_epoch > data['curr_epoch']:
return False
stop_epoch = v.get('stop_epoch', float('inf'))
if stop_epoch <= data['curr_epoch']:
return False
return True
return filter(is_valid, self.transforms_cls)
else:
return self.transforms_cls
def __call__(self, data):
transforms_cls = self._update_transforms_cls(data)
for f in transforms_cls:
try:
data = f(data)
except Exception as e:
stack_info = traceback.format_exc()
logger.warning("fail to map sample transform [{}] "
"with error: {} and stack:\n{}".format(
f, e, str(stack_info)))
raise e
return data
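# Pipeline sketch (editor's illustration; `Decode` and `Resize` are ops from
# ppdet.data.transform, and `demo.jpg` is a hypothetical image file):
#
#     pipeline = Compose(
#         [{'Decode': {}},
#          {'Resize': {'target_size': [640, 640], 'keep_ratio': False}}],
#         num_classes=80)
#     sample = pipeline({'im_file': 'demo.jpg', 'im_id': np.array([0])})
#     # sample['image'] now holds the decoded, resized image array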
class BatchCompose(Compose):
def __init__(self, transforms, num_classes=80, collate_batch=True):
super(BatchCompose, self).__init__(transforms, num_classes)
self.collate_batch = collate_batch
def __call__(self, data):
transforms_cls = self._update_transforms_cls(data[0])
for f in transforms_cls:
try:
data = f(data)
except Exception as e:
stack_info = traceback.format_exc()
logger.warning("fail to map batch transform [{}] "
"with error: {} and stack:\n{}".format(
f, e, str(stack_info)))
raise e
        # remove keys which are not needed by the model
extra_key = ['h', 'w', 'flipped', 'transform_schedulers']
for k in extra_key:
for sample in data:
if k in sample:
sample.pop(k)
        # batch data; if a user-defined batch function is needed, use it here
if self.collate_batch:
batch_data = default_collate_fn(data)
else:
batch_data = {}
for k in data[0].keys():
tmp_data = []
for i in range(len(data)):
tmp_data.append(data[i][k])
                if 'gt_' not in k and 'is_crowd' not in k and 'difficult' not in k:
tmp_data = np.stack(tmp_data, axis=0)
if 'origin_' in k:
tmp_data = np.stack(tmp_data, axis=0)
batch_data[k] = tmp_data
return batch_data
class BaseDataLoader(object):
"""
Base DataLoader implementation for detection models
Args:
sample_transforms (list): a list of transforms to perform
on each sample
batch_transforms (list): a list of transforms to perform
on batch
batch_size (int): batch size for batch collating, default 1.
shuffle (bool): whether to shuffle samples
        drop_last (bool): whether to drop the last incomplete batch,
            default False
num_classes (int): class number of dataset, default 80
        collate_batch (bool): whether to collate batch in dataloader.
            If set to True, the samples will be collated into batches
            according to the batch size. Otherwise, the ground-truth
            fields will not be collated, which is used when the number
            of ground-truths differs across samples.
        use_shared_memory (bool): whether to use shared memory to
            accelerate data loading. Enable this only if you are sure
            that the shared memory size of your OS is larger than the
            memory cost of the model's input data. Note that shared
            memory will be automatically disabled if the shared memory
            of the OS is less than 1G, which is not enough for
            detection models. Default False.
"""
def __init__(self,
sample_transforms=[],
batch_transforms=[],
batch_size=1,
shuffle=False,
drop_last=False,
num_classes=80,
collate_batch=True,
use_shared_memory=False,
**kwargs):
# sample transform
self._sample_transforms = Compose(
sample_transforms, num_classes=num_classes)
        # batch transform
self._batch_transforms = BatchCompose(batch_transforms, num_classes,
collate_batch)
self.batch_size = batch_size
self.shuffle = shuffle
self.drop_last = drop_last
self.use_shared_memory = use_shared_memory
self.kwargs = kwargs
def __call__(self,
dataset,
worker_num,
batch_sampler=None,
return_list=False):
self.dataset = dataset
self.dataset.check_or_download_dataset()
self.dataset.parse_dataset()
# get data
self.dataset.set_transform(self._sample_transforms)
# set kwargs
self.dataset.set_kwargs(**self.kwargs)
# batch sampler
if batch_sampler is None:
self._batch_sampler = DistributedBatchSampler(
self.dataset,
batch_size=self.batch_size,
shuffle=self.shuffle,
drop_last=self.drop_last)
else:
self._batch_sampler = batch_sampler
        # DataLoader does not start sub-processes on Windows and macOS,
        # so there is no need to use shared memory
use_shared_memory = self.use_shared_memory and \
sys.platform not in ['win32', 'darwin']
        # check whether the shared memory size is larger than 1G (1024M)
if use_shared_memory:
shm_size = _get_shared_memory_size_in_M()
if shm_size is not None and shm_size < 1024.:
logger.warning("Shared memory size is less than 1G, "
"disable shared_memory in DataLoader")
use_shared_memory = False
self.dataloader = DataLoader(
dataset=self.dataset,
batch_sampler=self._batch_sampler,
collate_fn=self._batch_transforms,
num_workers=worker_num,
return_list=return_list,
use_shared_memory=use_shared_memory)
self.loader = iter(self.dataloader)
return self
def __len__(self):
return len(self._batch_sampler)
def __iter__(self):
return self
def __next__(self):
try:
return next(self.loader)
except StopIteration:
self.loader = iter(self.dataloader)
six.reraise(*sys.exc_info())
def next(self):
# python2 compatibility
return self.__next__()
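# A toy sketch (illustration only) of the restart semantics implemented in
# __next__ above: StopIteration is re-raised so the current epoch ends, but
# the internal iterator has already been rebuilt, so the next epoch can be
# iterated without re-creating the loader object.
def _example_epoch_restart():
    class _Loader(object):
        def __init__(self, items):
            self.items = items
            self.it = iter(items)

        def __iter__(self):
            return self

        def __next__(self):
            try:
                return next(self.it)
            except StopIteration:
                self.it = iter(self.items)  # rewind for the next epoch
                raise

    loader = _Loader([1, 2])
    assert list(loader) == [1, 2]
    assert list(loader) == [1, 2]  # a second epoch works out of the box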
@register
class TrainReader(BaseDataLoader):
__shared__ = ['num_classes']
def __init__(self,
sample_transforms=[],
batch_transforms=[],
batch_size=1,
shuffle=True,
drop_last=True,
num_classes=80,
collate_batch=True,
**kwargs):
super(TrainReader, self).__init__(sample_transforms, batch_transforms,
batch_size, shuffle, drop_last,
num_classes, collate_batch, **kwargs)
@register
class EvalReader(BaseDataLoader):
__shared__ = ['num_classes']
def __init__(self,
sample_transforms=[],
batch_transforms=[],
batch_size=1,
shuffle=False,
drop_last=False,
num_classes=80,
**kwargs):
super(EvalReader, self).__init__(sample_transforms, batch_transforms,
batch_size, shuffle, drop_last,
num_classes, **kwargs)
@register
class TestReader(BaseDataLoader):
__shared__ = ['num_classes']
def __init__(self,
sample_transforms=[],
batch_transforms=[],
batch_size=1,
shuffle=False,
drop_last=False,
num_classes=80,
**kwargs):
super(TestReader, self).__init__(sample_transforms, batch_transforms,
batch_size, shuffle, drop_last,
num_classes, **kwargs)
@register
class EvalMOTReader(BaseDataLoader):
__shared__ = ['num_classes']
def __init__(self,
sample_transforms=[],
batch_transforms=[],
batch_size=1,
shuffle=False,
drop_last=False,
num_classes=1,
**kwargs):
super(EvalMOTReader, self).__init__(sample_transforms, batch_transforms,
batch_size, shuffle, drop_last,
num_classes, **kwargs)
@register
class TestMOTReader(BaseDataLoader):
__shared__ = ['num_classes']
def __init__(self,
sample_transforms=[],
batch_transforms=[],
batch_size=1,
shuffle=False,
drop_last=False,
num_classes=1,
**kwargs):
super(TestMOTReader, self).__init__(sample_transforms, batch_transforms,
batch_size, shuffle, drop_last,
num_classes, **kwargs)
# For Semi-Supervised Object Detection (SSOD)
class Compose_SSOD(object):
    def __init__(self, base_transforms, weak_aug, strong_aug, num_classes=80):
        def build_transforms(transforms):
            # instantiate each {op_name: kwargs} entry and propagate num_classes
            cls_list = []
            for t in transforms:
                for k, v in t.items():
                    op_cls = getattr(transform, k)
                    f = op_cls(**v)
                    if hasattr(f, 'num_classes'):
                        f.num_classes = num_classes
                    cls_list.append(f)
            return cls_list

        self.base_transforms = base_transforms
        self.base_transforms_cls = build_transforms(base_transforms)
        self.weak_augs = weak_aug
        self.weak_augs_cls = build_transforms(weak_aug)
        self.strong_augs = strong_aug
        self.strong_augs_cls = build_transforms(strong_aug)
def __call__(self, data):
for f in self.base_transforms_cls:
try:
data = f(data)
except Exception as e:
stack_info = traceback.format_exc()
logger.warning("fail to map sample transform [{}] "
"with error: {} and stack:\n{}".format(
f, e, str(stack_info)))
raise e
weak_data = deepcopy(data)
strong_data = deepcopy(data)
for f in self.weak_augs_cls:
try:
weak_data = f(weak_data)
except Exception as e:
stack_info = traceback.format_exc()
logger.warning("fail to map weak aug [{}] "
"with error: {} and stack:\n{}".format(
f, e, str(stack_info)))
raise e
for f in self.strong_augs_cls:
try:
strong_data = f(strong_data)
except Exception as e:
stack_info = traceback.format_exc()
logger.warning("fail to map strong aug [{}] "
"with error: {} and stack:\n{}".format(
f, e, str(stack_info)))
raise e
weak_data['strong_aug'] = strong_data
return weak_data
class BatchCompose_SSOD(Compose):
def __init__(self, transforms, num_classes=80, collate_batch=True):
super(BatchCompose_SSOD, self).__init__(transforms, num_classes)
self.collate_batch = collate_batch
def __call__(self, data):
# split strong_data from data(weak_data)
strong_data = []
for sample in data:
strong_data.append(sample['strong_aug'])
sample.pop('strong_aug')
for f in self.transforms_cls:
try:
data = f(data)
if 'BatchRandomResizeForSSOD' in f._id:
strong_data = f(strong_data, data[1])[0]
data = data[0]
else:
strong_data = f(strong_data)
except Exception as e:
stack_info = traceback.format_exc()
logger.warning("fail to map batch transform [{}] "
"with error: {} and stack:\n{}".format(
f, e, str(stack_info)))
raise e
        # remove keys which are not needed by the model
extra_key = ['h', 'w', 'flipped']
for k in extra_key:
for sample in data:
if k in sample:
sample.pop(k)
for sample in strong_data:
if k in sample:
sample.pop(k)
        # batch the data; if a user-defined batch function is needed,
        # it is applied here
if self.collate_batch:
batch_data = default_collate_fn(data)
strong_batch_data = default_collate_fn(strong_data)
return batch_data, strong_batch_data
else:
batch_data = {}
for k in data[0].keys():
tmp_data = []
for i in range(len(data)):
tmp_data.append(data[i][k])
                if 'gt_' not in k and 'is_crowd' not in k and 'difficult' not in k:
                    tmp_data = np.stack(tmp_data, axis=0)
                batch_data[k] = tmp_data
strong_batch_data = {}
for k in strong_data[0].keys():
tmp_data = []
for i in range(len(strong_data)):
tmp_data.append(strong_data[i][k])
                if 'gt_' not in k and 'is_crowd' not in k and 'difficult' not in k:
                    tmp_data = np.stack(tmp_data, axis=0)
                strong_batch_data[k] = tmp_data
return batch_data, strong_batch_data
class CombineSSODLoader(object):
def __init__(self, label_loader, unlabel_loader):
self.label_loader = label_loader
self.unlabel_loader = unlabel_loader
def __iter__(self):
while True:
            try:
                label_samples = next(self.label_loader_iter)
            except (AttributeError, StopIteration):
                # first call (iterator not created yet) or epoch exhausted
                self.label_loader_iter = iter(self.label_loader)
                label_samples = next(self.label_loader_iter)
            try:
                unlabel_samples = next(self.unlabel_loader_iter)
            except (AttributeError, StopIteration):
                # first call (iterator not created yet) or epoch exhausted
                self.unlabel_loader_iter = iter(self.unlabel_loader)
                unlabel_samples = next(self.unlabel_loader_iter)
yield (
label_samples[0], # sup weak
label_samples[1], # sup strong
unlabel_samples[0], # unsup weak
unlabel_samples[1] # unsup strong
)
def __call__(self):
return self.__iter__()
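# A toy sketch (illustration only) of the cycling behaviour above: two
# iterables of different lengths are paired forever, with the exhausted one
# transparently restarted, which is how labeled and unlabeled batches are
# drawn in lockstep.
def _example_combine_cycle():
    label, unlabel = [1, 2, 3], ['a', 'b']
    label_it, unlabel_it = iter(label), iter(unlabel)
    pairs = []
    for _ in range(4):
        try:
            l = next(label_it)
        except StopIteration:
            label_it = iter(label)
            l = next(label_it)
        try:
            u = next(unlabel_it)
        except StopIteration:
            unlabel_it = iter(unlabel)
            u = next(unlabel_it)
        pairs.append((l, u))
    assert pairs == [(1, 'a'), (2, 'b'), (3, 'a'), (1, 'b')]
    return pairs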
class BaseSemiDataLoader(object):
def __init__(self,
sample_transforms=[],
weak_aug=[],
strong_aug=[],
sup_batch_transforms=[],
unsup_batch_transforms=[],
sup_batch_size=1,
unsup_batch_size=1,
shuffle=True,
drop_last=True,
num_classes=80,
collate_batch=True,
use_shared_memory=False,
**kwargs):
# sup transforms
self._sample_transforms_label = Compose_SSOD(
sample_transforms, weak_aug, strong_aug, num_classes=num_classes)
self._batch_transforms_label = BatchCompose_SSOD(
sup_batch_transforms, num_classes, collate_batch)
self.batch_size_label = sup_batch_size
# unsup transforms
self._sample_transforms_unlabel = Compose_SSOD(
sample_transforms, weak_aug, strong_aug, num_classes=num_classes)
self._batch_transforms_unlabel = BatchCompose_SSOD(
unsup_batch_transforms, num_classes, collate_batch)
self.batch_size_unlabel = unsup_batch_size
# common
self.shuffle = shuffle
self.drop_last = drop_last
self.use_shared_memory = use_shared_memory
self.kwargs = kwargs
def __call__(self,
dataset_label,
dataset_unlabel,
worker_num,
batch_sampler_label=None,
batch_sampler_unlabel=None,
return_list=False):
# sup dataset
self.dataset_label = dataset_label
self.dataset_label.check_or_download_dataset()
self.dataset_label.parse_dataset()
self.dataset_label.set_transform(self._sample_transforms_label)
self.dataset_label.set_kwargs(**self.kwargs)
if batch_sampler_label is None:
self._batch_sampler_label = DistributedBatchSampler(
self.dataset_label,
batch_size=self.batch_size_label,
shuffle=self.shuffle,
drop_last=self.drop_last)
else:
self._batch_sampler_label = batch_sampler_label
# unsup dataset
self.dataset_unlabel = dataset_unlabel
self.dataset_unlabel.length = self.dataset_label.__len__()
self.dataset_unlabel.check_or_download_dataset()
self.dataset_unlabel.parse_dataset()
self.dataset_unlabel.set_transform(self._sample_transforms_unlabel)
self.dataset_unlabel.set_kwargs(**self.kwargs)
if batch_sampler_unlabel is None:
self._batch_sampler_unlabel = DistributedBatchSampler(
self.dataset_unlabel,
batch_size=self.batch_size_unlabel,
shuffle=self.shuffle,
drop_last=self.drop_last)
else:
self._batch_sampler_unlabel = batch_sampler_unlabel
        # DataLoader does not start sub-processes on Windows and macOS,
        # so shared memory is not needed there
use_shared_memory = self.use_shared_memory and \
sys.platform not in ['win32', 'darwin']
        # check whether the shared memory size is larger than 1G (1024M)
if use_shared_memory:
shm_size = _get_shared_memory_size_in_M()
if shm_size is not None and shm_size < 1024.:
logger.warning("Shared memory size is less than 1G, "
"disable shared_memory in DataLoader")
use_shared_memory = False
self.dataloader_label = DataLoader(
dataset=self.dataset_label,
batch_sampler=self._batch_sampler_label,
collate_fn=self._batch_transforms_label,
num_workers=worker_num,
return_list=return_list,
use_shared_memory=use_shared_memory)
self.dataloader_unlabel = DataLoader(
dataset=self.dataset_unlabel,
batch_sampler=self._batch_sampler_unlabel,
collate_fn=self._batch_transforms_unlabel,
num_workers=worker_num,
return_list=return_list,
use_shared_memory=use_shared_memory)
self.dataloader = CombineSSODLoader(self.dataloader_label,
self.dataloader_unlabel)
self.loader = iter(self.dataloader)
return self
def __len__(self):
return len(self._batch_sampler_label)
def __iter__(self):
return self
def __next__(self):
return next(self.loader)
def next(self):
# python2 compatibility
return self.__next__()
@register
class SemiTrainReader(BaseSemiDataLoader):
__shared__ = ['num_classes']
def __init__(self,
sample_transforms=[],
weak_aug=[],
strong_aug=[],
sup_batch_transforms=[],
unsup_batch_transforms=[],
sup_batch_size=1,
unsup_batch_size=1,
shuffle=True,
drop_last=True,
num_classes=80,
collate_batch=True,
**kwargs):
super(SemiTrainReader, self).__init__(
sample_transforms, weak_aug, strong_aug, sup_batch_transforms,
unsup_batch_transforms, sup_batch_size, unsup_batch_size, shuffle,
drop_last, num_classes, collate_batch, **kwargs)
================================================
FILE: ppdet/data/shm_utils.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
SIZE_UNIT = ['K', 'M', 'G', 'T']
SHM_QUERY_CMD = 'df -h'
SHM_KEY = 'shm'
SHM_DEFAULT_MOUNT = '/dev/shm'
# [ shared memory size check ]
# In detection models, image/target data occupies a lot of memory and
# will occupy lots of shared memory in a multi-process DataLoader, so we
# use the following code to query the shared memory size and disable
# shared memory use if the size is not enough.
# The shared memory size is obtained as follows:
# 1. use `df -h` to get all mount info
# 2. pick the spaces whose mount info contains 'shm'
# 3. if there is only one 'shm' space, return its size
# 4. if there are multiple 'shm' spaces, try to find the default mount
#    directory '/dev/shm' used on Linux-like systems, otherwise return
#    the biggest space size.
def _parse_size_in_M(size_str):
if size_str[-1] == 'B':
num, unit = size_str[:-2], size_str[-2]
else:
num, unit = size_str[:-1], size_str[-1]
assert unit in SIZE_UNIT, \
"unknown shm size unit {}".format(unit)
return float(num) * \
(1024 ** (SIZE_UNIT.index(unit) - 1))
def _get_shared_memory_size_in_M():
try:
df_infos = os.popen(SHM_QUERY_CMD).readlines()
    except Exception:
return None
else:
shm_infos = []
for df_info in df_infos:
info = df_info.strip()
if info.find(SHM_KEY) >= 0:
shm_infos.append(info.split())
if len(shm_infos) == 0:
return None
elif len(shm_infos) == 1:
return _parse_size_in_M(shm_infos[0][3])
else:
default_mount_infos = [
si for si in shm_infos if si[-1] == SHM_DEFAULT_MOUNT
]
if default_mount_infos:
return _parse_size_in_M(default_mount_infos[0][3])
else:
return max([_parse_size_in_M(si[3]) for si in shm_infos])
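# A small sanity sketch (illustration only) of the unit arithmetic above:
# sizes reported by `df -h` are normalised to megabytes, so 'M' maps to a
# factor of 1, 'G' to 1024 and 'T' to 1024 ** 2; a trailing 'B' is accepted.
def _example_parse_size():
    assert _parse_size_in_M('512M') == 512.0
    assert _parse_size_in_M('2G') == 2048.0
    assert _parse_size_in_M('16GB') == 16384.0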
================================================
FILE: ppdet/data/source/__init__.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import coco
from . import voc
from . import widerface
from . import category
from . import keypoint_coco
from . import mot
from . import sniper_coco
from . import culane
from . import lvis
from .coco import *
from .voc import *
from .widerface import *
from .category import *
from .keypoint_coco import *
from .mot import *
from .sniper_coco import SniperCOCODataSet
from .dataset import ImageFolder
from .pose3d_cmb import *
from .culane import *
from .lvis import *
================================================
FILE: ppdet/data/source/category.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from ppdet.data.source.voc import pascalvoc_label
from ppdet.data.source.widerface import widerface_label
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
__all__ = ['get_categories']
def get_categories(metric_type, anno_file=None, arch=None):
"""
Get class id to category id map and category id
to category name map from annotation file.
Args:
        metric_type (str): metric type; currently supports 'coco', 'voc',
            'oid' and 'widerface'.
anno_file (str): annotation file path
"""
if arch == 'keypoint_arch':
return (None, {'id': 'keypoint'})
    if anno_file is None or not os.path.isfile(anno_file):
        logger.warning(
            "anno_file '{}' is not set or does not exist, "
            "please recheck TrainDataset/EvalDataset/TestDataset.anno_path, "
            "otherwise the default categories of the metric_type will be used.".
            format(anno_file))
    if metric_type.lower() in ('coco', 'rbox', 'snipercoco'):
if anno_file and os.path.isfile(anno_file):
if anno_file.endswith('json'):
# lazy import pycocotools here
from pycocotools.coco import COCO
coco = COCO(anno_file)
cats = coco.loadCats(coco.getCatIds())
clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)}
catid2name = {cat['id']: cat['name'] for cat in cats}
elif anno_file.endswith('txt'):
cats = []
with open(anno_file) as f:
for line in f.readlines():
cats.append(line.strip())
if cats[0] == 'background': cats = cats[1:]
clsid2catid = {i: i for i in range(len(cats))}
catid2name = {i: name for i, name in enumerate(cats)}
else:
raise ValueError("anno_file {} should be json or txt.".format(
anno_file))
return clsid2catid, catid2name
        # anno file does not exist, load the default COCO17 categories
else:
if metric_type.lower() == 'rbox':
logger.warning(
"metric_type: {}, load default categories of DOTA.".format(
metric_type))
return _dota_category()
logger.warning("metric_type: {}, load default categories of COCO.".
format(metric_type))
return _coco17_category()
elif metric_type.lower() == 'voc':
if anno_file and os.path.isfile(anno_file):
cats = []
with open(anno_file) as f:
for line in f.readlines():
cats.append(line.strip())
if cats[0] == 'background':
cats = cats[1:]
clsid2catid = {i: i for i in range(len(cats))}
catid2name = {i: name for i, name in enumerate(cats)}
return clsid2catid, catid2name
        # anno file does not exist, load all 20 default
        # VOC categories
else:
logger.warning("metric_type: {}, load default categories of VOC.".
format(metric_type))
return _vocall_category()
elif metric_type.lower() == 'oid':
if anno_file and os.path.isfile(anno_file):
logger.warning("only default categories support for OID19")
return _oid19_category()
elif metric_type.lower() == 'widerface':
return _widerface_category()
elif metric_type.lower() in [
'keypointtopdowncocoeval', 'keypointtopdownmpiieval',
'keypointtopdowncocowholebadyhandeval'
]:
return (None, {'id': 'keypoint'})
elif metric_type.lower() == 'pose3deval':
return (None, {'id': 'pose3d'})
elif metric_type.lower() in ['mot', 'motdet', 'reid']:
if anno_file and os.path.isfile(anno_file):
cats = []
with open(anno_file) as f:
for line in f.readlines():
cats.append(line.strip())
if cats[0] == 'background':
cats = cats[1:]
clsid2catid = {i: i for i in range(len(cats))}
catid2name = {i: name for i, name in enumerate(cats)}
return clsid2catid, catid2name
        # anno file does not exist, load the default category 'pedestrian'.
else:
logger.warning(
"metric_type: {}, load default categories of pedestrian MOT.".
format(metric_type))
return _mot_category(category='pedestrian')
elif metric_type.lower() in ['kitti', 'bdd100kmot']:
return _mot_category(category='vehicle')
elif metric_type.lower() in ['mcmot']:
if anno_file and os.path.isfile(anno_file):
cats = []
with open(anno_file) as f:
for line in f.readlines():
cats.append(line.strip())
if cats[0] == 'background':
cats = cats[1:]
clsid2catid = {i: i for i in range(len(cats))}
catid2name = {i: name for i, name in enumerate(cats)}
return clsid2catid, catid2name
        # anno file does not exist, load all 10 default VisDrone categories
else:
logger.warning(
"metric_type: {}, load default categories of VisDrone.".format(
metric_type))
return _visdrone_category()
else:
raise ValueError("unknown metric type {}".format(metric_type))
def _mot_category(category='pedestrian'):
"""
Get class id to category id map and category id
to category name map of mot dataset
"""
label_map = {category: 0}
label_map = sorted(label_map.items(), key=lambda x: x[1])
cats = [l[0] for l in label_map]
clsid2catid = {i: i for i in range(len(cats))}
catid2name = {i: name for i, name in enumerate(cats)}
return clsid2catid, catid2name
def _coco17_category():
"""
Get class id to category id map and category id
to category name map of COCO2017 dataset
"""
clsid2catid = {
1: 1,
2: 2,
3: 3,
4: 4,
5: 5,
6: 6,
7: 7,
8: 8,
9: 9,
10: 10,
11: 11,
12: 13,
13: 14,
14: 15,
15: 16,
16: 17,
17: 18,
18: 19,
19: 20,
20: 21,
21: 22,
22: 23,
23: 24,
24: 25,
25: 27,
26: 28,
27: 31,
28: 32,
29: 33,
30: 34,
31: 35,
32: 36,
33: 37,
34: 38,
35: 39,
36: 40,
37: 41,
38: 42,
39: 43,
40: 44,
41: 46,
42: 47,
43: 48,
44: 49,
45: 50,
46: 51,
47: 52,
48: 53,
49: 54,
50: 55,
51: 56,
52: 57,
53: 58,
54: 59,
55: 60,
56: 61,
57: 62,
58: 63,
59: 64,
60: 65,
61: 67,
62: 70,
63: 72,
64: 73,
65: 74,
66: 75,
67: 76,
68: 77,
69: 78,
70: 79,
71: 80,
72: 81,
73: 82,
74: 84,
75: 85,
76: 86,
77: 87,
78: 88,
79: 89,
80: 90
}
catid2name = {
0: 'background',
1: 'person',
2: 'bicycle',
3: 'car',
4: 'motorcycle',
5: 'airplane',
6: 'bus',
7: 'train',
8: 'truck',
9: 'boat',
10: 'traffic light',
11: 'fire hydrant',
13: 'stop sign',
14: 'parking meter',
15: 'bench',
16: 'bird',
17: 'cat',
18: 'dog',
19: 'horse',
20: 'sheep',
21: 'cow',
22: 'elephant',
23: 'bear',
24: 'zebra',
25: 'giraffe',
27: 'backpack',
28: 'umbrella',
31: 'handbag',
32: 'tie',
33: 'suitcase',
34: 'frisbee',
35: 'skis',
36: 'snowboard',
37: 'sports ball',
38: 'kite',
39: 'baseball bat',
40: 'baseball glove',
41: 'skateboard',
42: 'surfboard',
43: 'tennis racket',
44: 'bottle',
46: 'wine glass',
47: 'cup',
48: 'fork',
49: 'knife',
50: 'spoon',
51: 'bowl',
52: 'banana',
53: 'apple',
54: 'sandwich',
55: 'orange',
56: 'broccoli',
57: 'carrot',
58: 'hot dog',
59: 'pizza',
60: 'donut',
61: 'cake',
62: 'chair',
63: 'couch',
64: 'potted plant',
65: 'bed',
67: 'dining table',
70: 'toilet',
72: 'tv',
73: 'laptop',
74: 'mouse',
75: 'remote',
76: 'keyboard',
77: 'cell phone',
78: 'microwave',
79: 'oven',
80: 'toaster',
81: 'sink',
82: 'refrigerator',
84: 'book',
85: 'clock',
86: 'vase',
87: 'scissors',
88: 'teddy bear',
89: 'hair drier',
90: 'toothbrush'
}
clsid2catid = {k - 1: v for k, v in clsid2catid.items()}
catid2name.pop(0)
return clsid2catid, catid2name
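# A sketch (illustration only) of how the two maps above compose: COCO2017
# category ids are sparse (12, 26, 29, ... are unused), so a model's
# contiguous 0-based class id is first remapped to the sparse category id
# and only then resolved to a readable name.
def _example_coco17_lookup():
    clsid2catid, catid2name = _coco17_category()
    assert clsid2catid[11] == 13  # contiguous class 11 -> COCO category 13
    assert catid2name[clsid2catid[11]] == 'stop sign'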
def _dota_category():
"""
Get class id to category id map and category id
to category name map of dota dataset
"""
catid2name = {
0: 'background',
1: 'plane',
2: 'baseball-diamond',
3: 'bridge',
4: 'ground-track-field',
5: 'small-vehicle',
6: 'large-vehicle',
7: 'ship',
8: 'tennis-court',
9: 'basketball-court',
10: 'storage-tank',
11: 'soccer-ball-field',
12: 'roundabout',
13: 'harbor',
14: 'swimming-pool',
15: 'helicopter'
}
catid2name.pop(0)
clsid2catid = {i: i + 1 for i in range(len(catid2name))}
return clsid2catid, catid2name
def _vocall_category():
"""
Get class id to category id map and category id
    to category name map of the VOC dataset (all 20 categories)
"""
label_map = pascalvoc_label()
label_map = sorted(label_map.items(), key=lambda x: x[1])
cats = [l[0] for l in label_map]
clsid2catid = {i: i for i in range(len(cats))}
catid2name = {i: name for i, name in enumerate(cats)}
return clsid2catid, catid2name
def _widerface_category():
label_map = widerface_label()
label_map = sorted(label_map.items(), key=lambda x: x[1])
cats = [l[0] for l in label_map]
clsid2catid = {i: i for i in range(len(cats))}
catid2name = {i: name for i, name in enumerate(cats)}
return clsid2catid, catid2name
def _oid19_category():
clsid2catid = {k: k + 1 for k in range(500)}
catid2name = {
0: "background",
1: "Infant bed",
2: "Rose",
3: "Flag",
4: "Flashlight",
5: "Sea turtle",
6: "Camera",
7: "Animal",
8: "Glove",
9: "Crocodile",
10: "Cattle",
11: "House",
12: "Guacamole",
13: "Penguin",
14: "Vehicle registration plate",
15: "Bench",
16: "Ladybug",
17: "Human nose",
18: "Watermelon",
19: "Flute",
20: "Butterfly",
21: "Washing machine",
22: "Raccoon",
23: "Segway",
24: "Taco",
25: "Jellyfish",
26: "Cake",
27: "Pen",
28: "Cannon",
29: "Bread",
30: "Tree",
31: "Shellfish",
32: "Bed",
33: "Hamster",
34: "Hat",
35: "Toaster",
36: "Sombrero",
37: "Tiara",
38: "Bowl",
39: "Dragonfly",
40: "Moths and butterflies",
41: "Antelope",
42: "Vegetable",
43: "Torch",
44: "Building",
45: "Power plugs and sockets",
46: "Blender",
47: "Billiard table",
48: "Cutting board",
49: "Bronze sculpture",
50: "Turtle",
51: "Broccoli",
52: "Tiger",
53: "Mirror",
54: "Bear",
55: "Zucchini",
56: "Dress",
57: "Volleyball",
58: "Guitar",
59: "Reptile",
60: "Golf cart",
61: "Tart",
62: "Fedora",
63: "Carnivore",
64: "Car",
65: "Lighthouse",
66: "Coffeemaker",
67: "Food processor",
68: "Truck",
69: "Bookcase",
70: "Surfboard",
71: "Footwear",
72: "Bench",
73: "Necklace",
74: "Flower",
75: "Radish",
76: "Marine mammal",
77: "Frying pan",
78: "Tap",
79: "Peach",
80: "Knife",
81: "Handbag",
82: "Laptop",
83: "Tent",
84: "Ambulance",
85: "Christmas tree",
86: "Eagle",
87: "Limousine",
88: "Kitchen & dining room table",
89: "Polar bear",
90: "Tower",
91: "Football",
92: "Willow",
93: "Human head",
94: "Stop sign",
95: "Banana",
96: "Mixer",
97: "Binoculars",
98: "Dessert",
99: "Bee",
100: "Chair",
101: "Wood-burning stove",
102: "Flowerpot",
103: "Beaker",
104: "Oyster",
105: "Woodpecker",
106: "Harp",
107: "Bathtub",
108: "Wall clock",
109: "Sports uniform",
110: "Rhinoceros",
111: "Beehive",
112: "Cupboard",
113: "Chicken",
114: "Man",
115: "Blue jay",
116: "Cucumber",
117: "Balloon",
118: "Kite",
119: "Fireplace",
120: "Lantern",
121: "Missile",
122: "Book",
123: "Spoon",
124: "Grapefruit",
125: "Squirrel",
126: "Orange",
127: "Coat",
128: "Punching bag",
129: "Zebra",
130: "Billboard",
131: "Bicycle",
132: "Door handle",
133: "Mechanical fan",
134: "Ring binder",
135: "Table",
136: "Parrot",
137: "Sock",
138: "Vase",
139: "Weapon",
140: "Shotgun",
141: "Glasses",
142: "Seahorse",
143: "Belt",
144: "Watercraft",
145: "Window",
146: "Giraffe",
147: "Lion",
148: "Tire",
149: "Vehicle",
150: "Canoe",
151: "Tie",
152: "Shelf",
153: "Picture frame",
154: "Printer",
155: "Human leg",
156: "Boat",
157: "Slow cooker",
158: "Croissant",
159: "Candle",
160: "Pancake",
161: "Pillow",
162: "Coin",
163: "Stretcher",
164: "Sandal",
165: "Woman",
166: "Stairs",
167: "Harpsichord",
168: "Stool",
169: "Bus",
170: "Suitcase",
171: "Human mouth",
172: "Juice",
173: "Skull",
174: "Door",
175: "Violin",
176: "Chopsticks",
177: "Digital clock",
178: "Sunflower",
179: "Leopard",
180: "Bell pepper",
181: "Harbor seal",
182: "Snake",
183: "Sewing machine",
184: "Goose",
185: "Helicopter",
186: "Seat belt",
187: "Coffee cup",
188: "Microwave oven",
189: "Hot dog",
190: "Countertop",
191: "Serving tray",
192: "Dog bed",
193: "Beer",
194: "Sunglasses",
195: "Golf ball",
196: "Waffle",
197: "Palm tree",
198: "Trumpet",
199: "Ruler",
200: "Helmet",
201: "Ladder",
202: "Office building",
203: "Tablet computer",
204: "Toilet paper",
205: "Pomegranate",
206: "Skirt",
207: "Gas stove",
208: "Cookie",
209: "Cart",
210: "Raven",
211: "Egg",
212: "Burrito",
213: "Goat",
214: "Kitchen knife",
215: "Skateboard",
216: "Salt and pepper shakers",
217: "Lynx",
218: "Boot",
219: "Platter",
220: "Ski",
221: "Swimwear",
222: "Swimming pool",
223: "Drinking straw",
224: "Wrench",
225: "Drum",
226: "Ant",
227: "Human ear",
228: "Headphones",
229: "Fountain",
230: "Bird",
231: "Jeans",
232: "Television",
233: "Crab",
234: "Microphone",
235: "Home appliance",
236: "Snowplow",
237: "Beetle",
238: "Artichoke",
239: "Jet ski",
240: "Stationary bicycle",
241: "Human hair",
242: "Brown bear",
243: "Starfish",
244: "Fork",
245: "Lobster",
246: "Corded phone",
247: "Drink",
248: "Saucer",
249: "Carrot",
250: "Insect",
251: "Clock",
252: "Castle",
253: "Tennis racket",
254: "Ceiling fan",
255: "Asparagus",
256: "Jaguar",
257: "Musical instrument",
258: "Train",
259: "Cat",
260: "Rifle",
261: "Dumbbell",
262: "Mobile phone",
263: "Taxi",
264: "Shower",
265: "Pitcher",
266: "Lemon",
267: "Invertebrate",
268: "Turkey",
269: "High heels",
270: "Bust",
271: "Elephant",
272: "Scarf",
273: "Barrel",
274: "Trombone",
275: "Pumpkin",
276: "Box",
277: "Tomato",
278: "Frog",
279: "Bidet",
280: "Human face",
281: "Houseplant",
282: "Van",
283: "Shark",
284: "Ice cream",
285: "Swim cap",
286: "Falcon",
287: "Ostrich",
288: "Handgun",
289: "Whiteboard",
290: "Lizard",
291: "Pasta",
292: "Snowmobile",
293: "Light bulb",
294: "Window blind",
295: "Muffin",
296: "Pretzel",
297: "Computer monitor",
298: "Horn",
299: "Furniture",
300: "Sandwich",
301: "Fox",
302: "Convenience store",
303: "Fish",
304: "Fruit",
305: "Earrings",
306: "Curtain",
307: "Grape",
308: "Sofa bed",
309: "Horse",
310: "Luggage and bags",
311: "Desk",
312: "Crutch",
313: "Bicycle helmet",
314: "Tick",
315: "Airplane",
316: "Canary",
317: "Spatula",
318: "Watch",
319: "Lily",
320: "Kitchen appliance",
321: "Filing cabinet",
322: "Aircraft",
323: "Cake stand",
324: "Candy",
325: "Sink",
326: "Mouse",
327: "Wine",
328: "Wheelchair",
329: "Goldfish",
330: "Refrigerator",
331: "French fries",
332: "Drawer",
333: "Treadmill",
334: "Picnic basket",
335: "Dice",
336: "Cabbage",
337: "Football helmet",
338: "Pig",
339: "Person",
340: "Shorts",
341: "Gondola",
342: "Honeycomb",
343: "Doughnut",
344: "Chest of drawers",
345: "Land vehicle",
346: "Bat",
347: "Monkey",
348: "Dagger",
349: "Tableware",
350: "Human foot",
351: "Mug",
352: "Alarm clock",
353: "Pressure cooker",
354: "Human hand",
355: "Tortoise",
356: "Baseball glove",
357: "Sword",
358: "Pear",
359: "Miniskirt",
360: "Traffic sign",
361: "Girl",
362: "Roller skates",
363: "Dinosaur",
364: "Porch",
365: "Human beard",
366: "Submarine sandwich",
367: "Screwdriver",
368: "Strawberry",
369: "Wine glass",
370: "Seafood",
371: "Racket",
372: "Wheel",
373: "Sea lion",
374: "Toy",
375: "Tea",
376: "Tennis ball",
377: "Waste container",
378: "Mule",
379: "Cricket ball",
380: "Pineapple",
381: "Coconut",
382: "Doll",
383: "Coffee table",
384: "Snowman",
385: "Lavender",
386: "Shrimp",
387: "Maple",
388: "Cowboy hat",
389: "Goggles",
390: "Rugby ball",
391: "Caterpillar",
392: "Poster",
393: "Rocket",
394: "Organ",
395: "Saxophone",
396: "Traffic light",
397: "Cocktail",
398: "Plastic bag",
399: "Squash",
400: "Mushroom",
401: "Hamburger",
402: "Light switch",
403: "Parachute",
404: "Teddy bear",
405: "Winter melon",
406: "Deer",
407: "Musical keyboard",
408: "Plumbing fixture",
409: "Scoreboard",
410: "Baseball bat",
411: "Envelope",
412: "Adhesive tape",
413: "Briefcase",
414: "Paddle",
415: "Bow and arrow",
416: "Telephone",
417: "Sheep",
418: "Jacket",
419: "Boy",
420: "Pizza",
421: "Otter",
422: "Office supplies",
423: "Couch",
424: "Cello",
425: "Bull",
426: "Camel",
427: "Ball",
428: "Duck",
429: "Whale",
430: "Shirt",
431: "Tank",
432: "Motorcycle",
433: "Accordion",
434: "Owl",
435: "Porcupine",
436: "Sun hat",
437: "Nail",
438: "Scissors",
439: "Swan",
440: "Lamp",
441: "Crown",
442: "Piano",
443: "Sculpture",
444: "Cheetah",
445: "Oboe",
446: "Tin can",
447: "Mango",
448: "Tripod",
449: "Oven",
450: "Mouse",
451: "Barge",
452: "Coffee",
453: "Snowboard",
454: "Common fig",
455: "Salad",
456: "Marine invertebrates",
457: "Umbrella",
458: "Kangaroo",
459: "Human arm",
460: "Measuring cup",
461: "Snail",
462: "Loveseat",
463: "Suit",
464: "Teapot",
465: "Bottle",
466: "Alpaca",
467: "Kettle",
468: "Trousers",
469: "Popcorn",
470: "Centipede",
471: "Spider",
472: "Sparrow",
473: "Plate",
474: "Bagel",
475: "Personal care",
476: "Apple",
477: "Brassiere",
478: "Bathroom cabinet",
479: "studio couch",
480: "Computer keyboard",
481: "Table tennis racket",
482: "Sushi",
483: "Cabinetry",
484: "Street light",
485: "Towel",
486: "Nightstand",
487: "Rabbit",
488: "Dolphin",
489: "Dog",
490: "Jug",
491: "Wok",
492: "Fire hydrant",
493: "Human eye",
494: "Skyscraper",
495: "Backpack",
496: "Potato",
497: "Paper towel",
498: "Lifejacket",
499: "Bicycle wheel",
500: "Toilet",
}
return clsid2catid, catid2name
def _visdrone_category():
clsid2catid = {i: i for i in range(10)}
catid2name = {
0: 'pedestrian',
1: 'people',
2: 'bicycle',
3: 'car',
4: 'van',
5: 'truck',
6: 'tricycle',
7: 'awning-tricycle',
8: 'bus',
9: 'motor'
}
return clsid2catid, catid2name
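# A usage sketch (illustration only) of get_categories: when no annotation
# file is available, the helper falls back to the built-in label set of the
# metric type, so COCO metrics still resolve all 80 default category names.
def _example_get_categories_fallback():
    clsid2catid, catid2name = get_categories('COCO', anno_file=None)
    assert len(catid2name) == 80
    assert catid2name[1] == 'person'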
================================================
FILE: ppdet/data/source/coco.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import copy
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
import numpy as np
from ppdet.core.workspace import register, serializable
from .dataset import DetDataset
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
__all__ = [
'COCODataSet', 'SlicedCOCODataSet', 'SemiCOCODataSet', 'COCODetDataset', 'COCOInstSegDataset'
]
@register
@serializable
class COCODataSet(DetDataset):
"""
Load dataset with COCO format.
Args:
dataset_dir (str): root directory for dataset.
image_dir (str): directory for images.
anno_path (str): coco annotation file path.
data_fields (list): key name of data dictionary, at least have 'image'.
sample_num (int): number of samples to load, -1 means all.
load_crowd (bool): whether to load crowded ground-truth.
False as default
allow_empty (bool): whether to load empty entry. False as default
        empty_ratio (float): the ratio of empty records to total
            records; if empty_ratio is outside [0., 1.), the empty records
            are not sampled and all of them are used. 1. as default
        repeat (int): repeat times for dataset, used in benchmarks.
"""
def __init__(self,
dataset_dir=None,
image_dir=None,
anno_path=None,
data_fields=['image'],
sample_num=-1,
load_crowd=False,
allow_empty=False,
empty_ratio=1.,
repeat=1):
super(COCODataSet, self).__init__(
dataset_dir,
image_dir,
anno_path,
data_fields,
sample_num,
repeat=repeat)
self.load_image_only = False
self.load_semantic = False
self.load_crowd = load_crowd
self.allow_empty = allow_empty
self.empty_ratio = empty_ratio
def _sample_empty(self, records, num):
        # if empty_ratio is outside [0., 1.), do not sample the records
if self.empty_ratio < 0. or self.empty_ratio >= 1.:
return records
import random
sample_num = min(
int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records))
records = random.sample(records, sample_num)
return records
def parse_dataset(self):
anno_path = os.path.join(self.dataset_dir, self.anno_path)
image_dir = os.path.join(self.dataset_dir, self.image_dir)
assert anno_path.endswith('.json'), \
'invalid coco annotation file: ' + anno_path
from pycocotools.coco import COCO
coco = COCO(anno_path)
img_ids = coco.getImgIds()
img_ids.sort()
cat_ids = coco.getCatIds()
records = []
empty_records = []
ct = 0
self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})
self.cname2cid = dict({
coco.loadCats(catid)[0]['name']: clsid
for catid, clsid in self.catid2clsid.items()
})
if 'annotations' not in coco.dataset:
self.load_image_only = True
            logger.warning('Annotation file: {} does not contain ground truth, '
                           'loading image information only.'.format(anno_path))
for img_id in img_ids:
img_anno = coco.loadImgs([img_id])[0]
im_fname = img_anno['file_name']
im_w = float(img_anno['width'])
im_h = float(img_anno['height'])
im_path = os.path.join(image_dir,
im_fname) if image_dir else im_fname
is_empty = False
if not os.path.exists(im_path):
logger.warning('Illegal image file: {}, and it will be '
'ignored'.format(im_path))
continue
if im_w < 0 or im_h < 0:
logger.warning('Illegal width: {} or height: {} in annotation, '
'and im_id: {} will be ignored'.format(
im_w, im_h, img_id))
continue
coco_rec = {
'im_file': im_path,
'im_id': np.array([img_id]),
'h': im_h,
'w': im_w,
} if 'image' in self.data_fields else {}
if not self.load_image_only:
ins_anno_ids = coco.getAnnIds(
imgIds=[img_id], iscrowd=None if self.load_crowd else False)
instances = coco.loadAnns(ins_anno_ids)
bboxes = []
is_rbox_anno = False
for inst in instances:
# check gt bbox
if inst.get('ignore', False):
continue
if 'bbox' not in inst.keys():
continue
else:
if not any(np.array(inst['bbox'])):
continue
x1, y1, box_w, box_h = inst['bbox']
x2 = x1 + box_w
y2 = y1 + box_h
eps = 1e-5
if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps:
inst['clean_bbox'] = [
round(float(x), 3) for x in [x1, y1, x2, y2]
]
bboxes.append(inst)
else:
logger.warning(
'Found an invalid bbox in annotations: im_id: {}, '
'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(
img_id, float(inst['area']), x1, y1, x2, y2))
num_bbox = len(bboxes)
if num_bbox <= 0 and not self.allow_empty:
continue
elif num_bbox <= 0:
is_empty = True
gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
gt_poly = [None] * num_bbox
gt_track_id = -np.ones((num_bbox, 1), dtype=np.int32)
has_segmentation = False
has_track_id = False
for i, box in enumerate(bboxes):
catid = box['category_id']
gt_class[i][0] = self.catid2clsid[catid]
gt_bbox[i, :] = box['clean_bbox']
is_crowd[i][0] = box['iscrowd']
# check RLE format
if 'segmentation' in box and box['iscrowd'] == 1:
gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
elif 'segmentation' in box and box['segmentation']:
if not np.array(
box['segmentation'],
dtype=object).size > 0 and not self.allow_empty:
                        bboxes.pop(i)
                        gt_poly.pop(i)
                        # np.delete is not in-place; keep the returned arrays
                        is_crowd = np.delete(is_crowd, i, axis=0)
                        gt_class = np.delete(gt_class, i, axis=0)
                        gt_bbox = np.delete(gt_bbox, i, axis=0)
else:
gt_poly[i] = box['segmentation']
has_segmentation = True
if 'track_id' in box:
gt_track_id[i][0] = box['track_id']
has_track_id = True
if has_segmentation and not any(
gt_poly) and not self.allow_empty:
continue
gt_rec = {
'is_crowd': is_crowd,
'gt_class': gt_class,
'gt_bbox': gt_bbox,
'gt_poly': gt_poly,
}
if has_track_id:
gt_rec.update({'gt_track_id': gt_track_id})
for k, v in gt_rec.items():
if k in self.data_fields:
coco_rec[k] = v
# TODO: remove load_semantic
if self.load_semantic and 'semantic' in self.data_fields:
seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps',
'train2017', im_fname[:-3] + 'png')
coco_rec.update({'semantic': seg_path})
logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format(
im_path, img_id, im_h, im_w))
if is_empty:
empty_records.append(coco_rec)
else:
records.append(coco_rec)
ct += 1
if self.sample_num > 0 and ct >= self.sample_num:
break
        assert ct > 0, 'no coco record found in %s' % (anno_path)
logger.info('Load [{} samples valid, {} samples invalid] in file {}.'.
format(ct, len(img_ids) - ct, anno_path))
if self.allow_empty and len(empty_records) > 0:
empty_records = self._sample_empty(empty_records, len(records))
records += empty_records
self.roidbs = records
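# A worked example (illustrative numbers only) of the _sample_empty formula
# used above: with empty_ratio = 0.1 and 900 non-empty records, at most
# int(900 * 0.1 / 0.9) = 100 empty records are kept, so empty images make up
# roughly 10% of the combined 1000 records.
def _example_empty_ratio():
    num_non_empty, empty_ratio = 900, 0.1
    kept = int(num_non_empty * empty_ratio / (1 - empty_ratio))
    assert kept == 100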
@register
@serializable
class SlicedCOCODataSet(COCODataSet):
"""Sliced COCODataSet"""
def __init__(
self,
dataset_dir=None,
image_dir=None,
anno_path=None,
data_fields=['image'],
sample_num=-1,
load_crowd=False,
allow_empty=False,
empty_ratio=1.,
repeat=1,
sliced_size=[640, 640],
overlap_ratio=[0.25, 0.25], ):
super(SlicedCOCODataSet, self).__init__(
dataset_dir=dataset_dir,
image_dir=image_dir,
anno_path=anno_path,
data_fields=data_fields,
sample_num=sample_num,
load_crowd=load_crowd,
allow_empty=allow_empty,
empty_ratio=empty_ratio,
repeat=repeat, )
self.sliced_size = sliced_size
self.overlap_ratio = overlap_ratio
def parse_dataset(self):
anno_path = os.path.join(self.dataset_dir, self.anno_path)
image_dir = os.path.join(self.dataset_dir, self.image_dir)
assert anno_path.endswith('.json'), \
'invalid coco annotation file: ' + anno_path
from pycocotools.coco import COCO
coco = COCO(anno_path)
img_ids = coco.getImgIds()
img_ids.sort()
cat_ids = coco.getCatIds()
records = []
empty_records = []
ct = 0
ct_sub = 0
self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})
self.cname2cid = dict({
coco.loadCats(catid)[0]['name']: clsid
for catid, clsid in self.catid2clsid.items()
})
if 'annotations' not in coco.dataset:
self.load_image_only = True
            logger.warning('Annotation file: {} does not contain ground truth, '
                           'loading image information only.'.format(anno_path))
try:
import sahi
from sahi.slicing import slice_image
except Exception as e:
logger.error(
                'sahi not found, please install sahi. '
'for example: `pip install sahi`, see https://github.com/obss/sahi.'
)
raise e
sub_img_ids = 0
for img_id in img_ids:
img_anno = coco.loadImgs([img_id])[0]
im_fname = img_anno['file_name']
im_w = float(img_anno['width'])
im_h = float(img_anno['height'])
im_path = os.path.join(image_dir,
im_fname) if image_dir else im_fname
is_empty = False
if not os.path.exists(im_path):
logger.warning('Illegal image file: {}, and it will be '
'ignored'.format(im_path))
continue
if im_w < 0 or im_h < 0:
logger.warning('Illegal width: {} or height: {} in annotation, '
'and im_id: {} will be ignored'.format(
im_w, im_h, img_id))
continue
slice_image_result = sahi.slicing.slice_image(
image=im_path,
slice_height=self.sliced_size[0],
slice_width=self.sliced_size[1],
overlap_height_ratio=self.overlap_ratio[0],
overlap_width_ratio=self.overlap_ratio[1])
sub_img_num = len(slice_image_result)
for _ind in range(sub_img_num):
im = slice_image_result.images[_ind]
coco_rec = {
'image': im,
'im_id': np.array([sub_img_ids + _ind]),
'h': im.shape[0],
'w': im.shape[1],
'ori_im_id': np.array([img_id]),
'st_pix': np.array(
slice_image_result.starting_pixels[_ind],
dtype=np.float32),
'is_last': 1 if _ind == sub_img_num - 1 else 0,
} if 'image' in self.data_fields else {}
records.append(coco_rec)
            ct_sub += sub_img_num
            sub_img_ids += sub_img_num  # keep sub-image ids unique across images
ct += 1
if self.sample_num > 0 and ct >= self.sample_num:
break
        assert ct > 0, 'no coco record found in %s' % (anno_path)
        logger.info('{} samples sliced into {} sub-samples in file {}'.format(
            ct, ct_sub, anno_path))
if self.allow_empty and len(empty_records) > 0:
empty_records = self._sample_empty(empty_records, len(records))
records += empty_records
self.roidbs = records
@register
@serializable
class SemiCOCODataSet(COCODataSet):
"""Semi-COCODataSet used for supervised and unsupervised dataSet"""
def __init__(self,
dataset_dir=None,
image_dir=None,
anno_path=None,
data_fields=['image'],
sample_num=-1,
load_crowd=False,
allow_empty=False,
empty_ratio=1.,
repeat=1,
supervised=True):
super(SemiCOCODataSet, self).__init__(
dataset_dir, image_dir, anno_path, data_fields, sample_num,
load_crowd, allow_empty, empty_ratio, repeat)
self.supervised = supervised
        self.length = -1  # default: -1 means all
def parse_dataset(self):
anno_path = os.path.join(self.dataset_dir, self.anno_path)
image_dir = os.path.join(self.dataset_dir, self.image_dir)
assert anno_path.endswith('.json'), \
'invalid coco annotation file: ' + anno_path
from pycocotools.coco import COCO
coco = COCO(anno_path)
img_ids = coco.getImgIds()
img_ids.sort()
cat_ids = coco.getCatIds()
records = []
empty_records = []
ct = 0
self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})
self.cname2cid = dict({
coco.loadCats(catid)[0]['name']: clsid
for catid, clsid in self.catid2clsid.items()
})
        if 'annotations' not in coco.dataset or not self.supervised:
            self.load_image_only = True
            logger.warning('Annotation file: {} does not contain ground truth, '
                           'loading image information only.'.format(anno_path))
for img_id in img_ids:
img_anno = coco.loadImgs([img_id])[0]
im_fname = img_anno['file_name']
im_w = float(img_anno['width'])
im_h = float(img_anno['height'])
im_path = os.path.join(image_dir,
im_fname) if image_dir else im_fname
is_empty = False
if not os.path.exists(im_path):
logger.warning('Illegal image file: {}, and it will be '
'ignored'.format(im_path))
continue
if im_w < 0 or im_h < 0:
logger.warning('Illegal width: {} or height: {} in annotation, '
'and im_id: {} will be ignored'.format(
im_w, im_h, img_id))
continue
coco_rec = {
'im_file': im_path,
'im_id': np.array([img_id]),
'h': im_h,
'w': im_w,
} if 'image' in self.data_fields else {}
if not self.load_image_only:
ins_anno_ids = coco.getAnnIds(
imgIds=[img_id], iscrowd=None if self.load_crowd else False)
instances = coco.loadAnns(ins_anno_ids)
bboxes = []
is_rbox_anno = False
for inst in instances:
# check gt bbox
if inst.get('ignore', False):
continue
if 'bbox' not in inst.keys():
continue
else:
if not any(np.array(inst['bbox'])):
continue
x1, y1, box_w, box_h = inst['bbox']
x2 = x1 + box_w
y2 = y1 + box_h
eps = 1e-5
if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps:
inst['clean_bbox'] = [
round(float(x), 3) for x in [x1, y1, x2, y2]
]
bboxes.append(inst)
else:
logger.warning(
'Found an invalid bbox in annotations: im_id: {}, '
'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(
img_id, float(inst['area']), x1, y1, x2, y2))
num_bbox = len(bboxes)
if num_bbox <= 0 and not self.allow_empty:
continue
elif num_bbox <= 0:
is_empty = True
gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
gt_poly = [None] * num_bbox
has_segmentation = False
for i, box in enumerate(bboxes):
catid = box['category_id']
gt_class[i][0] = self.catid2clsid[catid]
gt_bbox[i, :] = box['clean_bbox']
is_crowd[i][0] = box['iscrowd']
# check RLE format
if 'segmentation' in box and box['iscrowd'] == 1:
gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
elif 'segmentation' in box and box['segmentation']:
if not np.array(box['segmentation']
).size > 0 and not self.allow_empty:
                        bboxes.pop(i)
                        gt_poly.pop(i)
                        # np.delete is not in-place; keep the returned arrays
                        is_crowd = np.delete(is_crowd, i, axis=0)
                        gt_class = np.delete(gt_class, i, axis=0)
                        gt_bbox = np.delete(gt_bbox, i, axis=0)
else:
gt_poly[i] = box['segmentation']
has_segmentation = True
if has_segmentation and not any(
gt_poly) and not self.allow_empty:
continue
gt_rec = {
'is_crowd': is_crowd,
'gt_class': gt_class,
'gt_bbox': gt_bbox,
'gt_poly': gt_poly,
}
for k, v in gt_rec.items():
if k in self.data_fields:
coco_rec[k] = v
# TODO: remove load_semantic
if self.load_semantic and 'semantic' in self.data_fields:
seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps',
'train2017', im_fname[:-3] + 'png')
coco_rec.update({'semantic': seg_path})
logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format(
im_path, img_id, im_h, im_w))
if is_empty:
empty_records.append(coco_rec)
else:
records.append(coco_rec)
ct += 1
if self.sample_num > 0 and ct >= self.sample_num:
break
        assert ct > 0, 'no coco record found in %s' % (anno_path)
logger.info('Load [{} samples valid, {} samples invalid] in file {}.'.
format(ct, len(img_ids) - ct, anno_path))
if self.allow_empty and len(empty_records) > 0:
empty_records = self._sample_empty(empty_records, len(records))
records += empty_records
self.roidbs = records
if self.supervised:
logger.info(f'Use {len(self.roidbs)} sup_samples data as LABELED')
else:
            if self.length > 0:  # unsup length will be decided by sup length
all_roidbs = self.roidbs.copy()
selected_idxs = [
np.random.choice(len(all_roidbs))
for _ in range(self.length)
]
self.roidbs = [all_roidbs[i] for i in selected_idxs]
logger.info(
f'Use {len(self.roidbs)} unsup_samples data as UNLABELED')
def __getitem__(self, idx):
n = len(self.roidbs)
if self.repeat > 1:
idx %= n
# data batch
roidb = copy.deepcopy(self.roidbs[idx])
if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch:
idx = np.random.randint(n)
roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch:
idx = np.random.randint(n)
roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch:
roidb = [roidb, ] + [
copy.deepcopy(self.roidbs[np.random.randint(n)])
for _ in range(4)
]
if isinstance(roidb, Sequence):
for r in roidb:
r['curr_iter'] = self._curr_iter
else:
roidb['curr_iter'] = self._curr_iter
self._curr_iter += 1
return self.transform(roidb)
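# A sketch (illustration only) of the epoch gates used in __getitem__ above:
# an augmentation is active while the current epoch is below its configured
# cut-off epoch; 0 keeps it on for the whole run, and the default of -1
# (set in DetDataset.set_kwargs) disables it entirely.
def _example_epoch_gate():
    def active(epoch, gate_epoch):
        return gate_epoch == 0 or epoch < gate_epoch

    assert active(3, 10)        # mixup still on at epoch 3 with a gate of 10
    assert not active(12, 10)   # switched off once the gate is passed
    assert not active(3, -1)    # -1 means the augmentation is never enabled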
# for PaddleX
@register
@serializable
class COCODetDataset(COCODataSet):
pass
# for PaddleX
@register
@serializable
class COCOInstSegDataset(COCODataSet):
pass
================================================
FILE: ppdet/data/source/culane.py
================================================
from ppdet.core.workspace import register, serializable
import cv2
import os
import tarfile
import numpy as np
import os.path as osp
from ppdet.data.source.dataset import DetDataset
from imgaug.augmentables.lines import LineStringsOnImage
from imgaug.augmentables.segmaps import SegmentationMapsOnImage
from ppdet.data.culane_utils import lane_to_linestrings
import pickle as pkl
from ppdet.utils.logger import setup_logger
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
from .dataset import DetDataset, _make_dataset, _is_valid_file
from ppdet.utils.download import download_dataset
logger = setup_logger(__name__)
@register
@serializable
class CULaneDataSet(DetDataset):
def __init__(
self,
dataset_dir,
cut_height,
list_path,
split='train',
data_fields=['image'],
video_file=None,
frame_rate=-1, ):
super(CULaneDataSet, self).__init__(
dataset_dir=dataset_dir,
cut_height=cut_height,
split=split,
data_fields=data_fields)
self.dataset_dir = dataset_dir
self.list_path = osp.join(dataset_dir, list_path)
self.cut_height = cut_height
self.data_fields = data_fields
self.split = split
self.training = 'train' in split
self.data_infos = []
self.video_file = video_file
self.frame_rate = frame_rate
self._imid2path = {}
self.predict_dir = None
def __len__(self):
return len(self.data_infos)
def check_or_download_dataset(self):
if not osp.exists(self.dataset_dir):
download_dataset("dataset", dataset="culane")
# extract .tar files in self.dataset_dir
        for fname in os.listdir(self.dataset_dir):
            # ignore hidden files
            if fname.startswith('.'):
                continue
            if fname.find('.tar.gz') >= 0:
                logger.info("Decompressing {}...".format(fname))
                with tarfile.open(osp.join(self.dataset_dir, fname)) as tf:
                    tf.extractall(path=self.dataset_dir)
logger.info("Dataset files are ready.")
def parse_dataset(self):
logger.info('Loading CULane annotations...')
if self.predict_dir is not None:
logger.info('switch to predict mode')
return
# Waiting for the dataset to load is tedious, let's cache it
os.makedirs('cache', exist_ok=True)
cache_path = 'cache/culane_paddle_{}.pkl'.format(self.split)
if os.path.exists(cache_path):
with open(cache_path, 'rb') as cache_file:
self.data_infos = pkl.load(cache_file)
self.max_lanes = max(
len(anno['lanes']) for anno in self.data_infos)
return
with open(self.list_path) as list_file:
for line in list_file:
infos = self.load_annotation(line.split())
self.data_infos.append(infos)
# cache data infos to file
with open(cache_path, 'wb') as cache_file:
pkl.dump(self.data_infos, cache_file)
def load_annotation(self, line):
infos = {}
img_line = line[0]
        if img_line.startswith('/'):
            img_line = img_line[1:]
img_path = os.path.join(self.dataset_dir, img_line)
infos['img_name'] = img_line
infos['img_path'] = img_path
if len(line) > 1:
mask_line = line[1]
            if mask_line.startswith('/'):
                mask_line = mask_line[1:]
mask_path = os.path.join(self.dataset_dir, mask_line)
infos['mask_path'] = mask_path
if len(line) > 2:
exist_list = [int(l) for l in line[2:]]
infos['lane_exist'] = np.array(exist_list)
        anno_path = img_path[:-3] + 'lines.txt'  # replace the 'jpg' suffix with 'lines.txt'
with open(anno_path, 'r') as anno_file:
data = [
list(map(float, line.split())) for line in anno_file.readlines()
]
lanes = [[(lane[i], lane[i + 1]) for i in range(0, len(lane), 2)
if lane[i] >= 0 and lane[i + 1] >= 0] for lane in data]
        lanes = [list(set(lane)) for lane in lanes]  # remove duplicated points
        lanes = [lane for lane in lanes
                 if len(lane) > 2]  # keep lanes with more than 2 points
lanes = [sorted(
lane, key=lambda x: x[1]) for lane in lanes] # sort by y
infos['lanes'] = lanes
return infos
def set_images(self, images):
self.predict_dir = images
self.data_infos = self._load_images()
def _find_images(self):
predict_dir = self.predict_dir
if not isinstance(predict_dir, Sequence):
predict_dir = [predict_dir]
images = []
for im_dir in predict_dir:
if os.path.isdir(im_dir):
im_dir = os.path.join(self.predict_dir, im_dir)
images.extend(_make_dataset(im_dir))
elif os.path.isfile(im_dir) and _is_valid_file(im_dir):
images.append(im_dir)
return images
def _load_images(self):
images = self._find_images()
ct = 0
records = []
for image in images:
assert image != '' and os.path.isfile(image), \
"Image {} not found".format(image)
if self.sample_num > 0 and ct >= self.sample_num:
break
rec = {
'im_id': np.array([ct]),
"img_path": os.path.abspath(image),
"img_name": os.path.basename(image),
"lanes": []
}
self._imid2path[ct] = image
ct += 1
records.append(rec)
assert len(records) > 0, "No image file found"
return records
def get_imid2path(self):
return self._imid2path
def __getitem__(self, idx):
data_info = self.data_infos[idx]
img = cv2.imread(data_info['img_path'])
img = img[self.cut_height:, :, :]
sample = data_info.copy()
sample.update({'image': img})
img_org = sample['image']
if self.training:
label = cv2.imread(sample['mask_path'], cv2.IMREAD_UNCHANGED)
if len(label.shape) > 2:
label = label[:, :, 0]
label = label.squeeze()
label = label[self.cut_height:, :]
sample.update({'mask': label})
if self.cut_height != 0:
new_lanes = []
for i in sample['lanes']:
lanes = []
for p in i:
lanes.append((p[0], p[1] - self.cut_height))
new_lanes.append(lanes)
sample.update({'lanes': new_lanes})
sample['mask'] = SegmentationMapsOnImage(
sample['mask'], shape=img_org.shape)
sample['full_img_path'] = data_info['img_path']
sample['img_name'] = data_info['img_name']
sample['im_id'] = np.array([idx])
sample['image'] = sample['image'].copy().astype(np.uint8)
sample['lanes'] = lane_to_linestrings(sample['lanes'])
sample['lanes'] = LineStringsOnImage(
sample['lanes'], shape=img_org.shape)
sample['seg'] = np.zeros(img_org.shape)
return sample
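# A standalone sketch (illustration only) of the `.lines.txt` parsing done in
# load_annotation above: each line holds alternating x/y coordinates; points
# with negative coordinates are dropped, duplicates removed, and the
# remaining points sorted by their y coordinate.
def _example_parse_lane_line():
    raw = '10.0 500.0 20.0 480.0 -2.0 -2.0 30.0 460.0 40.0 440.0'
    vals = list(map(float, raw.split()))
    lane = [(vals[i], vals[i + 1]) for i in range(0, len(vals), 2)
            if vals[i] >= 0 and vals[i + 1] >= 0]
    lane = sorted(set(lane), key=lambda p: p[1])
    assert lane == [(40.0, 440.0), (30.0, 460.0), (20.0, 480.0), (10.0, 500.0)]
    return lane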
================================================
FILE: ppdet/data/source/dataset.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import copy
import numpy as np
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
from pycocotools.coco import COCO
from paddle.io import Dataset
from ppdet.core.workspace import register, serializable
from ppdet.utils.download import get_dataset_path
from ppdet.data import source
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
@serializable
class DetDataset(Dataset):
"""
Load detection dataset.
Args:
dataset_dir (str): root directory for dataset.
image_dir (str): directory for images.
anno_path (str): annotation file path.
data_fields (list): key name of data dictionary, at least have 'image'.
sample_num (int): number of samples to load, -1 means all.
use_default_label (bool): whether to load default label list.
        repeat (int): repeat times for dataset, used in benchmarks.
"""
def __init__(self,
dataset_dir=None,
image_dir=None,
anno_path=None,
data_fields=['image'],
sample_num=-1,
use_default_label=None,
repeat=1,
**kwargs):
super(DetDataset, self).__init__()
self.dataset_dir = dataset_dir if dataset_dir is not None else ''
self.anno_path = anno_path
self.image_dir = image_dir if image_dir is not None else ''
self.data_fields = data_fields
self.sample_num = sample_num
self.use_default_label = use_default_label
self.repeat = repeat
self._epoch = 0
self._curr_iter = 0
def __len__(self, ):
return len(self.roidbs) * self.repeat
def __call__(self, *args, **kwargs):
return self
def __getitem__(self, idx):
n = len(self.roidbs)
if self.repeat > 1:
idx %= n
# data batch
roidb = copy.deepcopy(self.roidbs[idx])
if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch:
idx = np.random.randint(n)
roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch:
idx = np.random.randint(n)
roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch:
roidb = [roidb, ] + [
copy.deepcopy(self.roidbs[np.random.randint(n)])
for _ in range(4)
]
elif self.pre_img_epoch == 0 or self._epoch < self.pre_img_epoch:
# Add previous image as input, only used in CenterTrack
idx_pre_img = idx - 1
if idx_pre_img < 0:
idx_pre_img = idx + 1
roidb = [roidb, ] + [copy.deepcopy(self.roidbs[idx_pre_img])]
if isinstance(roidb, Sequence):
for r in roidb:
r['curr_iter'] = self._curr_iter
r['curr_epoch'] = self._epoch
else:
roidb['curr_iter'] = self._curr_iter
roidb['curr_epoch'] = self._epoch
self._curr_iter += 1
if self.transform_schedulers:
assert isinstance(self.transform_schedulers, list)
if isinstance(roidb, Sequence):
for r in roidb:
r['transform_schedulers'] = self.transform_schedulers
else:
roidb['transform_schedulers'] = self.transform_schedulers
return self.transform(roidb)
def check_or_download_dataset(self):
self.dataset_dir = get_dataset_path(self.dataset_dir, self.anno_path,
self.image_dir)
def set_kwargs(self, **kwargs):
self.mixup_epoch = kwargs.get('mixup_epoch', -1)
self.cutmix_epoch = kwargs.get('cutmix_epoch', -1)
self.mosaic_epoch = kwargs.get('mosaic_epoch', -1)
self.pre_img_epoch = kwargs.get('pre_img_epoch', -1)
self.transform_schedulers = kwargs.get('transform_schedulers', None)
def set_transform(self, transform):
self.transform = transform
def set_epoch(self, epoch_id):
self._epoch = epoch_id
def parse_dataset(self, ):
raise NotImplementedError(
"Need to implement parse_dataset method of Dataset")
def get_anno(self):
if self.anno_path is None:
return
return os.path.join(self.dataset_dir, self.anno_path)
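# Editor's note (illustrative sketch, not part of the original module): how the
# reader typically drives a DetDataset subclass, and how the epoch-gated
# augmentation flags in __getitem__ behave. The set_kwargs() values come from
# the reader config: -1 disables an augmentation (the epoch is never < -1),
# 0 keeps it on for the whole run, and a positive N applies it only while the
# current epoch is < N. The subclass choice and paths below are hypothetical.
def _example_detdataset_lifecycle():
    dataset = source.COCODataSet(  # any registered DetDataset subclass
        dataset_dir='dataset/coco',
        image_dir='train2017',
        anno_path='annotations/instances_train2017.json')
    dataset.check_or_download_dataset()  # resolve (or download) dataset_dir
    dataset.parse_dataset()              # fills dataset.roidbs
    dataset.set_kwargs(mixup_epoch=5)    # mixup only while epoch < 5
    dataset.set_transform(lambda r: r)   # normally a composed operator pipeline
    dataset.set_epoch(0)
    return dataset[0]  # a [roidb, roidb] pair here, since mixup is active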
def _is_valid_file(f, extensions=('.jpg', '.jpeg', '.png', '.bmp')):
return f.lower().endswith(extensions)
def _make_dataset(dir):
dir = os.path.expanduser(dir)
if not os.path.isdir(dir):
        raise ValueError('{} should be a dir'.format(dir))
images = []
for root, _, fnames in sorted(os.walk(dir, followlinks=True)):
for fname in sorted(fnames):
path = os.path.join(root, fname)
if _is_valid_file(path):
images.append(path)
return images
@register
@serializable
class ImageFolder(DetDataset):
def __init__(self,
dataset_dir=None,
image_dir=None,
anno_path=None,
sample_num=-1,
use_default_label=None,
**kwargs):
super(ImageFolder, self).__init__(
dataset_dir,
image_dir,
anno_path,
sample_num=sample_num,
use_default_label=use_default_label)
self._imid2path = {}
self.roidbs = None
self.sample_num = sample_num
def check_or_download_dataset(self):
return
def get_anno(self):
if self.anno_path is None:
return
if self.dataset_dir:
return os.path.join(self.dataset_dir, self.anno_path)
else:
return self.anno_path
def parse_dataset(self, ):
if not self.roidbs:
self.roidbs = self._load_images()
def _parse(self):
image_dir = self.image_dir
if not isinstance(image_dir, Sequence):
image_dir = [image_dir]
images = []
for im_dir in image_dir:
if os.path.isdir(im_dir):
im_dir = os.path.join(self.dataset_dir, im_dir)
images.extend(_make_dataset(im_dir))
elif os.path.isfile(im_dir) and _is_valid_file(im_dir):
images.append(im_dir)
return images
def get_images(self):
images_path = []
coco = COCO(os.path.join(self.dataset_dir, self.anno_path))
imgIds = coco.getImgIds(catIds=[])
for imgId in imgIds:
filename = coco.loadImgs(imgId)[0]["file_name"]
images_path.append(os.path.join(self.dataset_dir, self.image_dir, filename))
return images_path
def _load_images(self, do_eval=False):
images = self._parse()
ct = 0
records = []
anno_file = self.get_anno()
coco = COCO(anno_file)
for image in images:
assert image != '' and os.path.isfile(image), \
"Image {} not found".format(image)
if self.sample_num > 0 and ct >= self.sample_num:
break
if do_eval:
image_id = self.get_image_id(image, coco)
ct = image_id
rec = {'im_id': np.array([ct]), 'im_file': image}
self._imid2path[ct] = image
ct += 1
records.append(rec)
assert len(records) > 0, "No image file found"
return records
def get_image_id(self, image, coco):
image_ids = coco.getImgIds()
for image_id in image_ids:
img_info = coco.loadImgs(image_id)[0]
if img_info['file_name'] in image:
return image_id
def get_imid2path(self):
return self._imid2path
def set_images(self, images, do_eval=False):
self.image_dir = images
self.roidbs = self._load_images(do_eval=do_eval)
def set_slice_images(self,
images,
slice_size=[640, 640],
overlap_ratio=[0.25, 0.25]):
self.image_dir = images
ori_records = self._load_images()
try:
import sahi
from sahi.slicing import slice_image
except Exception as e:
logger.error(
                'sahi not found, please install sahi. '
'for example: `pip install sahi`, see https://github.com/obss/sahi.'
)
raise e
sub_img_ids = 0
ct = 0
ct_sub = 0
records = []
for i, ori_rec in enumerate(ori_records):
im_path = ori_rec['im_file']
slice_image_result = sahi.slicing.slice_image(
image=im_path,
slice_height=slice_size[0],
slice_width=slice_size[1],
overlap_height_ratio=overlap_ratio[0],
overlap_width_ratio=overlap_ratio[1])
sub_img_num = len(slice_image_result)
for _ind in range(sub_img_num):
im = slice_image_result.images[_ind]
rec = {
'image': im,
'im_id': np.array([sub_img_ids + _ind]),
'h': im.shape[0],
'w': im.shape[1],
'ori_im_id': np.array([ori_rec['im_id'][0]]),
'st_pix': np.array(
slice_image_result.starting_pixels[_ind],
dtype=np.float32),
'is_last': 1 if _ind == sub_img_num - 1 else 0,
} if 'image' in self.data_fields else {}
records.append(rec)
ct_sub += sub_img_num
ct += 1
        logger.info('{} samples sliced into {} sub-samples.'.format(ct, ct_sub))
self.roidbs = records
def get_label_list(self):
        # Only the VOC dataset needs a label list in ImageFolder
return self.anno_path
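# Editor's note (illustrative sketch): ImageFolder is the annotation-free
# dataset used at inference time. set_images() accepts directories, single
# image files, or a list of either; set_slice_images() additionally tiles each
# image with sahi for sliced inference. The demo path is hypothetical.
def _example_imagefolder_usage():
    folder = ImageFolder()
    folder.set_images(['demo/000000014439.jpg'])  # one roidb record per image
    print(folder.get_imid2path())                 # {0: 'demo/000000014439.jpg'}
    # sliced inference over 640x640 tiles (requires `pip install sahi`):
    folder.set_slice_images(
        ['demo/000000014439.jpg'],
        slice_size=[640, 640],
        overlap_ratio=[0.25, 0.25])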
@register
class CommonDataset(object):
def __init__(self, **dataset_args):
super(CommonDataset, self).__init__()
dataset_args = copy.deepcopy(dataset_args)
type = dataset_args.pop("name")
self.dataset = getattr(source, type)(**dataset_args)
def __call__(self):
return self.dataset
@register
class TrainDataset(CommonDataset):
pass
@register
class EvalMOTDataset(CommonDataset):
pass
@register
class TestMOTDataset(CommonDataset):
pass
@register
class EvalDataset(CommonDataset):
pass
@register
class TestDataset(CommonDataset):
pass
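# Editor's note (illustrative sketch): CommonDataset and its aliases exist so a
# YAML config can select a dataset class by name; `name` is popped and the
# remaining keys become constructor kwargs for the class looked up in
# ppdet.data.source. A hypothetical YAML equivalent of the call below:
#
#   TrainDataset:
#     name: COCODataSet
#     dataset_dir: dataset/coco
#     image_dir: train2017
#     anno_path: annotations/instances_train2017.json
def _example_commondataset_dispatch():
    train = TrainDataset(
        name='COCODataSet',
        dataset_dir='dataset/coco',
        image_dir='train2017',
        anno_path='annotations/instances_train2017.json')
    return train()  # __call__ returns the wrapped COCODataSet instance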
================================================
FILE: ppdet/data/source/keypoint_coco.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
this code is based on https://github.com/open-mmlab/mmpose
"""
import os
import cv2
import numpy as np
import json
import copy
import pycocotools
from pycocotools.coco import COCO
from .dataset import DetDataset
from ppdet.core.workspace import register, serializable
@serializable
class KeypointBottomUpBaseDataset(DetDataset):
"""Base class for bottom-up datasets.
All datasets should subclass it.
All subclasses should overwrite:
Methods:`_get_imganno`
Args:
dataset_dir (str): Root path to the dataset.
anno_path (str): Relative path to the annotation file.
image_dir (str): Path to a directory where images are held.
Default: None.
num_joints (int): keypoint numbers
transform (composed(operators)): A sequence of data transforms.
shard (list): [rank, worldsize], the distributed env params
test_mode (bool): Store True when building test or
validation dataset. Default: False.
"""
def __init__(self,
dataset_dir,
image_dir,
anno_path,
num_joints,
transform=[],
shard=[0, 1],
test_mode=False):
super().__init__(dataset_dir, image_dir, anno_path)
self.image_info = {}
self.ann_info = {}
self.img_prefix = os.path.join(dataset_dir, image_dir)
self.transform = transform
self.test_mode = test_mode
self.ann_info['num_joints'] = num_joints
self.img_ids = []
def parse_dataset(self):
pass
def __len__(self):
"""Get dataset length."""
return len(self.img_ids)
def _get_imganno(self, idx):
"""Get anno for a single image."""
raise NotImplementedError
def __getitem__(self, idx):
"""Prepare image for training given the index."""
records = copy.deepcopy(self._get_imganno(idx))
records['image'] = cv2.imread(records['image_file'])
records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB)
if 'mask' in records:
records['mask'] = (records['mask'] + 0).astype('uint8')
records = self.transform(records)
return records
@register
@serializable
class KeypointBottomUpCocoDataset(KeypointBottomUpBaseDataset):
"""COCO dataset for bottom-up pose estimation.
    The dataset loads raw features and applies the specified transforms
to return a dict containing the image tensors and other information.
COCO keypoint indexes::
0: 'nose',
1: 'left_eye',
2: 'right_eye',
3: 'left_ear',
4: 'right_ear',
5: 'left_shoulder',
6: 'right_shoulder',
7: 'left_elbow',
8: 'right_elbow',
9: 'left_wrist',
10: 'right_wrist',
11: 'left_hip',
12: 'right_hip',
13: 'left_knee',
14: 'right_knee',
15: 'left_ankle',
16: 'right_ankle'
Args:
dataset_dir (str): Root path to the dataset.
anno_path (str): Relative path to the annotation file.
image_dir (str): Path to a directory where images are held.
Default: None.
num_joints (int): keypoint numbers
transform (composed(operators)): A sequence of data transforms.
shard (list): [rank, worldsize], the distributed env params
test_mode (bool): Store True when building test or
validation dataset. Default: False.
"""
def __init__(self,
dataset_dir,
image_dir,
anno_path,
num_joints,
transform=[],
shard=[0, 1],
test_mode=False,
return_mask=True,
return_bbox=True,
return_area=True,
return_class=True):
super().__init__(dataset_dir, image_dir, anno_path, num_joints,
transform, shard, test_mode)
self.ann_file = os.path.join(dataset_dir, anno_path)
self.shard = shard
self.test_mode = test_mode
self.return_mask = return_mask
self.return_bbox = return_bbox
self.return_area = return_area
self.return_class = return_class
def parse_dataset(self):
self.coco = COCO(self.ann_file)
self.img_ids = self.coco.getImgIds()
if not self.test_mode:
self.img_ids_tmp = []
for img_id in self.img_ids:
ann_ids = self.coco.getAnnIds(imgIds=img_id)
anno = self.coco.loadAnns(ann_ids)
anno = [obj for obj in anno if obj['iscrowd'] == 0]
if len(anno) == 0:
continue
self.img_ids_tmp.append(img_id)
self.img_ids = self.img_ids_tmp
blocknum = int(len(self.img_ids) / self.shard[1])
self.img_ids = self.img_ids[(blocknum * self.shard[0]):(blocknum * (
self.shard[0] + 1))]
self.num_images = len(self.img_ids)
self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs)
self.dataset_name = 'coco'
cat_ids = self.coco.getCatIds()
self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})
print('=> num_images: {}'.format(self.num_images))
@staticmethod
def _get_mapping_id_name(imgs):
"""
Args:
imgs (dict): dict of image info.
Returns:
tuple: Image name & id mapping dicts.
- id2name (dict): Mapping image id to name.
- name2id (dict): Mapping image name to id.
"""
id2name = {}
name2id = {}
for image_id, image in imgs.items():
file_name = image['file_name']
id2name[image_id] = file_name
name2id[file_name] = image_id
return id2name, name2id
def _get_imganno(self, idx):
"""Get anno for a single image.
Args:
idx (int): image idx
Returns:
dict: info for model training
"""
coco = self.coco
img_id = self.img_ids[idx]
ann_ids = coco.getAnnIds(imgIds=img_id)
anno = coco.loadAnns(ann_ids)
anno = [
obj for obj in anno
if obj['iscrowd'] == 0 and obj['num_keypoints'] > 0
]
db_rec = {}
joints, orgsize = self._get_joints(anno, idx)
db_rec['gt_joints'] = joints
db_rec['im_shape'] = orgsize
if self.return_bbox:
db_rec['gt_bbox'] = self._get_bboxs(anno, idx)
if self.return_class:
db_rec['gt_class'] = self._get_labels(anno, idx)
if self.return_area:
db_rec['gt_areas'] = self._get_areas(anno, idx)
if self.return_mask:
db_rec['mask'] = self._get_mask(anno, idx)
db_rec['im_id'] = img_id
db_rec['image_file'] = os.path.join(self.img_prefix,
self.id2name[img_id])
return db_rec
def _get_joints(self, anno, idx):
"""Get joints for all people in an image."""
num_people = len(anno)
joints = np.zeros(
(num_people, self.ann_info['num_joints'], 3), dtype=np.float32)
for i, obj in enumerate(anno):
joints[i, :self.ann_info['num_joints'], :3] = \
np.array(obj['keypoints']).reshape([-1, 3])
img_info = self.coco.loadImgs(self.img_ids[idx])[0]
orgsize = np.array([img_info['height'], img_info['width'], 1])
return joints, orgsize
def _get_bboxs(self, anno, idx):
num_people = len(anno)
gt_bboxes = np.zeros((num_people, 4), dtype=np.float32)
for idx, obj in enumerate(anno):
if 'bbox' in obj:
gt_bboxes[idx, :] = obj['bbox']
gt_bboxes[:, 2] += gt_bboxes[:, 0]
gt_bboxes[:, 3] += gt_bboxes[:, 1]
return gt_bboxes
def _get_labels(self, anno, idx):
num_people = len(anno)
gt_labels = np.zeros((num_people, 1), dtype=np.float32)
for idx, obj in enumerate(anno):
if 'category_id' in obj:
catid = obj['category_id']
gt_labels[idx, 0] = self.catid2clsid[catid]
return gt_labels
def _get_areas(self, anno, idx):
num_people = len(anno)
gt_areas = np.zeros((num_people, ), dtype=np.float32)
for idx, obj in enumerate(anno):
if 'area' in obj:
gt_areas[idx, ] = obj['area']
return gt_areas
def _get_mask(self, anno, idx):
"""Get ignore masks to mask out losses."""
coco = self.coco
img_info = coco.loadImgs(self.img_ids[idx])[0]
m = np.zeros((img_info['height'], img_info['width']), dtype=np.float32)
for obj in anno:
if 'segmentation' in obj:
if obj['iscrowd']:
rle = pycocotools.mask.frPyObjects(obj['segmentation'],
img_info['height'],
img_info['width'])
m += pycocotools.mask.decode(rle)
elif obj['num_keypoints'] == 0:
rles = pycocotools.mask.frPyObjects(obj['segmentation'],
img_info['height'],
img_info['width'])
for rle in rles:
m += pycocotools.mask.decode(rle)
return m < 0.5
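# Editor's note: _get_mask() above returns a boolean "valid pixel" map: True
# where the loss applies, False over crowd regions and annotated people without
# keypoints (m accumulates their decoded masks, so `m < 0.5` holds only where
# nothing was masked). A minimal standalone sketch of the same RLE round trip,
# using a hypothetical square polygon:
def _example_ignore_mask():
    h, w = 4, 4
    m = np.zeros((h, w), dtype=np.float32)
    rles = pycocotools.mask.frPyObjects(
        [[0., 0., 2., 0., 2., 2., 0., 2.]], h, w)  # one 2x2 polygon
    for rle in rles:
        m += pycocotools.mask.decode(rle)
    return m < 0.5  # False inside the square, True elsewhere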
@register
@serializable
class KeypointBottomUpCrowdPoseDataset(KeypointBottomUpCocoDataset):
"""CrowdPose dataset for bottom-up pose estimation.
    The dataset loads raw features and applies the specified transforms
to return a dict containing the image tensors and other information.
CrowdPose keypoint indexes::
0: 'left_shoulder',
1: 'right_shoulder',
2: 'left_elbow',
3: 'right_elbow',
4: 'left_wrist',
5: 'right_wrist',
6: 'left_hip',
7: 'right_hip',
8: 'left_knee',
9: 'right_knee',
10: 'left_ankle',
11: 'right_ankle',
12: 'top_head',
13: 'neck'
Args:
dataset_dir (str): Root path to the dataset.
anno_path (str): Relative path to the annotation file.
image_dir (str): Path to a directory where images are held.
Default: None.
num_joints (int): keypoint numbers
transform (composed(operators)): A sequence of data transforms.
shard (list): [rank, worldsize], the distributed env params
test_mode (bool): Store True when building test or
validation dataset. Default: False.
"""
def __init__(self,
dataset_dir,
image_dir,
anno_path,
num_joints,
transform=[],
shard=[0, 1],
test_mode=False):
super().__init__(dataset_dir, image_dir, anno_path, num_joints,
transform, shard, test_mode)
self.ann_file = os.path.join(dataset_dir, anno_path)
self.shard = shard
self.test_mode = test_mode
def parse_dataset(self):
self.coco = COCO(self.ann_file)
self.img_ids = self.coco.getImgIds()
if not self.test_mode:
self.img_ids = [
img_id for img_id in self.img_ids
if len(self.coco.getAnnIds(
imgIds=img_id, iscrowd=None)) > 0
]
blocknum = int(len(self.img_ids) / self.shard[1])
self.img_ids = self.img_ids[(blocknum * self.shard[0]):(blocknum * (
self.shard[0] + 1))]
self.num_images = len(self.img_ids)
self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs)
self.dataset_name = 'crowdpose'
print('=> num_images: {}'.format(self.num_images))
@serializable
class KeypointTopDownBaseDataset(DetDataset):
"""Base class for top_down datasets.
All datasets should subclass it.
All subclasses should overwrite:
Methods:`_get_db`
Args:
dataset_dir (str): Root path to the dataset.
image_dir (str): Path to a directory where images are held.
anno_path (str): Relative path to the annotation file.
num_joints (int): keypoint numbers
transform (composed(operators)): A sequence of data transforms.
"""
def __init__(self,
dataset_dir,
image_dir,
anno_path,
num_joints,
transform=[]):
super().__init__(dataset_dir, image_dir, anno_path)
self.image_info = {}
self.ann_info = {}
self.img_prefix = os.path.join(dataset_dir, image_dir)
self.transform = transform
self.ann_info['num_joints'] = num_joints
self.db = []
def __len__(self):
"""Get dataset length."""
return len(self.db)
def _get_db(self):
"""Get a sample"""
raise NotImplementedError
def __getitem__(self, idx):
"""Prepare sample for training given the index."""
records = copy.deepcopy(self.db[idx])
records['image'] = cv2.imread(records['image_file'], cv2.IMREAD_COLOR |
cv2.IMREAD_IGNORE_ORIENTATION)
records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB)
records['score'] = records['score'] if 'score' in records else 1
records = self.transform(records)
# print('records', records)
return records
@register
@serializable
class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset):
"""COCO dataset for top-down pose estimation.
    The dataset loads raw features and applies the specified transforms
to return a dict containing the image tensors and other information.
COCO keypoint indexes:
0: 'nose',
1: 'left_eye',
2: 'right_eye',
3: 'left_ear',
4: 'right_ear',
5: 'left_shoulder',
6: 'right_shoulder',
7: 'left_elbow',
8: 'right_elbow',
9: 'left_wrist',
10: 'right_wrist',
11: 'left_hip',
12: 'right_hip',
13: 'left_knee',
14: 'right_knee',
15: 'left_ankle',
16: 'right_ankle'
Args:
dataset_dir (str): Root path to the dataset.
image_dir (str): Path to a directory where images are held.
anno_path (str): Relative path to the annotation file.
num_joints (int): Keypoint numbers
trainsize (list):[w, h] Image target size
transform (composed(operators)): A sequence of data transforms.
bbox_file (str): Path to a detection bbox file
Default: None.
use_gt_bbox (bool): Whether to use ground truth bbox
Default: True.
pixel_std (int): The pixel std of the scale
Default: 200.
image_thre (float): The threshold to filter the detection box
Default: 0.0.
"""
def __init__(self,
dataset_dir,
image_dir,
anno_path,
num_joints,
trainsize,
transform=[],
bbox_file=None,
use_gt_bbox=True,
pixel_std=200,
image_thre=0.0,
center_scale=None):
super().__init__(dataset_dir, image_dir, anno_path, num_joints,
transform)
self.bbox_file = bbox_file
self.use_gt_bbox = use_gt_bbox
self.trainsize = trainsize
self.pixel_std = pixel_std
self.image_thre = image_thre
self.center_scale = center_scale
self.dataset_name = 'coco'
def parse_dataset(self):
if self.use_gt_bbox:
self.db = self._load_coco_keypoint_annotations()
else:
self.db = self._load_coco_person_detection_results()
def _load_coco_keypoint_annotations(self):
coco = COCO(self.get_anno())
img_ids = coco.getImgIds()
gt_db = []
for index in img_ids:
im_ann = coco.loadImgs(index)[0]
width = im_ann['width']
height = im_ann['height']
file_name = im_ann['file_name']
im_id = int(im_ann["id"])
annIds = coco.getAnnIds(imgIds=index, iscrowd=False)
objs = coco.loadAnns(annIds)
valid_objs = []
for obj in objs:
x, y, w, h = obj['bbox']
x1 = np.max((0, x))
y1 = np.max((0, y))
x2 = np.min((width - 1, x1 + np.max((0, w - 1))))
y2 = np.min((height - 1, y1 + np.max((0, h - 1))))
if obj['area'] > 0 and x2 >= x1 and y2 >= y1:
obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1]
valid_objs.append(obj)
objs = valid_objs
rec = []
for obj in objs:
if max(obj['keypoints']) == 0:
continue
joints = np.zeros(
(self.ann_info['num_joints'], 3), dtype=np.float32)
joints_vis = np.zeros(
(self.ann_info['num_joints'], 3), dtype=np.float32)
for ipt in range(self.ann_info['num_joints']):
joints[ipt, 0] = obj['keypoints'][ipt * 3 + 0]
joints[ipt, 1] = obj['keypoints'][ipt * 3 + 1]
joints[ipt, 2] = 0
t_vis = obj['keypoints'][ipt * 3 + 2]
if t_vis > 1:
t_vis = 1
joints_vis[ipt, 0] = t_vis
joints_vis[ipt, 1] = t_vis
joints_vis[ipt, 2] = 0
center, scale = self._box2cs(obj['clean_bbox'][:4])
rec.append({
'image_file': os.path.join(self.img_prefix, file_name),
'center': center,
'scale': scale,
'gt_joints': joints,
'joints_vis': joints_vis,
'im_id': im_id,
})
gt_db.extend(rec)
return gt_db
def _box2cs(self, box):
x, y, w, h = box[:4]
center = np.zeros((2), dtype=np.float32)
center[0] = x + w * 0.5
center[1] = y + h * 0.5
aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1]
if self.center_scale is not None and np.random.rand() < 0.3:
center += self.center_scale * (np.random.rand(2) - 0.5) * [w, h]
if w > aspect_ratio * h:
h = w * 1.0 / aspect_ratio
elif w < aspect_ratio * h:
w = h * aspect_ratio
scale = np.array(
[w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
dtype=np.float32)
if center[0] != -1:
scale = scale * 1.25
return center, scale
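    # Editor's note, a worked example of _box2cs: with trainsize [288, 384]
    # (aspect_ratio = 0.75) and pixel_std = 200, the box [x, y, w, h] =
    # [50, 40, 100, 200] gives center = (100., 140.); since w < 0.75 * h the
    # box is widened to w = 150, so scale = [150/200, 200/200] * 1.25 =
    # [0.9375, 1.25]. The 1.25 inflation pads the crop so joints near the box
    # border are not cut off.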
def _load_coco_person_detection_results(self):
all_boxes = None
bbox_file_path = os.path.join(self.dataset_dir, self.bbox_file)
with open(bbox_file_path, 'r') as f:
all_boxes = json.load(f)
if not all_boxes:
            print('=> Load %s failed!' % bbox_file_path)
return None
kpt_db = []
for n_img in range(0, len(all_boxes)):
det_res = all_boxes[n_img]
if det_res['category_id'] != 1:
continue
file_name = det_res[
'filename'] if 'filename' in det_res else '%012d.jpg' % det_res[
'image_id']
img_name = os.path.join(self.img_prefix, file_name)
box = det_res['bbox']
score = det_res['score']
im_id = int(det_res['image_id'])
if score < self.image_thre:
continue
center, scale = self._box2cs(box)
joints = np.zeros(
(self.ann_info['num_joints'], 3), dtype=np.float32)
joints_vis = np.ones(
(self.ann_info['num_joints'], 3), dtype=np.float32)
kpt_db.append({
'image_file': img_name,
'im_id': im_id,
'center': center,
'scale': scale,
'score': score,
'gt_joints': joints,
'joints_vis': joints_vis,
})
return kpt_db
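# Editor's note (assumed format): bbox_file is expected to be a JSON list of
# COCO-style person detections, e.g.
#   [{"image_id": 397133, "category_id": 1,
#     "bbox": [388.66, 69.92, 109.41, 277.62], "score": 0.99}, ...]
# Non-person entries (category_id != 1) and boxes scoring below image_thre are
# skipped; each remaining box becomes a center/scale crop via _box2cs().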
@register
@serializable
class KeypointTopDownCocoWholeBodyHandDataset(KeypointTopDownBaseDataset):
"""CocoWholeBody dataset for top-down hand pose estimation.
    The dataset loads raw features and applies the specified transforms
to return a dict containing the image tensors and other information.
COCO-WholeBody Hand keypoint indexes:
0: 'wrist',
1: 'thumb1',
2: 'thumb2',
3: 'thumb3',
4: 'thumb4',
5: 'forefinger1',
6: 'forefinger2',
7: 'forefinger3',
8: 'forefinger4',
9: 'middle_finger1',
10: 'middle_finger2',
11: 'middle_finger3',
12: 'middle_finger4',
13: 'ring_finger1',
14: 'ring_finger2',
15: 'ring_finger3',
16: 'ring_finger4',
17: 'pinky_finger1',
18: 'pinky_finger2',
19: 'pinky_finger3',
20: 'pinky_finger4'
Args:
dataset_dir (str): Root path to the dataset.
image_dir (str): Path to a directory where images are held.
anno_path (str): Relative path to the annotation file.
num_joints (int): Keypoint numbers
trainsize (list):[w, h] Image target size
transform (composed(operators)): A sequence of data transforms.
pixel_std (int): The pixel std of the scale
Default: 200.
"""
def __init__(self,
dataset_dir,
image_dir,
anno_path,
num_joints,
trainsize,
transform=[],
pixel_std=200):
super().__init__(dataset_dir, image_dir, anno_path, num_joints,
transform)
self.trainsize = trainsize
self.pixel_std = pixel_std
self.dataset_name = 'coco_wholebady_hand'
def _box2cs(self, box):
x, y, w, h = box[:4]
center = np.zeros((2), dtype=np.float32)
center[0] = x + w * 0.5
center[1] = y + h * 0.5
aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1]
if w > aspect_ratio * h:
h = w * 1.0 / aspect_ratio
elif w < aspect_ratio * h:
w = h * aspect_ratio
scale = np.array(
[w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
dtype=np.float32)
if center[0] != -1:
scale = scale * 1.25
return center, scale
def parse_dataset(self):
gt_db = []
num_joints = self.ann_info['num_joints']
coco = COCO(self.get_anno())
img_ids = list(coco.imgs.keys())
for img_id in img_ids:
im_ann = coco.loadImgs(img_id)[0]
image_file = os.path.join(self.img_prefix, im_ann['file_name'])
im_id = int(im_ann["id"])
ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False)
objs = coco.loadAnns(ann_ids)
for obj in objs:
for type in ['left', 'right']:
if (obj[f'{type}hand_valid'] and
max(obj[f'{type}hand_kpts']) > 0):
joints = np.zeros((num_joints, 3), dtype=np.float32)
joints_vis = np.zeros((num_joints, 3), dtype=np.float32)
keypoints = np.array(obj[f'{type}hand_kpts'])
keypoints = keypoints.reshape(-1, 3)
joints[:, :2] = keypoints[:, :2]
joints_vis[:, :2] = np.minimum(1, keypoints[:, 2:3])
center, scale = self._box2cs(obj[f'{type}hand_box'][:4])
gt_db.append({
'image_file': image_file,
'center': center,
'scale': scale,
'gt_joints': joints,
'joints_vis': joints_vis,
'im_id': im_id,
})
self.db = gt_db
@register
@serializable
class KeypointTopDownMPIIDataset(KeypointTopDownBaseDataset):
"""MPII dataset for topdown pose estimation.
    The dataset loads raw features and applies the specified transforms
to return a dict containing the image tensors and other information.
MPII keypoint indexes::
0: 'right_ankle',
1: 'right_knee',
2: 'right_hip',
3: 'left_hip',
4: 'left_knee',
5: 'left_ankle',
6: 'pelvis',
7: 'thorax',
8: 'upper_neck',
9: 'head_top',
10: 'right_wrist',
11: 'right_elbow',
12: 'right_shoulder',
13: 'left_shoulder',
14: 'left_elbow',
15: 'left_wrist',
Args:
dataset_dir (str): Root path to the dataset.
image_dir (str): Path to a directory where images are held.
anno_path (str): Relative path to the annotation file.
num_joints (int): Keypoint numbers
trainsize (list):[w, h] Image target size
transform (composed(operators)): A sequence of data transforms.
"""
def __init__(self,
dataset_dir,
image_dir,
anno_path,
num_joints,
transform=[]):
super().__init__(dataset_dir, image_dir, anno_path, num_joints,
transform)
self.dataset_name = 'mpii'
def parse_dataset(self):
with open(self.get_anno()) as anno_file:
anno = json.load(anno_file)
gt_db = []
for a in anno:
image_name = a['image']
im_id = a['image_id'] if 'image_id' in a else int(
os.path.splitext(image_name)[0])
c = np.array(a['center'], dtype=np.float32)
s = np.array([a['scale'], a['scale']], dtype=np.float32)
# Adjust center/scale slightly to avoid cropping limbs
if c[0] != -1:
c[1] = c[1] + 15 * s[1]
s = s * 1.25
c = c - 1
joints = np.zeros(
(self.ann_info['num_joints'], 3), dtype=np.float32)
joints_vis = np.zeros(
(self.ann_info['num_joints'], 3), dtype=np.float32)
if 'gt_joints' in a:
joints_ = np.array(a['gt_joints'])
joints_[:, 0:2] = joints_[:, 0:2] - 1
joints_vis_ = np.array(a['joints_vis'])
assert len(joints_) == self.ann_info[
'num_joints'], 'joint num diff: {} vs {}'.format(
len(joints_), self.ann_info['num_joints'])
joints[:, 0:2] = joints_[:, 0:2]
joints_vis[:, 0] = joints_vis_[:]
joints_vis[:, 1] = joints_vis_[:]
gt_db.append({
'image_file': os.path.join(self.img_prefix, image_name),
'im_id': im_id,
'center': c,
'scale': s,
'gt_joints': joints,
'joints_vis': joints_vis
})
print("number length: {}".format(len(gt_db)))
self.db = gt_db
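# Editor's note (assumed format): parse_dataset() above expects a JSON list of
# converted MPII entries, roughly of the form
#   {"image": "005808361.jpg", "center": [594.0, 257.0], "scale": 3.021,
#    "gt_joints": [[x, y], ... 16 rows ...], "joints_vis": [1, 0, ..., 1]}
# Raw MPII coordinates are 1-based, hence the `- 1` shifts; the center is
# nudged down by 15 * scale and the scale inflated by 1.25 to avoid cropping
# limbs.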
================================================
FILE: ppdet/data/source/lvis.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import cv2
import copy
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
import numpy as np
from ppdet.core.workspace import register, serializable
from .dataset import DetDataset
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
__all__ = [
'LVISDataSet',
]
@register
@serializable
class LVISDataSet(DetDataset):
"""
Load dataset with LVISDataSet format.
Args:
dataset_dir (str): root directory for dataset.
image_dir (str): directory for images.
        anno_path (str): lvis annotation file path.
data_fields (list): key name of data dictionary, at least have 'image'.
sample_num (int): number of samples to load, -1 means all.
load_crowd (bool): whether to load crowded ground-truth.
False as default
allow_empty (bool): whether to load empty entry. False as default
        empty_ratio (float): the ratio of empty records to the total number
            of records; if empty_ratio is outside [0., 1.), the empty entries
            are not sampled and all of them are kept. 1. as default
        repeat (int): number of times to repeat the dataset, used in benchmarks.
"""
def __init__(self,
dataset_dir=None,
image_dir=None,
anno_path=None,
data_fields=['image'],
sample_num=-1,
load_crowd=False,
allow_empty=False,
empty_ratio=1.,
repeat=1):
super(LVISDataSet, self).__init__(
dataset_dir,
image_dir,
anno_path,
data_fields,
sample_num,
repeat=repeat)
self.load_image_only = False
self.load_semantic = False
self.load_crowd = load_crowd
self.allow_empty = allow_empty
self.empty_ratio = empty_ratio
def _sample_empty(self, records, num):
# if empty_ratio is out of [0. ,1.), do not sample the records
if self.empty_ratio < 0. or self.empty_ratio >= 1.:
return records
import random
sample_num = min(
int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records))
records = random.sample(records, sample_num)
return records
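    # Editor's note, a worked example: _sample_empty() caps the number of kept
    # empty records at roughly empty_ratio of the final total. With 1000
    # non-empty records and empty_ratio = 0.1, at most
    # int(1000 * 0.1 / 0.9) = 111 empty records survive; an empty_ratio
    # outside [0., 1.) skips sampling and keeps every empty record.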
def parse_dataset(self):
anno_path = os.path.join(self.dataset_dir, self.anno_path)
image_dir = os.path.join(self.dataset_dir, self.image_dir)
assert anno_path.endswith('.json'), \
            'invalid lvis annotation file: ' + anno_path
from lvis import LVIS
lvis_ = LVIS(anno_path)
img_ids = lvis_.get_img_ids()
img_ids.sort()
cat_ids = lvis_.get_cat_ids()
records = []
empty_records = []
ct = 0
self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})
self.cname2cid = dict({
lvis_.load_cats([catid])[0]['name']: clsid
for catid, clsid in self.catid2clsid.items()
})
if 'annotations' not in lvis_.dataset:
self.load_image_only = True
logger.warning('Annotation file: {} does not contains ground truth '
'and load image information only.'.format(anno_path))
for img_id in img_ids:
img_anno = lvis_.load_imgs([img_id])[0]
im_fname = img_anno['coco_url'].replace('http://images.cocodataset.org/', '')
im_w = float(img_anno['width'])
im_h = float(img_anno['height'])
im_path = os.path.join(image_dir,
im_fname) if image_dir else im_fname
is_empty = False
if not os.path.exists(im_path):
logger.warning('Illegal image file: {}, and it will be '
'ignored'.format(im_path))
continue
if im_w < 0 or im_h < 0:
logger.warning('Illegal width: {} or height: {} in annotation, '
'and im_id: {} will be ignored'.format(
im_w, im_h, img_id))
continue
coco_rec = {
'im_file': im_path,
'im_id': np.array([img_id]),
'h': im_h,
'w': im_w,
} if 'image' in self.data_fields else {}
if not self.load_image_only:
ins_anno_ids = lvis_.get_ann_ids(img_ids=[img_id])
instances = lvis_.load_anns(ins_anno_ids)
bboxes = []
is_rbox_anno = False
for inst in instances:
# check gt bbox
if inst.get('ignore', False):
continue
if 'bbox' not in inst.keys():
continue
else:
if not any(np.array(inst['bbox'])):
continue
x1, y1, box_w, box_h = inst['bbox']
x2 = x1 + box_w
y2 = y1 + box_h
eps = 1e-5
if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps:
inst['clean_bbox'] = [
round(float(x), 3) for x in [x1, y1, x2, y2]
]
bboxes.append(inst)
else:
logger.warning(
'Found an invalid bbox in annotations: im_id: {}, '
'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(
img_id, float(inst['area']), x1, y1, x2, y2))
num_bbox = len(bboxes)
if num_bbox <= 0 and not self.allow_empty:
continue
elif num_bbox <= 0:
is_empty = True
gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
gt_poly = [None] * num_bbox
gt_track_id = -np.ones((num_bbox, 1), dtype=np.int32)
has_segmentation = False
has_track_id = False
for i, box in enumerate(bboxes):
catid = box['category_id']
gt_class[i][0] = self.catid2clsid[catid]
gt_bbox[i, :] = box['clean_bbox']
# is_crowd[i][0] = box['iscrowd']
# check RLE format
# if 'segmentation' in box and box['iscrowd'] == 1:
# gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
# elif 'segmentation' in box and box['segmentation']:
# if not np.array(
# box['segmentation'],
# dtype=object).size > 0 and not self.allow_empty:
# bboxes.pop(i)
# gt_poly.pop(i)
# np.delete(is_crowd, i)
# np.delete(gt_class, i)
# np.delete(gt_bbox, i)
# else:
# gt_poly[i] = box['segmentation']
# has_segmentation = True
if 'track_id' in box:
gt_track_id[i][0] = box['track_id']
has_track_id = True
if has_segmentation and not any(
gt_poly) and not self.allow_empty:
continue
gt_rec = {
'is_crowd': is_crowd,
'gt_class': gt_class,
'gt_bbox': gt_bbox,
'gt_poly': gt_poly,
}
if has_track_id:
gt_rec.update({'gt_track_id': gt_track_id})
for k, v in gt_rec.items():
if k in self.data_fields:
coco_rec[k] = v
# TODO: remove load_semantic
if self.load_semantic and 'semantic' in self.data_fields:
seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps',
'train2017', im_fname[:-3] + 'png')
coco_rec.update({'semantic': seg_path})
logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format(
im_path, img_id, im_h, im_w))
if is_empty:
empty_records.append(coco_rec)
else:
records.append(coco_rec)
ct += 1
if self.sample_num > 0 and ct >= self.sample_num:
break
        assert ct > 0, 'not found any lvis record in %s' % (anno_path)
logger.info('Load [{} samples valid, {} samples invalid] in file {}.'.
format(ct, len(img_ids) - ct, anno_path))
if self.allow_empty and len(empty_records) > 0:
empty_records = self._sample_empty(empty_records, len(records))
records += empty_records
self.roidbs = records
================================================
FILE: ppdet/data/source/mot.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import cv2
import glob
import numpy as np
from collections import OrderedDict, defaultdict
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
from .dataset import DetDataset, _make_dataset, _is_valid_file
from ppdet.core.workspace import register, serializable
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
@register
@serializable
class MOTDataSet(DetDataset):
"""
    Load dataset in MOT format; only single-class MOT is supported.
Args:
dataset_dir (str): root directory for dataset.
        image_lists (str|list): mot data image lists, for multi-source mot datasets.
data_fields (list): key name of data dictionary, at least have 'image'.
sample_num (int): number of samples to load, -1 means all.
        repeat (int): number of times to repeat the dataset, used in benchmarks.
Notes:
        The MOT dataset root directory is organized as follows:
dataset/mot
|——————image_lists
| |——————caltech.train
| |——————caltech.val
| |——————mot16.train
| |——————mot17.train
| ......
|——————Caltech
|——————MOT17
|——————......
All the MOT datasets have the following structure:
Caltech
|——————images
| └——————00001.jpg
| |—————— ...
| └——————0000N.jpg
└——————labels_with_ids
└——————00001.txt
|—————— ...
└——————0000N.txt
or
MOT17
|——————images
| └——————train
| └——————test
└——————labels_with_ids
└——————train
"""
def __init__(self,
dataset_dir=None,
image_lists=[],
data_fields=['image'],
sample_num=-1,
repeat=1):
super(MOTDataSet, self).__init__(
dataset_dir=dataset_dir,
data_fields=data_fields,
sample_num=sample_num,
repeat=repeat)
self.dataset_dir = dataset_dir
self.image_lists = image_lists
if isinstance(self.image_lists, str):
self.image_lists = [self.image_lists]
self.roidbs = None
self.cname2cid = None
def get_anno(self):
if self.image_lists == []:
return
# only used to get categories and metric
# only check first data, but the label_list of all data should be same.
first_mot_data = self.image_lists[0].split('.')[0]
anno_file = os.path.join(self.dataset_dir, first_mot_data,
'label_list.txt')
return anno_file
def parse_dataset(self):
self.img_files = OrderedDict()
self.img_start_index = OrderedDict()
self.label_files = OrderedDict()
self.tid_num = OrderedDict()
self.tid_start_index = OrderedDict()
img_index = 0
for data_name in self.image_lists:
# check every data image list
image_lists_dir = os.path.join(self.dataset_dir, 'image_lists')
assert os.path.isdir(image_lists_dir), \
"The {} is not a directory.".format(image_lists_dir)
list_path = os.path.join(image_lists_dir, data_name)
assert os.path.exists(list_path), \
"The list path {} does not exist.".format(list_path)
# record img_files, filter out empty ones
with open(list_path, 'r') as file:
self.img_files[data_name] = file.readlines()
self.img_files[data_name] = [
os.path.join(self.dataset_dir, x.strip())
for x in self.img_files[data_name]
]
self.img_files[data_name] = list(
filter(lambda x: len(x) > 0, self.img_files[data_name]))
self.img_start_index[data_name] = img_index
img_index += len(self.img_files[data_name])
# record label_files
self.label_files[data_name] = [
x.replace('images', 'labels_with_ids').replace(
'.png', '.txt').replace('.jpg', '.txt')
for x in self.img_files[data_name]
]
for data_name, label_paths in self.label_files.items():
max_index = -1
for lp in label_paths:
lb = np.loadtxt(lp)
if len(lb) < 1:
continue
if len(lb.shape) < 2:
img_max = lb[1]
else:
img_max = np.max(lb[:, 1])
if img_max > max_index:
max_index = img_max
self.tid_num[data_name] = int(max_index + 1)
last_index = 0
for i, (k, v) in enumerate(self.tid_num.items()):
self.tid_start_index[k] = last_index
last_index += v
self.num_identities_dict = defaultdict(int)
self.num_identities_dict[0] = int(last_index + 1) # single class
self.num_imgs_each_data = [len(x) for x in self.img_files.values()]
self.total_imgs = sum(self.num_imgs_each_data)
logger.info('MOT dataset summary: ')
logger.info(self.tid_num)
logger.info('Total images: {}'.format(self.total_imgs))
logger.info('Image start index: {}'.format(self.img_start_index))
logger.info('Total identities: {}'.format(self.num_identities_dict[0]))
logger.info('Identity start index: {}'.format(self.tid_start_index))
records = []
cname2cid = mot_label()
for img_index in range(self.total_imgs):
for i, (k, v) in enumerate(self.img_start_index.items()):
if img_index >= v:
data_name = list(self.label_files.keys())[i]
start_index = v
img_file = self.img_files[data_name][img_index - start_index]
lbl_file = self.label_files[data_name][img_index - start_index]
if not os.path.exists(img_file):
logger.warning('Illegal image file: {}, and it will be ignored'.
format(img_file))
continue
if not os.path.isfile(lbl_file):
logger.warning('Illegal label file: {}, and it will be ignored'.
format(lbl_file))
continue
labels = np.loadtxt(lbl_file, dtype=np.float32).reshape(-1, 6)
# each row in labels (N, 6) is [gt_class, gt_identity, cx, cy, w, h]
cx, cy = labels[:, 2], labels[:, 3]
w, h = labels[:, 4], labels[:, 5]
gt_bbox = np.stack((cx, cy, w, h)).T.astype('float32')
gt_class = labels[:, 0:1].astype('int32')
gt_score = np.ones((len(labels), 1)).astype('float32')
gt_ide = labels[:, 1:2].astype('int32')
for i, _ in enumerate(gt_ide):
if gt_ide[i] > -1:
gt_ide[i] += self.tid_start_index[data_name]
mot_rec = {
'im_file': img_file,
'im_id': img_index,
} if 'image' in self.data_fields else {}
gt_rec = {
'gt_class': gt_class,
'gt_score': gt_score,
'gt_bbox': gt_bbox,
'gt_ide': gt_ide,
}
for k, v in gt_rec.items():
if k in self.data_fields:
mot_rec[k] = v
records.append(mot_rec)
if self.sample_num > 0 and img_index >= self.sample_num:
break
assert len(records) > 0, 'not found any mot record in %s' % (
self.image_lists)
self.roidbs, self.cname2cid = records, cname2cid
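# Editor's note (sketch of the assumed label format): each row of a
# labels_with_ids/*.txt file is [class_id, track_id, cx, cy, w, h], with the
# box given as a normalized center/size. Track ids restart from 0 inside every
# sub-dataset, so parse_dataset() shifts them by tid_start_index to keep
# identities globally unique; e.g. with hypothetical counts:
#   caltech ids 0..1000 stay 0..1000      (tid_start_index['caltech'] = 0)
#   mot17   ids 0..500  become 1001..1501 (tid_start_index['mot17'] = 1001)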
@register
@serializable
class MCMOTDataSet(DetDataset):
"""
    Load dataset in MOT format; multi-class MOT is supported.
Args:
dataset_dir (str): root directory for dataset.
        image_lists (list(str)): mcmot data image lists, for multi-source mcmot datasets.
data_fields (list): key name of data dictionary, at least have 'image'.
label_list (str): if use_default_label is False, will load
mapping between category and class index.
sample_num (int): number of samples to load, -1 means all.
Notes:
        The MCMOT dataset root directory is organized as follows:
dataset/mot
|——————image_lists
| |——————visdrone_mcmot.train
| |——————visdrone_mcmot.val
visdrone_mcmot
|——————images
| └——————train
| └——————val
└——————labels_with_ids
└——————train
"""
def __init__(self,
dataset_dir=None,
image_lists=[],
data_fields=['image'],
label_list=None,
sample_num=-1):
super(MCMOTDataSet, self).__init__(
dataset_dir=dataset_dir,
data_fields=data_fields,
sample_num=sample_num)
self.dataset_dir = dataset_dir
self.image_lists = image_lists
if isinstance(self.image_lists, str):
self.image_lists = [self.image_lists]
self.label_list = label_list
self.roidbs = None
self.cname2cid = None
def get_anno(self):
if self.image_lists == []:
return
# only used to get categories and metric
# only check first data, but the label_list of all data should be same.
first_mot_data = self.image_lists[0].split('.')[0]
anno_file = os.path.join(self.dataset_dir, first_mot_data,
'label_list.txt')
return anno_file
def parse_dataset(self):
self.img_files = OrderedDict()
self.img_start_index = OrderedDict()
self.label_files = OrderedDict()
self.tid_num = OrderedDict()
self.tid_start_idx_of_cls_ids = defaultdict(dict) # for MCMOT
img_index = 0
for data_name in self.image_lists:
# check every data image list
image_lists_dir = os.path.join(self.dataset_dir, 'image_lists')
assert os.path.isdir(image_lists_dir), \
"The {} is not a directory.".format(image_lists_dir)
list_path = os.path.join(image_lists_dir, data_name)
assert os.path.exists(list_path), \
"The list path {} does not exist.".format(list_path)
# record img_files, filter out empty ones
with open(list_path, 'r') as file:
self.img_files[data_name] = file.readlines()
self.img_files[data_name] = [
os.path.join(self.dataset_dir, x.strip())
for x in self.img_files[data_name]
]
self.img_files[data_name] = list(
filter(lambda x: len(x) > 0, self.img_files[data_name]))
self.img_start_index[data_name] = img_index
img_index += len(self.img_files[data_name])
# record label_files
self.label_files[data_name] = [
x.replace('images', 'labels_with_ids').replace(
'.png', '.txt').replace('.jpg', '.txt')
for x in self.img_files[data_name]
]
for data_name, label_paths in self.label_files.items():
# using max_ids_dict rather than max_index
max_ids_dict = defaultdict(int)
for lp in label_paths:
lb = np.loadtxt(lp)
if len(lb) < 1:
continue
lb = lb.reshape(-1, 6)
for item in lb:
if item[1] > max_ids_dict[int(item[0])]:
# item[0]: cls_id
# item[1]: track id
max_ids_dict[int(item[0])] = int(item[1])
# track id number
self.tid_num[data_name] = max_ids_dict
last_idx_dict = defaultdict(int)
for i, (k, v) in enumerate(self.tid_num.items()): # each sub dataset
for cls_id, id_num in v.items(): # v is a max_ids_dict
self.tid_start_idx_of_cls_ids[k][cls_id] = last_idx_dict[cls_id]
last_idx_dict[cls_id] += id_num
self.num_identities_dict = defaultdict(int)
for k, v in last_idx_dict.items():
self.num_identities_dict[k] = int(v) # total ids of each category
self.num_imgs_each_data = [len(x) for x in self.img_files.values()]
self.total_imgs = sum(self.num_imgs_each_data)
# cname2cid and cid2cname
cname2cid = {}
if self.label_list is not None:
# if use label_list for multi source mix dataset,
# please make sure label_list in the first sub_dataset at least.
sub_dataset = self.image_lists[0].split('.')[0]
label_path = os.path.join(self.dataset_dir, sub_dataset,
self.label_list)
if not os.path.exists(label_path):
logger.info(
"Note: label_list {} does not exists, use VisDrone 10 classes labels as default.".
format(label_path))
cname2cid = visdrone_mcmot_label()
else:
with open(label_path, 'r') as fr:
label_id = 0
for line in fr.readlines():
cname2cid[line.strip()] = label_id
label_id += 1
else:
cname2cid = visdrone_mcmot_label()
cid2cname = dict([(v, k) for (k, v) in cname2cid.items()])
logger.info('MCMOT dataset summary: ')
logger.info(self.tid_num)
logger.info('Total images: {}'.format(self.total_imgs))
logger.info('Image start index: {}'.format(self.img_start_index))
logger.info('Total identities of each category: ')
num_identities_dict = sorted(
self.num_identities_dict.items(), key=lambda x: x[0])
total_IDs_all_cats = 0
for (k, v) in num_identities_dict:
logger.info('Category {} [{}] has {} IDs.'.format(k, cid2cname[k],
v))
total_IDs_all_cats += v
logger.info('Total identities of all categories: {}'.format(
total_IDs_all_cats))
logger.info('Identity start index of each category: ')
for k, v in self.tid_start_idx_of_cls_ids.items():
sorted_v = sorted(v.items(), key=lambda x: x[0])
for (cls_id, start_idx) in sorted_v:
logger.info('Start index of dataset {} category {:d} is {:d}'
.format(k, cls_id, start_idx))
records = []
for img_index in range(self.total_imgs):
for i, (k, v) in enumerate(self.img_start_index.items()):
if img_index >= v:
data_name = list(self.label_files.keys())[i]
start_index = v
img_file = self.img_files[data_name][img_index - start_index]
lbl_file = self.label_files[data_name][img_index - start_index]
if not os.path.exists(img_file):
logger.warning('Illegal image file: {}, and it will be ignored'.
format(img_file))
continue
if not os.path.isfile(lbl_file):
logger.warning('Illegal label file: {}, and it will be ignored'.
format(lbl_file))
continue
labels = np.loadtxt(lbl_file, dtype=np.float32).reshape(-1, 6)
# each row in labels (N, 6) is [gt_class, gt_identity, cx, cy, w, h]
cx, cy = labels[:, 2], labels[:, 3]
w, h = labels[:, 4], labels[:, 5]
gt_bbox = np.stack((cx, cy, w, h)).T.astype('float32')
gt_class = labels[:, 0:1].astype('int32')
gt_score = np.ones((len(labels), 1)).astype('float32')
gt_ide = labels[:, 1:2].astype('int32')
for i, _ in enumerate(gt_ide):
if gt_ide[i] > -1:
cls_id = int(gt_class[i])
start_idx = self.tid_start_idx_of_cls_ids[data_name][cls_id]
gt_ide[i] += start_idx
mot_rec = {
'im_file': img_file,
'im_id': img_index,
} if 'image' in self.data_fields else {}
gt_rec = {
'gt_class': gt_class,
'gt_score': gt_score,
'gt_bbox': gt_bbox,
'gt_ide': gt_ide,
}
for k, v in gt_rec.items():
if k in self.data_fields:
mot_rec[k] = v
records.append(mot_rec)
if self.sample_num > 0 and img_index >= self.sample_num:
break
assert len(records) > 0, 'not found any mot record in %s' % (
self.image_lists)
self.roidbs, self.cname2cid = records, cname2cid
@register
@serializable
class MOTImageFolder(DetDataset):
"""
    Load MOT dataset in MOT format from an image folder or a video.
Args:
video_file (str): path of the video file, default ''.
frame_rate (int): frame rate of the video, use cv2 VideoCapture if not set.
dataset_dir (str): root directory for dataset.
keep_ori_im (bool): whether to keep original image, default False.
Set True when used during MOT model inference while saving
images or video, or used in DeepSORT.
"""
def __init__(self,
video_file=None,
frame_rate=-1,
dataset_dir=None,
data_root=None,
image_dir=None,
sample_num=-1,
keep_ori_im=False,
anno_path=None,
**kwargs):
super(MOTImageFolder, self).__init__(
dataset_dir, image_dir, sample_num=sample_num)
self.video_file = video_file
self.data_root = data_root
self.keep_ori_im = keep_ori_im
self._imid2path = {}
self.roidbs = None
self.frame_rate = frame_rate
self.anno_path = anno_path
def check_or_download_dataset(self):
return
def parse_dataset(self, ):
if not self.roidbs:
if self.video_file is None:
self.frame_rate = 30 # set as default if infer image folder
self.roidbs = self._load_images()
else:
self.roidbs = self._load_video_images()
def _load_video_images(self):
if self.frame_rate == -1:
# if frame_rate is not set for video, use cv2.VideoCapture
cap = cv2.VideoCapture(self.video_file)
self.frame_rate = int(cap.get(cv2.CAP_PROP_FPS))
extension = self.video_file.split('.')[-1]
output_path = self.video_file.replace('.{}'.format(extension), '')
frames_path = video2frames(self.video_file, output_path,
self.frame_rate)
self.video_frames = sorted(
glob.glob(os.path.join(frames_path, '*.png')))
self.video_length = len(self.video_frames)
logger.info('Length of the video: {:d} frames.'.format(
self.video_length))
ct = 0
records = []
for image in self.video_frames:
assert image != '' and os.path.isfile(image), \
"Image {} not found".format(image)
if self.sample_num > 0 and ct >= self.sample_num:
break
rec = {'im_id': np.array([ct]), 'im_file': image}
if self.keep_ori_im:
rec.update({'keep_ori_im': 1})
self._imid2path[ct] = image
ct += 1
records.append(rec)
assert len(records) > 0, "No image file found"
return records
def _find_images(self):
image_dir = self.image_dir
if not isinstance(image_dir, Sequence):
image_dir = [image_dir]
images = []
for im_dir in image_dir:
if os.path.isdir(im_dir):
im_dir = os.path.join(self.dataset_dir, im_dir)
images.extend(_make_dataset(im_dir))
elif os.path.isfile(im_dir) and _is_valid_file(im_dir):
images.append(im_dir)
return images
def _load_images(self):
images = self._find_images()
ct = 0
records = []
for image in images:
assert image != '' and os.path.isfile(image), \
"Image {} not found".format(image)
if self.sample_num > 0 and ct >= self.sample_num:
break
rec = {'im_id': np.array([ct]), 'im_file': image}
if self.keep_ori_im:
rec.update({'keep_ori_im': 1})
self._imid2path[ct] = image
ct += 1
records.append(rec)
assert len(records) > 0, "No image file found"
return records
def get_imid2path(self):
return self._imid2path
def set_images(self, images):
self.image_dir = images
self.roidbs = self._load_images()
def set_video(self, video_file, frame_rate):
# update video_file and frame_rate by command line of tools/infer_mot.py
self.video_file = video_file
self.frame_rate = frame_rate
assert os.path.isfile(self.video_file) and _is_valid_video(self.video_file), \
"wrong or unsupported file format: {}".format(self.video_file)
self.roidbs = self._load_video_images()
def get_anno(self):
return self.anno_path
def _is_valid_video(f, extensions=('.mp4', '.avi', '.mov', '.rmvb', '.flv')):
return f.lower().endswith(extensions)
def video2frames(video_path, outpath, frame_rate, **kargs):
def _dict2str(kargs):
cmd_str = ''
for k, v in kargs.items():
cmd_str += (' ' + str(k) + ' ' + str(v))
return cmd_str
ffmpeg = ['ffmpeg ', ' -y -loglevel ', ' error ']
vid_name = os.path.basename(video_path).split('.')[0]
out_full_path = os.path.join(outpath, vid_name)
if not os.path.exists(out_full_path):
os.makedirs(out_full_path)
# video file name
outformat = os.path.join(out_full_path, '%08d.png')
    cmd = ffmpeg + [
        ' -i ', video_path, ' -r ', str(frame_rate), ' -f image2 ', outformat
    ]
cmd = ''.join(cmd) + _dict2str(kargs)
    if os.system(cmd) != 0:
        raise RuntimeError('ffmpeg process video: {} error'.format(video_path))
sys.stdout.flush()
return out_full_path
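# Editor's note (illustrative sketch): the typical video-input flow, with a
# hypothetical path. video2frames() shells out to ffmpeg, so the ffmpeg binary
# must be on PATH; extracted frames land in <outpath>/<video name>/%08d.png.
def _example_video_input():
    dataset = MOTImageFolder(video_file=None, frame_rate=-1)
    dataset.set_video('demo/test_video.mp4', frame_rate=30)  # validates, then extracts frames
    print('{} frames'.format(len(dataset.roidbs)))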
def mot_label():
labels_map = {'person': 0}
return labels_map
def visdrone_mcmot_label():
labels_map = {
'pedestrian': 0,
'people': 1,
'bicycle': 2,
'car': 3,
'van': 4,
'truck': 5,
'tricycle': 6,
'awning-tricycle': 7,
'bus': 8,
'motor': 9,
}
return labels_map
================================================
FILE: ppdet/data/source/pose3d_cmb.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import cv2
import numpy as np
import json
import copy
import pycocotools
from pycocotools.coco import COCO
from .dataset import DetDataset
from ppdet.core.workspace import register, serializable
from paddle.io import Dataset
@serializable
class Pose3DDataset(DetDataset):
"""Pose3D Dataset class.
Args:
dataset_dir (str): Root path to the dataset.
anno_list (list of str): each of the element is a relative path to the annotation file.
image_dirs (list of str): each of path is a relative path where images are held.
transform (composed(operators)): A sequence of data transforms.
test_mode (bool): Store True when building test or
validation dataset. Default: False.
24 joints order:
0-2: 'R_Ankle', 'R_Knee', 'R_Hip',
3-5:'L_Hip', 'L_Knee', 'L_Ankle',
6-8:'R_Wrist', 'R_Elbow', 'R_Shoulder',
9-11:'L_Shoulder','L_Elbow','L_Wrist',
12-14:'Neck','Top_of_Head','Pelvis',
15-18:'Thorax','Spine','Jaw','Head',
19-23:'Nose','L_Eye','R_Eye','L_Ear','R_Ear'
"""
def __init__(self,
dataset_dir,
image_dirs,
anno_list,
transform=[],
num_joints=24,
test_mode=False):
super().__init__(dataset_dir, image_dirs, anno_list)
self.image_info = {}
self.ann_info = {}
self.num_joints = num_joints
self.transform = transform
self.test_mode = test_mode
self.img_ids = []
self.dataset_dir = dataset_dir
self.image_dirs = image_dirs
self.anno_list = anno_list
def get_mask(self, mvm_percent=0.3):
num_joints = self.num_joints
mjm_mask = np.ones((num_joints, 1)).astype(np.float32)
if self.test_mode == False:
pb = np.random.random_sample()
masked_num = int(
pb * mvm_percent *
num_joints) # at most x% of the joints could be masked
indices = np.random.choice(
np.arange(num_joints), replace=False, size=masked_num)
mjm_mask[indices, :] = 0.0
# return mjm_mask
num_joints = 10
        mvm_mask = np.ones((num_joints, 1)).astype(np.float32)
if self.test_mode == False:
num_vertices = num_joints
pb = np.random.random_sample()
masked_num = int(
pb * mvm_percent *
num_vertices) # at most x% of the vertices could be masked
indices = np.random.choice(
np.arange(num_vertices), replace=False, size=masked_num)
mvm_mask[indices, :] = 0.0
mjm_mask = np.concatenate([mjm_mask, mvm_mask], axis=0)
return mjm_mask
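    # Editor's note: get_mask() performs random joint/vertex masking for
    # training. With mvm_percent = 0.3 and 24 joints, masked_num =
    # int(pb * 0.3 * 24) with pb ~ U[0, 1), so at most 7 joints are zeroed;
    # likewise at most 2 of the 10 vertex entries. At test time the mask stays
    # all ones. The returned array has shape (24 + 10, 1).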
def filterjoints(self, x):
if self.num_joints == 24:
return x
elif self.num_joints == 14:
return x[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18], :]
elif self.num_joints == 17:
return x[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19], :]
else:
raise ValueError(
"unsupported joint numbers, only [24 or 17 or 14] is supported!")
def parse_dataset(self):
print("Loading annotations..., please wait")
self.annos = []
im_id = 0
self.human36m_num = 0
for idx, annof in enumerate(self.anno_list):
img_prefix = os.path.join(self.dataset_dir, self.image_dirs[idx])
dataf = os.path.join(self.dataset_dir, annof)
with open(dataf, 'r') as rf:
anno_data = json.load(rf)
annos = anno_data['data']
new_annos = []
print("{} has annos numbers: {}".format(dataf, len(annos)))
for anno in annos:
new_anno = {}
new_anno['im_id'] = im_id
im_id += 1
imagename = anno['imageName']
if imagename.startswith("COCO_train2014_"):
imagename = imagename[len("COCO_train2014_"):]
elif imagename.startswith("COCO_val2014_"):
imagename = imagename[len("COCO_val2014_"):]
imagename = os.path.join(img_prefix, imagename)
if not os.path.exists(imagename):
if "train2017" in imagename:
imagename = imagename.replace("train2017",
"val2017")
if not os.path.exists(imagename):
print("cannot find imagepath:{}".format(
imagename))
continue
else:
print("cannot find imagepath:{}".format(imagename))
continue
new_anno['imageName'] = imagename
if 'human3.6m' in imagename:
self.human36m_num += 1
new_anno['bbox_center'] = anno['bbox_center']
new_anno['bbox_scale'] = anno['bbox_scale']
new_anno['joints_2d'] = np.array(anno[
'gt_keypoint_2d']).astype(np.float32)
if new_anno['joints_2d'].shape[0] == 49:
#if the joints_2d is in SPIN format(which generated by eft), choose the last 24 public joints
#for detail please refer: https://github.com/nkolot/SPIN/blob/master/constants.py
new_anno['joints_2d'] = new_anno['joints_2d'][25:]
new_anno['joints_3d'] = np.array(anno[
'pose3d'])[:, :3].astype(np.float32)
new_anno['mjm_mask'] = self.get_mask()
if not 'has_3d_joints' in anno:
new_anno['has_3d_joints'] = int(1)
new_anno['has_2d_joints'] = int(1)
else:
new_anno['has_3d_joints'] = int(anno['has_3d_joints'])
new_anno['has_2d_joints'] = int(anno['has_2d_joints'])
new_anno['joints_2d'] = self.filterjoints(new_anno[
'joints_2d'])
self.annos.append(new_anno)
del annos
def get_temp_num(self):
"""get temporal data number, like human3.6m"""
return self.human36m_num
def __len__(self):
"""Get dataset length."""
return len(self.annos)
def _get_imganno(self, idx):
"""Get anno for a single image."""
return self.annos[idx]
def __getitem__(self, idx):
"""Prepare image for training given the index."""
records = copy.deepcopy(self._get_imganno(idx))
imgpath = records['imageName']
assert os.path.exists(imgpath), "cannot find image {}".format(imgpath)
records['image'] = cv2.imread(imgpath)
records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB)
records = self.transform(records)
return records
def check_or_download_dataset(self):
alldatafind = True
for image_dir in self.image_dirs:
image_dir = os.path.join(self.dataset_dir, image_dir)
if not os.path.isdir(image_dir):
print("dataset [{}] is not found".format(image_dir))
alldatafind = False
if not alldatafind:
            raise ValueError(
                "Some datasets are missing and cannot be downloaded automatically; "
                "please prepare the dataset first")
@register
@serializable
class Keypoint3DMultiFramesDataset(Dataset):
"""24 keypoints 3D dataset for pose estimation.
each item is a list of images
The dataset loads raw features and apply specified transforms
to return a dict containing the image tensors and other information.
Args:
dataset_dir (str): Root path to the dataset.
image_dir (str): Path to a directory where images are held.
"""
    def __init__(
            self,
            dataset_dir,  # root directory of the dataset
            image_dir,  # directory of images
            p3d_dir,  # directory of 3D keypoint files
            json_path,
            img_size,  # target size for image resizing
            num_frames,  # length of the frame sequence
            anno_path=None, ):
self.dataset_dir = dataset_dir
self.image_dir = image_dir
self.p3d_dir = p3d_dir
self.json_path = json_path
self.img_size = img_size
self.num_frames = num_frames
self.anno_path = anno_path
self.data_labels, self.mf_inds = self._generate_multi_frames_list()
def _generate_multi_frames_list(self):
        act_list = os.listdir(self.dataset_dir)  # list of action folders
count = 0
mf_list = []
annos_dict = {'images': [], 'annotations': [], 'act_inds': []}
        for act in act_list:  # generate frame sequences for each action
if '.' in act:
continue
json_path = os.path.join(self.dataset_dir, act, self.json_path)
with open(json_path, 'r') as j:
annos = json.load(j)
length = len(annos['images'])
for k, v in annos.items():
if k in annos_dict:
annos_dict[k].extend(v)
annos_dict['act_inds'].extend([act] * length)
mf = [[i + j + count for j in range(self.num_frames)]
for i in range(0, length - self.num_frames + 1)]
mf_list.extend(mf)
count += length
print("total data number:", len(mf_list))
return annos_dict, mf_list
def __call__(self, *args, **kwargs):
return self
    def __getitem__(self, index):  # fetch one consecutive frame sequence
        inds = self.mf_inds[
            index]  # e.g. [568, 569, 570, 571, 572, 573], length == num_frames
        images = self.data_labels['images']  # all images
        annots = self.data_labels['annotations']  # all annots
        act = self.data_labels['act_inds'][inds[0]]  # action name (folder name)
kps3d_list = []
kps3d_vis_list = []
names = []
h, w = 0, 0
for ind in inds: # one image
height = float(images[ind]['height'])
width = float(images[ind]['width'])
            name = images[ind]['file_name']  # image file name, with extension
kps3d_name = name.split('.')[0] + '.obj'
kps3d_path = os.path.join(self.dataset_dir, act, self.p3d_dir,
kps3d_name)
joints, joints_vis = self.kps3d_process(kps3d_path)
joints_vis = np.array(joints_vis, dtype=np.float32)
kps3d_list.append(joints)
kps3d_vis_list.append(joints_vis)
names.append(name)
kps3d = np.array(kps3d_list) # (6, 24, 3),(num_frames, joints_num, 3)
kps3d_vis = np.array(kps3d_vis_list)
# read image
imgs = []
for name in names:
img_path = os.path.join(self.dataset_dir, act, self.image_dir, name)
image = cv2.imread(img_path, cv2.IMREAD_COLOR |
cv2.IMREAD_IGNORE_ORIENTATION)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
imgs.append(np.expand_dims(image, axis=0))
imgs = np.concatenate(imgs, axis=0)
imgs = imgs.astype(
np.float32) # (6, 1080, 1920, 3),(num_frames, h, w, c)
        # attention: at this point the images and annotations are mirrored
records = {
'kps3d': kps3d,
'kps3d_vis': kps3d_vis,
"image": imgs,
'act': act,
'names': names,
'im_id': index
}
return self.transform(records)
def kps3d_process(self, kps3d_path):
count = 0
kps = []
kps_vis = []
with open(kps3d_path, 'r') as f:
lines = f.readlines()
for line in lines:
if line[0] == 'v':
kps.append([])
line = line.strip('\n').split(' ')[1:]
for kp in line:
kps[-1].append(float(kp))
count += 1
kps_vis.append([1, 1, 1])
kps = np.array(kps) # 52,3
kps_vis = np.array(kps_vis)
kps *= 10 # scale points
kps -= kps[[0], :] # set root point to zero
kps = np.concatenate((kps[0:23], kps[[37]]), axis=0) # 24,3
kps *= 10
kps_vis = np.concatenate((kps_vis[0:23], kps_vis[[37]]), axis=0) # 24,3
return kps, kps_vis
def __len__(self):
return len(self.mf_inds)
def get_anno(self):
if self.anno_path is None:
return
return os.path.join(self.dataset_dir, self.anno_path)
def check_or_download_dataset(self):
return
def parse_dataset(self, ):
return
def set_transform(self, transform):
self.transform = transform
def set_epoch(self, epoch_id):
self._epoch = epoch_id
def set_kwargs(self, **kwargs):
self.mixup_epoch = kwargs.get('mixup_epoch', -1)
self.cutmix_epoch = kwargs.get('cutmix_epoch', -1)
self.mosaic_epoch = kwargs.get('mosaic_epoch', -1)
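# A minimal sketch of the sliding-window index generation performed by
# _generate_multi_frames_list() above, assuming two hypothetical action
# folders with 5 and 4 frames and num_frames=3. Not called anywhere.
def _demo_multi_frames_index(num_frames=3, lengths=(5, 4)):
    count = 0
    mf_list = []
    for length in lengths:
        # every window of num_frames consecutive global frame indices
        mf = [[i + j + count for j in range(num_frames)]
              for i in range(0, length - num_frames + 1)]
        mf_list.extend(mf)
        count += length
    # -> [[0, 1, 2], [1, 2, 3], [2, 3, 4], [5, 6, 7], [6, 7, 8]]
    return mf_list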
================================================
FILE: ppdet/data/source/sniper_coco.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import cv2
import json
import copy
import numpy as np
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
from ppdet.core.workspace import register, serializable
from ppdet.data.crop_utils.annotation_cropper import AnnoCropper
from .coco import COCODataSet
from .dataset import _make_dataset, _is_valid_file
from ppdet.utils.logger import setup_logger
logger = setup_logger('sniper_coco_dataset')
@register
@serializable
class SniperCOCODataSet(COCODataSet):
"""SniperCOCODataSet"""
def __init__(self,
dataset_dir=None,
image_dir=None,
anno_path=None,
proposals_file=None,
data_fields=['image'],
sample_num=-1,
load_crowd=False,
allow_empty=True,
empty_ratio=1.,
is_trainset=True,
image_target_sizes=[2000, 1000],
valid_box_ratio_ranges=[[-1, 0.1],[0.08, -1]],
chip_target_size=500,
chip_target_stride=200,
use_neg_chip=False,
max_neg_num_per_im=8,
max_per_img=-1,
nms_thresh=0.5):
super(SniperCOCODataSet, self).__init__(
dataset_dir=dataset_dir,
image_dir=image_dir,
anno_path=anno_path,
data_fields=data_fields,
sample_num=sample_num,
load_crowd=load_crowd,
allow_empty=allow_empty,
empty_ratio=empty_ratio
)
self.proposals_file = proposals_file
self.proposals = None
self.anno_cropper = None
self.is_trainset = is_trainset
self.image_target_sizes = image_target_sizes
self.valid_box_ratio_ranges = valid_box_ratio_ranges
self.chip_target_size = chip_target_size
self.chip_target_stride = chip_target_stride
self.use_neg_chip = use_neg_chip
self.max_neg_num_per_im = max_neg_num_per_im
self.max_per_img = max_per_img
self.nms_thresh = nms_thresh
def parse_dataset(self):
if not hasattr(self, "roidbs"):
super(SniperCOCODataSet, self).parse_dataset()
if self.is_trainset:
self._parse_proposals()
self._merge_anno_proposals()
self.ori_roidbs = copy.deepcopy(self.roidbs)
self.init_anno_cropper()
self.roidbs = self.generate_chips_roidbs(self.roidbs, self.is_trainset)
def set_proposals_file(self, file_path):
self.proposals_file = file_path
def init_anno_cropper(self):
logger.info("Init AnnoCropper...")
self.anno_cropper = AnnoCropper(
image_target_sizes=self.image_target_sizes,
valid_box_ratio_ranges=self.valid_box_ratio_ranges,
chip_target_size=self.chip_target_size,
chip_target_stride=self.chip_target_stride,
use_neg_chip=self.use_neg_chip,
max_neg_num_per_im=self.max_neg_num_per_im,
max_per_img=self.max_per_img,
nms_thresh=self.nms_thresh
)
def generate_chips_roidbs(self, roidbs, is_trainset):
if is_trainset:
roidbs = self.anno_cropper.crop_anno_records(roidbs)
else:
roidbs = self.anno_cropper.crop_infer_anno_records(roidbs)
return roidbs
def _parse_proposals(self):
if self.proposals_file:
self.proposals = {}
logger.info("Parse proposals file:{}".format(self.proposals_file))
with open(self.proposals_file, 'r') as f:
proposals = json.load(f)
for prop in proposals:
image_id = prop["image_id"]
if image_id not in self.proposals:
self.proposals[image_id] = []
x, y, w, h = prop["bbox"]
self.proposals[image_id].append([x, y, x + w, y + h])
def _merge_anno_proposals(self):
assert self.roidbs
if self.proposals and len(self.proposals.keys()) > 0:
logger.info("merge proposals to annos")
            for idx, record in enumerate(self.roidbs):
                image_id = int(record["im_id"])
                if image_id not in self.proposals:
                    logger.info("image id: {} has no proposals".format(image_id))
                record["proposals"] = np.array(
                    self.proposals.get(image_id, []), dtype=np.float32)
                self.roidbs[idx] = record
def get_ori_roidbs(self):
if not hasattr(self, "ori_roidbs"):
return None
return self.ori_roidbs
def get_roidbs(self):
if not hasattr(self, "roidbs"):
self.parse_dataset()
return self.roidbs
def set_roidbs(self, roidbs):
self.roidbs = roidbs
def check_or_download_dataset(self):
return
def _parse(self):
image_dir = self.image_dir
if not isinstance(image_dir, Sequence):
image_dir = [image_dir]
images = []
for im_dir in image_dir:
if os.path.isdir(im_dir):
im_dir = os.path.join(self.dataset_dir, im_dir)
images.extend(_make_dataset(im_dir))
elif os.path.isfile(im_dir) and _is_valid_file(im_dir):
images.append(im_dir)
return images
def _load_images(self):
images = self._parse()
ct = 0
records = []
for image in images:
assert image != '' and os.path.isfile(image), \
"Image {} not found".format(image)
if self.sample_num > 0 and ct >= self.sample_num:
break
im = cv2.imread(image)
h, w, c = im.shape
rec = {'im_id': np.array([ct]), 'im_file': image, "h": h, "w": w}
self._imid2path[ct] = image
ct += 1
records.append(rec)
assert len(records) > 0, "No image file found"
return records
def get_imid2path(self):
return self._imid2path
def set_images(self, images):
self._imid2path = {}
self.image_dir = images
self.roidbs = self._load_images()
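# A minimal sketch of the proposal-format conversion done by
# SniperCOCODataSet._parse_proposals(): COCO-style [x, y, w, h] proposals
# grouped per image_id become [x1, y1, x2, y2] boxes. The entries below are
# hypothetical. Not called anywhere.
def _demo_parse_proposals():
    proposals_json = [
        {"image_id": 1, "bbox": [10., 20., 30., 40.]},
        {"image_id": 1, "bbox": [0., 0., 5., 5.]},
        {"image_id": 2, "bbox": [7., 7., 2., 2.]},
    ]
    proposals = {}
    for prop in proposals_json:
        x, y, w, h = prop["bbox"]
        proposals.setdefault(prop["image_id"], []).append(
            [x, y, x + w, y + h])
    # proposals[1] == [[10., 20., 40., 60.], [0., 0., 5., 5.]]
    return proposals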
================================================
FILE: ppdet/data/source/voc.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import xml.etree.ElementTree as ET
from ppdet.core.workspace import register, serializable
from .dataset import DetDataset
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
@register
@serializable
class VOCDataSet(DetDataset):
"""
Load dataset with PascalVOC format.
Notes:
`anno_path` must contains xml file and image file path for annotations.
Args:
dataset_dir (str): root directory for dataset.
image_dir (str): directory for images.
anno_path (str): voc annotation file path.
data_fields (list): key name of data dictionary, at least have 'image'.
sample_num (int): number of samples to load, -1 means all.
label_list (str): if use_default_label is False, will load
mapping between category and class index.
allow_empty (bool): whether to load empty entry. False as default
empty_ratio (float): the ratio of empty record number to total
record's, if empty_ratio is out of [0. ,1.), do not sample the
records and use all the empty entries. 1. as default
repeat (int): repeat times for dataset, use in benchmark.
"""
def __init__(self,
dataset_dir=None,
image_dir=None,
anno_path=None,
data_fields=['image'],
sample_num=-1,
label_list=None,
allow_empty=False,
empty_ratio=1.,
repeat=1):
super(VOCDataSet, self).__init__(
dataset_dir=dataset_dir,
image_dir=image_dir,
anno_path=anno_path,
data_fields=data_fields,
sample_num=sample_num,
repeat=repeat)
self.label_list = label_list
self.allow_empty = allow_empty
self.empty_ratio = empty_ratio
def _sample_empty(self, records, num):
# if empty_ratio is out of [0. ,1.), do not sample the records
if self.empty_ratio < 0. or self.empty_ratio >= 1.:
return records
import random
sample_num = min(
int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records))
records = random.sample(records, sample_num)
return records
def parse_dataset(self, ):
anno_path = os.path.join(self.dataset_dir, self.anno_path)
image_dir = os.path.join(self.dataset_dir, self.image_dir)
# mapping category name to class id
# first_class:0, second_class:1, ...
records = []
empty_records = []
ct = 0
cname2cid = {}
if self.label_list:
label_path = os.path.join(self.dataset_dir, self.label_list)
if not os.path.exists(label_path):
raise ValueError("label_list {} does not exists".format(
label_path))
with open(label_path, 'r') as fr:
label_id = 0
for line in fr.readlines():
cname2cid[line.strip()] = label_id
label_id += 1
else:
cname2cid = pascalvoc_label()
with open(anno_path, 'r') as fr:
while True:
line = fr.readline()
if not line:
break
img_file, xml_file = [os.path.join(image_dir, x) \
for x in line.strip().split()[:2]]
if not os.path.exists(img_file):
logger.warning(
'Illegal image file: {}, and it will be ignored'.format(
img_file))
continue
if not os.path.isfile(xml_file):
logger.warning(
'Illegal xml file: {}, and it will be ignored'.format(
xml_file))
continue
tree = ET.parse(xml_file)
if tree.find('id') is None:
im_id = np.array([ct])
else:
im_id = np.array([int(tree.find('id').text)])
objs = tree.findall('object')
im_w = float(tree.find('size').find('width').text)
im_h = float(tree.find('size').find('height').text)
if im_w < 0 or im_h < 0:
logger.warning(
'Illegal width: {} or height: {} in annotation, '
'and {} will be ignored'.format(im_w, im_h, xml_file))
continue
num_bbox, i = len(objs), 0
gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
gt_score = np.zeros((num_bbox, 1), dtype=np.float32)
difficult = np.zeros((num_bbox, 1), dtype=np.int32)
for obj in objs:
cname = obj.find('name').text
# user dataset may not contain difficult field
_difficult = obj.find('difficult')
_difficult = int(
_difficult.text) if _difficult is not None else 0
x1 = float(obj.find('bndbox').find('xmin').text)
y1 = float(obj.find('bndbox').find('ymin').text)
x2 = float(obj.find('bndbox').find('xmax').text)
y2 = float(obj.find('bndbox').find('ymax').text)
x1 = max(0, x1)
y1 = max(0, y1)
x2 = min(im_w - 1, x2)
y2 = min(im_h - 1, y2)
if x2 > x1 and y2 > y1:
gt_bbox[i, :] = [x1, y1, x2, y2]
gt_class[i, 0] = cname2cid[cname]
gt_score[i, 0] = 1.
difficult[i, 0] = _difficult
i += 1
else:
logger.warning(
'Found an invalid bbox in annotations: xml_file: {}'
', x1: {}, y1: {}, x2: {}, y2: {}.'.format(
xml_file, x1, y1, x2, y2))
gt_bbox = gt_bbox[:i, :]
gt_class = gt_class[:i, :]
gt_score = gt_score[:i, :]
difficult = difficult[:i, :]
voc_rec = {
'im_file': img_file,
'im_id': im_id,
'h': im_h,
'w': im_w
} if 'image' in self.data_fields else {}
gt_rec = {
'gt_class': gt_class,
'gt_score': gt_score,
'gt_bbox': gt_bbox,
'difficult': difficult
}
for k, v in gt_rec.items():
if k in self.data_fields:
voc_rec[k] = v
if len(objs) == 0:
empty_records.append(voc_rec)
else:
records.append(voc_rec)
ct += 1
if self.sample_num > 0 and ct >= self.sample_num:
break
        assert ct > 0, 'no VOC record found in %s' % self.anno_path
logger.debug('{} samples in file {}'.format(ct, anno_path))
if self.allow_empty and len(empty_records) > 0:
empty_records = self._sample_empty(empty_records, len(records))
records += empty_records
self.roidbs, self.cname2cid = records, cname2cid
def get_label_list(self):
return os.path.join(self.dataset_dir, self.label_list)
def pascalvoc_label():
labels_map = {
'aeroplane': 0,
'bicycle': 1,
'bird': 2,
'boat': 3,
'bottle': 4,
'bus': 5,
'car': 6,
'cat': 7,
'chair': 8,
'cow': 9,
'diningtable': 10,
'dog': 11,
'horse': 12,
'motorbike': 13,
'person': 14,
'pottedplant': 15,
'sheep': 16,
'sofa': 17,
'train': 18,
'tvmonitor': 19
}
return labels_map
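# A minimal sketch of the label_list handling in VOCDataSet.parse_dataset():
# each line of the label file becomes one class id, in file order. The class
# names below are hypothetical. Not called anywhere.
def _demo_label_list_mapping():
    lines = ["speedlimit\n", "crosswalk\n", "trafficlight\n", "stop\n"]
    cname2cid = {}
    label_id = 0
    for line in lines:
        cname2cid[line.strip()] = label_id
        label_id += 1
    # {'speedlimit': 0, 'crosswalk': 1, 'trafficlight': 2, 'stop': 3}
    return cname2cid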
================================================
FILE: ppdet/data/source/widerface.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
import os
import numpy as np
from scipy.io import loadmat
from ppdet.core.workspace import register, serializable
from .dataset import DetDataset
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
@register
@serializable
class WIDERFaceDataSet(DetDataset):
"""
Load WiderFace records with 'anno_path'
Args:
dataset_dir (str): root directory for dataset.
image_dir (str): directory for images.
anno_path (str): WiderFace annotation data.
data_fields (list): key name of data dictionary, at least have 'image'.
sample_num (int): number of samples to load, -1 means all.
with_lmk (bool): whether to load face landmark keypoint labels.
"""
def __init__(self,
dataset_dir=None,
image_dir=None,
anno_path=None,
data_fields=['image'],
sample_num=-1,
with_lmk=False):
super(WIDERFaceDataSet, self).__init__(
dataset_dir=dataset_dir,
image_dir=image_dir,
anno_path=anno_path,
data_fields=data_fields,
sample_num=sample_num,
with_lmk=with_lmk)
self.anno_path = anno_path
self.sample_num = sample_num
self.roidbs = None
self.cname2cid = None
self.with_lmk = with_lmk
def parse_dataset(self):
anno_path = os.path.join(self.dataset_dir, self.anno_path)
image_dir = os.path.join(self.dataset_dir, self.image_dir)
txt_file = anno_path
records = []
ct = 0
file_lists = self._load_file_list(txt_file)
cname2cid = widerface_label()
for item in file_lists:
im_fname = item[0]
im_id = np.array([ct])
gt_bbox = np.zeros((len(item) - 1, 4), dtype=np.float32)
gt_class = np.zeros((len(item) - 1, 1), dtype=np.int32)
gt_lmk_labels = np.zeros((len(item) - 1, 10), dtype=np.float32)
lmk_ignore_flag = np.zeros((len(item) - 1, 1), dtype=np.int32)
for index_box in range(len(item)):
if index_box < 1:
continue
gt_bbox[index_box - 1] = item[index_box][0]
if self.with_lmk:
gt_lmk_labels[index_box - 1] = item[index_box][1]
lmk_ignore_flag[index_box - 1] = item[index_box][2]
im_fname = os.path.join(image_dir,
im_fname) if image_dir else im_fname
widerface_rec = {
'im_file': im_fname,
'im_id': im_id,
} if 'image' in self.data_fields else {}
gt_rec = {
'gt_bbox': gt_bbox,
'gt_class': gt_class,
}
for k, v in gt_rec.items():
if k in self.data_fields:
widerface_rec[k] = v
if self.with_lmk:
widerface_rec['gt_keypoint'] = gt_lmk_labels
widerface_rec['keypoint_ignore'] = lmk_ignore_flag
if len(item) != 0:
records.append(widerface_rec)
ct += 1
if self.sample_num > 0 and ct >= self.sample_num:
break
        assert len(records) > 0, 'no WIDER FACE record found in %s' % anno_path
logger.debug('{} samples in file {}'.format(ct, anno_path))
self.roidbs, self.cname2cid = records, cname2cid
def _load_file_list(self, input_txt):
with open(input_txt, 'r') as f_dir:
lines_input_txt = f_dir.readlines()
file_dict = {}
num_class = 0
exts = ['jpg', 'jpeg', 'png', 'bmp']
exts += [ext.upper() for ext in exts]
for i in range(len(lines_input_txt)):
line_txt = lines_input_txt[i].strip('\n\t\r')
split_str = line_txt.split(' ')
if len(split_str) == 1:
img_file_name = os.path.split(split_str[0])[1]
split_txt = img_file_name.split('.')
if len(split_txt) < 2:
continue
elif split_txt[-1] in exts:
if i != 0:
num_class += 1
file_dict[num_class] = [line_txt]
else:
if len(line_txt) <= 6:
continue
result_boxs = []
xmin = float(split_str[0])
ymin = float(split_str[1])
w = float(split_str[2])
h = float(split_str[3])
# Filter out wrong labels
if w < 0 or h < 0:
logger.warning('Illegal box with w: {}, h: {} in '
'img: {}, and it will be ignored'.format(
w, h, file_dict[num_class][0]))
continue
xmin = max(0, xmin)
ymin = max(0, ymin)
xmax = xmin + w
ymax = ymin + h
gt_bbox = [xmin, ymin, xmax, ymax]
result_boxs.append(gt_bbox)
if self.with_lmk:
                    assert len(split_str) > 18, 'When `with_lmk=True`, the ' \
                        'number of fields per annotation line should ' \
                        'exceed 18.'
lmk0_x = float(split_str[5])
lmk0_y = float(split_str[6])
lmk1_x = float(split_str[8])
lmk1_y = float(split_str[9])
lmk2_x = float(split_str[11])
lmk2_y = float(split_str[12])
lmk3_x = float(split_str[14])
lmk3_y = float(split_str[15])
lmk4_x = float(split_str[17])
lmk4_y = float(split_str[18])
lmk_ignore_flag = 0 if lmk0_x == -1 else 1
gt_lmk_label = [
lmk0_x, lmk0_y, lmk1_x, lmk1_y, lmk2_x, lmk2_y, lmk3_x,
lmk3_y, lmk4_x, lmk4_y
]
result_boxs.append(gt_lmk_label)
result_boxs.append(lmk_ignore_flag)
file_dict[num_class].append(result_boxs)
return list(file_dict.values())
def widerface_label():
labels_map = {'face': 0}
return labels_map
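# A minimal sketch of the box parsing in WIDERFaceDataSet._load_file_list():
# an annotation line stores "xmin ymin w h ...", which is converted to
# [xmin, ymin, xmax, ymax] after clipping negative corners. The sample line
# is hypothetical. Not called anywhere.
def _demo_widerface_box_line():
    line = "78 221 35 50 0 0 0 0 0 0"
    split_str = line.split(' ')
    xmin = max(0, float(split_str[0]))
    ymin = max(0, float(split_str[1]))
    w = float(split_str[2])
    h = float(split_str[3])
    return [xmin, ymin, xmin + w, ymin + h]  # [78., 221., 113., 271.]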
@register
@serializable
class WIDERFaceValDataset(WIDERFaceDataSet):
def __init__(self,
dataset_dir=None,
image_dir=None,
anno_path=None,
gt_mat_path=None,
data_fields=['image'],
sample_num=-1,
with_lmk=False):
super().__init__(
dataset_dir=dataset_dir,
image_dir=image_dir,
anno_path=anno_path,
data_fields=data_fields,
sample_num=sample_num,
with_lmk=with_lmk)
self.gt_mat_path = gt_mat_path
self.val_mat = os.path.join(self.dataset_dir, self.gt_mat_path, 'wider_face_val.mat')
self.hard_mat_path = os.path.join(self.dataset_dir, self.gt_mat_path, 'wider_hard_val.mat')
self.medium_mat_path = os.path.join(self.dataset_dir, self.gt_mat_path, 'wider_medium_val.mat')
self.easy_mat_path = os.path.join(self.dataset_dir, self.gt_mat_path, 'wider_easy_val.mat')
assert os.path.exists(self.val_mat), f'{self.val_mat} not exist'
assert os.path.exists(self.hard_mat_path), f'{self.hard_mat_path} not exist'
assert os.path.exists(self.medium_mat_path), f'{self.medium_mat_path} not exist'
assert os.path.exists(self.easy_mat_path), f'{self.easy_mat_path} not exist'
def parse_dataset(self):
super().parse_dataset()
        box_list, file_list, event_list, hard_info_list, medium_info_list, \
            easy_info_list = self.get_gt_infos()
setting_infos = [easy_info_list, medium_info_list, hard_info_list]
settings = ['easy', 'medium', 'hard']
info_by_name = defaultdict(dict)
for setting_id in range(3):
info_list = setting_infos[setting_id]
setting = settings[setting_id]
for i in range(len(event_list)):
                img_list = file_list[i][0]
gt_box_list = box_list[i][0]
sub_info_list = info_list[i][0]
for j in range(len(img_list)):
img_name = str(img_list[j][0][0])
gt_boxes = gt_box_list[j][0].astype(np.float32)
info_by_name[img_name]['gt_ori_bbox'] = gt_boxes
keep_index = sub_info_list[j][0]
ignore = np.zeros(gt_boxes.shape[0])
if len(keep_index) != 0:
                        ignore[keep_index - 1] = 1
info_by_name[img_name][f'gt_{setting}_ignore'] = ignore
for roidb in self.roidbs:
img_file = roidb['im_file'].split('/')[-1]
img_name = ".".join(img_file.split(".")[:-1])
roidb.update(info_by_name[img_name])
def get_gt_infos(self):
""" gt dir: (wider_face_val.mat, wider_easy_val.mat, wider_medium_val.mat, wider_hard_val.mat)"""
val_mat = loadmat(self.val_mat)
hard_mat = loadmat(self.hard_mat_path)
medium_mat = loadmat(self.medium_mat_path)
easy_mat = loadmat(self.easy_mat_path)
box_list = val_mat['face_bbx_list']
file_list = val_mat['file_list']
event_list = val_mat['event_list']
hard_info_list = hard_mat['gt_list']
medium_info_list = medium_mat['gt_list']
easy_info_list = easy_mat['gt_list']
return box_list, file_list, event_list, hard_info_list, medium_info_list, easy_info_list
================================================
FILE: ppdet/data/transform/__init__.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import operators
from . import batch_operators
from . import keypoint_operators
from . import mot_operators
from . import rotated_operators
from . import keypoints_3d_operators
from . import culane_operators
from .operators import *
from .batch_operators import *
from .keypoint_operators import *
from .mot_operators import *
from .rotated_operators import *
from .keypoints_3d_operators import *
from .culane_operators import *
__all__ = []
__all__ += registered_ops
__all__ += keypoint_operators.__all__
__all__ += mot_operators.__all__
__all__ += culane_operators.__all__
================================================
FILE: ppdet/data/transform/atss_assigner.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The code is based on:
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/atss_assigner.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
"""Calculate overlap between two set of bboxes.
If ``is_aligned `` is ``False``, then calculate the overlaps between each
bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
pair of bboxes1 and bboxes2.
Args:
bboxes1 (Tensor): shape (B, m, 4) in format or empty.
bboxes2 (Tensor): shape (B, n, 4) in format or empty.
B indicates the batch dim, in shape (B1, B2, ..., Bn).
If ``is_aligned `` is ``True``, then m and n must be equal.
mode (str): "iou" (intersection over union) or "iof" (intersection over
foreground).
is_aligned (bool, optional): If True, then m and n must be equal.
Default False.
eps (float, optional): A value added to the denominator for numerical
stability. Default 1e-6.
Returns:
Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,)
"""
assert mode in ['iou', 'iof', 'giou', 'diou'], 'Unsupported mode {}'.format(
mode)
    # Either the boxes are empty or the last dimension of the boxes is 4
assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)
# Batch dim must be the same
# Batch dim: (B1, B2, ... Bn)
assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
batch_shape = bboxes1.shape[:-2]
rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0
cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0
if is_aligned:
assert rows == cols
if rows * cols == 0:
if is_aligned:
return np.random.random(batch_shape + (rows, ))
else:
return np.random.random(batch_shape + (rows, cols))
area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
bboxes1[..., 3] - bboxes1[..., 1])
area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
bboxes2[..., 3] - bboxes2[..., 1])
if is_aligned:
lt = np.maximum(bboxes1[..., :2], bboxes2[..., :2]) # [B, rows, 2]
rb = np.minimum(bboxes1[..., 2:], bboxes2[..., 2:]) # [B, rows, 2]
wh = (rb - lt).clip(min=0) # [B, rows, 2]
overlap = wh[..., 0] * wh[..., 1]
if mode in ['iou', 'giou']:
union = area1 + area2 - overlap
else:
union = area1
if mode == 'giou':
enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2])
enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:])
if mode == 'diou':
enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2])
enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:])
b1_x1, b1_y1 = bboxes1[..., 0], bboxes1[..., 1]
b1_x2, b1_y2 = bboxes1[..., 2], bboxes1[..., 3]
b2_x1, b2_y1 = bboxes2[..., 0], bboxes2[..., 1]
b2_x2, b2_y2 = bboxes2[..., 2], bboxes2[..., 3]
else:
lt = np.maximum(bboxes1[..., :, None, :2],
bboxes2[..., None, :, :2]) # [B, rows, cols, 2]
rb = np.minimum(bboxes1[..., :, None, 2:],
bboxes2[..., None, :, 2:]) # [B, rows, cols, 2]
wh = (rb - lt).clip(min=0) # [B, rows, cols, 2]
overlap = wh[..., 0] * wh[..., 1]
if mode in ['iou', 'giou']:
union = area1[..., None] + area2[..., None, :] - overlap
else:
union = area1[..., None]
if mode == 'giou':
enclosed_lt = np.minimum(bboxes1[..., :, None, :2],
bboxes2[..., None, :, :2])
enclosed_rb = np.maximum(bboxes1[..., :, None, 2:],
bboxes2[..., None, :, 2:])
if mode == 'diou':
enclosed_lt = np.minimum(bboxes1[..., :, None, :2],
bboxes2[..., None, :, :2])
enclosed_rb = np.maximum(bboxes1[..., :, None, 2:],
bboxes2[..., None, :, 2:])
b1_x1, b1_y1 = bboxes1[..., :, None, 0], bboxes1[..., :, None, 1]
b1_x2, b1_y2 = bboxes1[..., :, None, 2], bboxes1[..., :, None, 3]
b2_x1, b2_y1 = bboxes2[..., None, :, 0], bboxes2[..., None, :, 1]
b2_x2, b2_y2 = bboxes2[..., None, :, 2], bboxes2[..., None, :, 3]
eps = np.array([eps])
union = np.maximum(union, eps)
ious = overlap / union
if mode in ['iou', 'iof']:
return ious
# calculate gious
if mode in ['giou']:
enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
enclose_area = np.maximum(enclose_area, eps)
gious = ious - (enclose_area - union) / enclose_area
return gious
if mode in ['diou']:
left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4
right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4
rho2 = left + right
enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
enclose_c = enclose_wh[..., 0]**2 + enclose_wh[..., 1]**2
enclose_c = np.maximum(enclose_c, eps)
dious = ious - rho2 / enclose_c
return dious
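# A minimal sketch of calling bbox_overlaps() on hand-made xyxy boxes.
# Not called anywhere.
def _demo_bbox_overlaps():
    bboxes1 = np.array([[0., 0., 10., 10.]])
    bboxes2 = np.array([[0., 0., 10., 10.], [5., 5., 15., 15.]])
    ious = bbox_overlaps(bboxes1, bboxes2)  # shape (1, 2)
    # ious[0, 0] == 1.0 (identical boxes)
    # ious[0, 1] == 25 / 175 ~= 0.143 (5x5 overlap, union 100 + 100 - 25)
    return ious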
def topk_(input, k, axis=1, largest=True):
x = -input if largest else input
if axis == 0:
row_index = np.arange(input.shape[1 - axis])
if k == x.shape[0]: # argpartition requires index < len(input)
topk_index = np.argpartition(x, k - 1, axis=axis)[0:k, :]
else:
topk_index = np.argpartition(x, k, axis=axis)[0:k, :]
        topk_data = x[topk_index, row_index]
        topk_index_sort = np.argsort(topk_data, axis=axis)
        topk_data_sort = topk_data[topk_index_sort, row_index]
        # undo the negation applied above so returned values match the input
        topk_data_sort = -topk_data_sort if largest else topk_data_sort
        topk_index_sort = topk_index[0:k, :][topk_index_sort, row_index]
else:
column_index = np.arange(x.shape[1 - axis])[:, None]
topk_index = np.argpartition(x, k, axis=axis)[:, 0:k]
topk_data = x[column_index, topk_index]
topk_data = -topk_data if largest else topk_data
topk_index_sort = np.argsort(topk_data, axis=axis)
topk_data_sort = topk_data[column_index, topk_index_sort]
topk_index_sort = topk_index[:, 0:k][column_index, topk_index_sort]
return topk_data_sort, topk_index_sort
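# A minimal sketch of topk_() as ATSSAssigner uses it: axis=0 and
# largest=False pick, for each column (gt), the k rows (anchors) with the
# smallest values, sorted ascending. Not called anywhere.
def _demo_topk_smallest():
    distances = np.array([[4., 1.],
                          [2., 3.],
                          [9., 0.],
                          [1., 7.]])
    values, indices = topk_(distances, 2, axis=0, largest=False)
    # values  -> [[1., 0.], [2., 1.]]  (two smallest per column)
    # indices -> [[3, 2], [1, 0]]      (their row indices)
    return values, indices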
class ATSSAssigner(object):
"""Assign a corresponding gt bbox or background to each bbox.
Each proposals will be assigned with `0` or a positive integer
indicating the ground truth index.
- 0: negative sample, no assigned gt
- positive integer: positive sample, index (1-based) of assigned gt
Args:
topk (float): number of bbox selected in each level
"""
def __init__(self, topk=9):
self.topk = topk
def __call__(self,
bboxes,
num_level_bboxes,
gt_bboxes,
gt_bboxes_ignore=None,
gt_labels=None):
"""Assign gt to bboxes.
The assignment is done in following steps
1. compute iou between all bbox (bbox of all pyramid levels) and gt
2. compute center distance between all bbox and gt
3. on each pyramid level, for each gt, select k bbox whose center
are closest to the gt center, so we total select k*l bbox as
candidates for each gt
4. get corresponding iou for the these candidates, and compute the
mean and std, set mean + std as the iou threshold
5. select these candidates whose iou are greater than or equal to
the threshold as postive
6. limit the positive sample's center in gt
Args:
bboxes (np.array): Bounding boxes to be assigned, shape(n, 4).
num_level_bboxes (List): num of bboxes in each level
gt_bboxes (np.array): Groundtruth boxes, shape (k, 4).
gt_bboxes_ignore (np.array, optional): Ground truth bboxes that are
labelled as `ignored`, e.g., crowd boxes in COCO.
gt_labels (np.array, optional): Label of gt_bboxes, shape (k, ).
"""
bboxes = bboxes[:, :4]
num_gt, num_bboxes = gt_bboxes.shape[0], bboxes.shape[0]
# assign 0 by default
assigned_gt_inds = np.zeros((num_bboxes, ), dtype=np.int64)
if num_gt == 0 or num_bboxes == 0:
# No ground truth or boxes, return empty assignment
max_overlaps = np.zeros((num_bboxes, ))
if num_gt == 0:
# No truth, assign everything to background
assigned_gt_inds[:] = 0
if not np.any(gt_labels):
assigned_labels = None
else:
assigned_labels = -np.ones((num_bboxes, ), dtype=np.int64)
return assigned_gt_inds, max_overlaps
# compute iou between all bbox and gt
overlaps = bbox_overlaps(bboxes, gt_bboxes)
# compute center distance between all bbox and gt
gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0
gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0
gt_points = np.stack((gt_cx, gt_cy), axis=1)
bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0
bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0
bboxes_points = np.stack((bboxes_cx, bboxes_cy), axis=1)
distances = np.sqrt(
np.power((bboxes_points[:, None, :] - gt_points[None, :, :]), 2)
.sum(-1))
# Selecting candidates based on the center distance
candidate_idxs = []
start_idx = 0
for bboxes_per_level in num_level_bboxes:
# on each pyramid level, for each gt,
# select k bbox whose center are closest to the gt center
end_idx = start_idx + bboxes_per_level
distances_per_level = distances[start_idx:end_idx, :]
selectable_k = min(self.topk, bboxes_per_level)
_, topk_idxs_per_level = topk_(
distances_per_level, selectable_k, axis=0, largest=False)
candidate_idxs.append(topk_idxs_per_level + start_idx)
start_idx = end_idx
candidate_idxs = np.concatenate(candidate_idxs, axis=0)
        # get the corresponding iou for these candidates, and compute the
        # mean and std, set mean + std as the iou threshold
candidate_overlaps = overlaps[candidate_idxs, np.arange(num_gt)]
overlaps_mean_per_gt = candidate_overlaps.mean(0)
overlaps_std_per_gt = candidate_overlaps.std(0)
overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt
is_pos = candidate_overlaps >= overlaps_thr_per_gt[None, :]
# limit the positive sample's center in gt
for gt_idx in range(num_gt):
candidate_idxs[:, gt_idx] += gt_idx * num_bboxes
ep_bboxes_cx = np.broadcast_to(
bboxes_cx.reshape(1, -1), [num_gt, num_bboxes]).reshape(-1)
ep_bboxes_cy = np.broadcast_to(
bboxes_cy.reshape(1, -1), [num_gt, num_bboxes]).reshape(-1)
candidate_idxs = candidate_idxs.reshape(-1)
# calculate the left, top, right, bottom distance between positive
# bbox center and gt side
l_ = ep_bboxes_cx[candidate_idxs].reshape(-1, num_gt) - gt_bboxes[:, 0]
t_ = ep_bboxes_cy[candidate_idxs].reshape(-1, num_gt) - gt_bboxes[:, 1]
r_ = gt_bboxes[:, 2] - ep_bboxes_cx[candidate_idxs].reshape(-1, num_gt)
b_ = gt_bboxes[:, 3] - ep_bboxes_cy[candidate_idxs].reshape(-1, num_gt)
is_in_gts = np.stack([l_, t_, r_, b_], axis=1).min(axis=1) > 0.01
is_pos = is_pos & is_in_gts
# if an anchor box is assigned to multiple gts,
# the one with the highest IoU will be selected.
overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1)
index = candidate_idxs.reshape(-1)[is_pos.reshape(-1)]
overlaps_inf[index] = overlaps.T.reshape(-1)[index]
overlaps_inf = overlaps_inf.reshape(num_gt, -1).T
max_overlaps = overlaps_inf.max(axis=1)
argmax_overlaps = overlaps_inf.argmax(axis=1)
assigned_gt_inds[max_overlaps !=
-np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1
return assigned_gt_inds, max_overlaps
def get_vlr_region(self,
bboxes,
num_level_bboxes,
gt_bboxes,
gt_bboxes_ignore=None,
gt_labels=None):
"""get vlr region for ld distillation.
Args:
bboxes (np.array): Bounding boxes to be assigned, shape(n, 4).
num_level_bboxes (List): num of bboxes in each level
gt_bboxes (np.array): Groundtruth boxes, shape (k, 4).
gt_bboxes_ignore (np.array, optional): Ground truth bboxes that are
labelled as `ignored`, e.g., crowd boxes in COCO.
gt_labels (np.array, optional): Label of gt_bboxes, shape (k, ).
"""
bboxes = bboxes[:, :4]
num_gt, num_bboxes = gt_bboxes.shape[0], bboxes.shape[0]
# compute iou between all bbox and gt
overlaps = bbox_overlaps(bboxes, gt_bboxes)
# compute diou between all bbox and gt
diou = bbox_overlaps(bboxes, gt_bboxes, mode='diou')
# assign 0 by default
assigned_gt_inds = np.zeros((num_bboxes, ), dtype=np.int64)
vlr_region_iou = (assigned_gt_inds + 0).astype(np.float32)
if num_gt == 0 or num_bboxes == 0:
# No ground truth or boxes, return empty assignment
max_overlaps = np.zeros((num_bboxes, ))
if num_gt == 0:
# No truth, assign everything to background
assigned_gt_inds[:] = 0
if not np.any(gt_labels):
assigned_labels = None
else:
assigned_labels = -np.ones((num_bboxes, ), dtype=np.int64)
return assigned_gt_inds, max_overlaps
# compute center distance between all bbox and gt
gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0
gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0
gt_points = np.stack((gt_cx, gt_cy), axis=1)
bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0
bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0
bboxes_points = np.stack((bboxes_cx, bboxes_cy), axis=1)
distances = np.sqrt(
np.power((bboxes_points[:, None, :] - gt_points[None, :, :]), 2)
.sum(-1))
# Selecting candidates based on the center distance
candidate_idxs = []
candidate_idxs_t = []
start_idx = 0
for bboxes_per_level in num_level_bboxes:
# on each pyramid level, for each gt,
# select k bbox whose center are closest to the gt center
end_idx = start_idx + bboxes_per_level
distances_per_level = distances[start_idx:end_idx, :]
selectable_t = min(self.topk, bboxes_per_level)
selectable_k = bboxes_per_level #k for all
_, topt_idxs_per_level = topk_(
distances_per_level, selectable_t, axis=0, largest=False)
_, topk_idxs_per_level = topk_(
distances_per_level, selectable_k, axis=0, largest=False)
candidate_idxs_t.append(topt_idxs_per_level + start_idx)
candidate_idxs.append(topk_idxs_per_level + start_idx)
start_idx = end_idx
candidate_idxs_t = np.concatenate(candidate_idxs_t, axis=0)
candidate_idxs = np.concatenate(candidate_idxs, axis=0)
        # get the corresponding iou for these candidates, and compute the
        # mean and std, set mean + std as the iou threshold
candidate_overlaps_t = overlaps[candidate_idxs_t, np.arange(num_gt)]
# compute tdiou
t_diou = diou[candidate_idxs, np.arange(num_gt)]
overlaps_mean_per_gt = candidate_overlaps_t.mean(0)
overlaps_std_per_gt = candidate_overlaps_t.std(
0, ddof=1) # NOTE: use Bessel correction
overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt
# compute region
is_pos = (t_diou < overlaps_thr_per_gt[None, :]) & (
t_diou >= 0.25 * overlaps_thr_per_gt[None, :])
# limit the positive sample's center in gt
for gt_idx in range(num_gt):
candidate_idxs[:, gt_idx] += gt_idx * num_bboxes
candidate_idxs = candidate_idxs.reshape(-1)
# if an anchor box is assigned to multiple gts,
# the one with the highest IoU will be selected.
overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1)
index = candidate_idxs.reshape(-1)[is_pos.reshape(-1)]
overlaps_inf[index] = overlaps.T.reshape(-1)[index]
overlaps_inf = overlaps_inf.reshape(num_gt, -1).T
max_overlaps = overlaps_inf.max(axis=1)
argmax_overlaps = overlaps_inf.argmax(axis=1)
overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1)
overlaps_inf = overlaps_inf.reshape(num_gt, -1).T
assigned_gt_inds[max_overlaps !=
-np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1
vlr_region_iou[max_overlaps !=
-np.inf] = max_overlaps[max_overlaps != -np.inf] + 0
return vlr_region_iou
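# A minimal sketch of running ATSSAssigner on a hand-made anchor set: two
# "pyramid levels" with 4 and 1 anchors and a single gt box. All numbers are
# hypothetical. Not called anywhere.
def _demo_atss_assigner():
    anchors = np.array([[0., 0., 8., 8.], [8., 0., 16., 8.],
                        [0., 8., 8., 16.], [8., 8., 16., 16.],
                        [0., 0., 16., 16.]])
    num_level_bboxes = [4, 1]  # anchors per level, summing to len(anchors)
    gt_bboxes = np.array([[2., 2., 10., 10.]])
    assigner = ATSSAssigner(topk=2)
    # assigned_gt_inds[i] is 0 for background, else the 1-based gt index
    assigned_gt_inds, max_overlaps = assigner(anchors, num_level_bboxes,
                                              gt_bboxes)
    return assigned_gt_inds, max_overlaps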
================================================
FILE: ppdet/data/transform/autoaugment_utils.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Reference:
# https://github.com/tensorflow/tpu/blob/master/models/official/detection/utils/autoaugment_utils.py
"""AutoAugment util file."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import inspect
import math
from PIL import Image, ImageEnhance
import numpy as np
import cv2
from copy import deepcopy
# This signifies the max integer that the controller RNN could predict for the
# augmentation scheme.
_MAX_LEVEL = 10.
# Represents an invalid bounding box that is used for checking for padding
# lists of bounding box coordinates for a few augmentation operations
_INVALID_BOX = [[-1.0, -1.0, -1.0, -1.0]]
def policy_v0():
"""Autoaugment policy that was used in AutoAugment Detection Paper."""
# Each tuple is an augmentation operation of the form
# (operation, probability, magnitude). Each element in policy is a
# sub-policy that will be applied sequentially on the image.
policy = [
[('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)],
[('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)],
[('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)],
[('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)],
[('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)],
]
return policy
def policy_v1():
"""Autoaugment policy that was used in AutoAugment Detection Paper."""
# Each tuple is an augmentation operation of the form
# (operation, probability, magnitude). Each element in policy is a
# sub-policy that will be applied sequentially on the image.
policy = [
[('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)],
[('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)],
[('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)],
[('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)],
[('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)],
[('Color', 0.0, 0), ('ShearX_Only_BBoxes', 0.8, 4)],
[('ShearY_Only_BBoxes', 0.8, 2), ('Flip_Only_BBoxes', 0.0, 10)],
[('Equalize', 0.6, 10), ('TranslateX_BBox', 0.2, 2)],
[('Color', 1.0, 10), ('TranslateY_Only_BBoxes', 0.4, 6)],
        [('Rotate_BBox', 0.8, 10), ('Contrast', 0.0, 10)],
[('Cutout', 0.2, 2), ('Brightness', 0.8, 10)],
[('Color', 1.0, 6), ('Equalize', 1.0, 2)],
[('Cutout_Only_BBoxes', 0.4, 6), ('TranslateY_Only_BBoxes', 0.8, 2)],
[('Color', 0.2, 8), ('Rotate_BBox', 0.8, 10)],
[('Sharpness', 0.4, 4), ('TranslateY_Only_BBoxes', 0.0, 4)],
[('Sharpness', 1.0, 4), ('SolarizeAdd', 0.4, 4)],
[('Rotate_BBox', 1.0, 8), ('Sharpness', 0.2, 8)],
[('ShearY_BBox', 0.6, 10), ('Equalize_Only_BBoxes', 0.6, 8)],
[('ShearX_BBox', 0.2, 6), ('TranslateY_Only_BBoxes', 0.2, 10)],
[('SolarizeAdd', 0.6, 8), ('Brightness', 0.8, 10)],
]
return policy
def policy_vtest():
"""Autoaugment test policy for debugging."""
# Each tuple is an augmentation operation of the form
# (operation, probability, magnitude). Each element in policy is a
# sub-policy that will be applied sequentially on the image.
policy = [[('TranslateX_BBox', 1.0, 4), ('Equalize', 1.0, 10)], ]
return policy
def policy_v2():
"""Additional policy that performs well on object detection."""
# Each tuple is an augmentation operation of the form
# (operation, probability, magnitude). Each element in policy is a
# sub-policy that will be applied sequentially on the image.
policy = [
[('Color', 0.0, 6), ('Cutout', 0.6, 8), ('Sharpness', 0.4, 8)],
[('Rotate_BBox', 0.4, 8), ('Sharpness', 0.4, 2),
('Rotate_BBox', 0.8, 10)],
[('TranslateY_BBox', 1.0, 8), ('AutoContrast', 0.8, 2)],
[('AutoContrast', 0.4, 6), ('ShearX_BBox', 0.8, 8),
('Brightness', 0.0, 10)],
[('SolarizeAdd', 0.2, 6), ('Contrast', 0.0, 10),
('AutoContrast', 0.6, 0)],
[('Cutout', 0.2, 0), ('Solarize', 0.8, 8), ('Color', 1.0, 4)],
[('TranslateY_BBox', 0.0, 4), ('Equalize', 0.6, 8),
('Solarize', 0.0, 10)],
[('TranslateY_BBox', 0.2, 2), ('ShearY_BBox', 0.8, 8),
('Rotate_BBox', 0.8, 8)],
[('Cutout', 0.8, 8), ('Brightness', 0.8, 8), ('Cutout', 0.2, 2)],
[('Color', 0.8, 4), ('TranslateY_BBox', 1.0, 6),
('Rotate_BBox', 0.6, 6)],
[('Rotate_BBox', 0.6, 10), ('BBox_Cutout', 1.0, 4), ('Cutout', 0.2, 8)],
[('Rotate_BBox', 0.0, 0), ('Equalize', 0.6, 6),
('ShearY_BBox', 0.6, 8)],
[('Brightness', 0.8, 8), ('AutoContrast', 0.4, 2),
('Brightness', 0.2, 2)],
[('TranslateY_BBox', 0.4, 8), ('Solarize', 0.4, 6),
('SolarizeAdd', 0.2, 10)],
[('Contrast', 1.0, 10), ('SolarizeAdd', 0.2, 8), ('Equalize', 0.2, 4)],
]
return policy
def policy_v3():
""""Additional policy that performs well on object detection."""
# Each tuple is an augmentation operation of the form
# (operation, probability, magnitude). Each element in policy is a
# sub-policy that will be applied sequentially on the image.
policy = [
[('Posterize', 0.8, 2), ('TranslateX_BBox', 1.0, 8)],
[('BBox_Cutout', 0.2, 10), ('Sharpness', 1.0, 8)],
[('Rotate_BBox', 0.6, 8), ('Rotate_BBox', 0.8, 10)],
[('Equalize', 0.8, 10), ('AutoContrast', 0.2, 10)],
[('SolarizeAdd', 0.2, 2), ('TranslateY_BBox', 0.2, 8)],
[('Sharpness', 0.0, 2), ('Color', 0.4, 8)],
[('Equalize', 1.0, 8), ('TranslateY_BBox', 1.0, 8)],
[('Posterize', 0.6, 2), ('Rotate_BBox', 0.0, 10)],
[('AutoContrast', 0.6, 0), ('Rotate_BBox', 1.0, 6)],
[('Equalize', 0.0, 4), ('Cutout', 0.8, 10)],
[('Brightness', 1.0, 2), ('TranslateY_BBox', 1.0, 6)],
[('Contrast', 0.0, 2), ('ShearY_BBox', 0.8, 0)],
[('AutoContrast', 0.8, 10), ('Contrast', 0.2, 10)],
[('Rotate_BBox', 1.0, 10), ('Cutout', 1.0, 10)],
[('SolarizeAdd', 0.8, 6), ('Equalize', 0.8, 8)],
]
return policy
def _equal(val1, val2, eps=1e-8):
return abs(val1 - val2) <= eps
def blend(image1, image2, factor):
"""Blend image1 and image2 using 'factor'.
Factor can be above 0.0. A value of 0.0 means only image1 is used.
A value of 1.0 means only image2 is used. A value between 0.0 and
1.0 means we linearly interpolate the pixel values between the two
images. A value greater than 1.0 "extrapolates" the difference
between the two pixel values, and we clip the results to values
between 0 and 255.
Args:
image1: An image Tensor of type uint8.
image2: An image Tensor of type uint8.
factor: A floating point value above 0.0.
Returns:
A blended image Tensor of type uint8.
"""
if factor == 0.0:
return image1
if factor == 1.0:
return image2
image1 = image1.astype(np.float32)
image2 = image2.astype(np.float32)
difference = image2 - image1
scaled = factor * difference
# Do addition in float.
temp = image1 + scaled
# Interpolate
if factor > 0.0 and factor < 1.0:
# Interpolation means we always stay within 0 and 255.
return temp.astype(np.uint8)
# Extrapolate:
#
# We need to clip and then cast.
return np.clip(temp, a_min=0, a_max=255).astype(np.uint8)
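# A minimal sketch of blend(): a factor in (0, 1) interpolates between the
# two images, a factor above 1 extrapolates and is clipped to [0, 255].
# Not called anywhere.
def _demo_blend():
    dark = np.zeros((2, 2, 3), dtype=np.uint8)
    light = np.full((2, 2, 3), 200, dtype=np.uint8)
    mid = blend(dark, light, 0.5)   # every pixel becomes 100
    over = blend(dark, light, 1.5)  # 0 + 1.5 * 200 = 300 -> clipped to 255
    return mid, over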
def cutout(image, pad_size, replace=0):
"""Apply cutout (https://arxiv.org/abs/1708.04552) to image.
This operation applies a (2*pad_size x 2*pad_size) mask of zeros to
a random location within `img`. The pixel values filled in will be of the
    value `replace`. The location where the mask will be applied is randomly
chosen uniformly over the whole image.
Args:
image: An image Tensor of type uint8.
pad_size: Specifies how big the zero mask that will be generated is that
is applied to the image. The mask will be of size
(2*pad_size x 2*pad_size).
replace: What pixel value to fill in the image in the area that has
the cutout mask applied to it.
Returns:
An image Tensor that is of type uint8.
Example:
        img = cv2.imread("/home/vis/gry/train/img_data/test.jpg")
new_img = cutout(img, pad_size=50, replace=0)
"""
image_height, image_width = image.shape[0], image.shape[1]
cutout_center_height = np.random.randint(low=0, high=image_height)
cutout_center_width = np.random.randint(low=0, high=image_width)
lower_pad = np.maximum(0, cutout_center_height - pad_size)
upper_pad = np.maximum(0, image_height - cutout_center_height - pad_size)
left_pad = np.maximum(0, cutout_center_width - pad_size)
right_pad = np.maximum(0, image_width - cutout_center_width - pad_size)
cutout_shape = [
image_height - (lower_pad + upper_pad),
image_width - (left_pad + right_pad)
]
padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]]
mask = np.pad(np.zeros(
cutout_shape, dtype=image.dtype),
padding_dims,
'constant',
constant_values=1)
mask = np.expand_dims(mask, -1)
mask = np.tile(mask, [1, 1, 3])
image = np.where(
np.equal(mask, 0),
np.ones_like(
image, dtype=image.dtype) * replace,
image)
return image.astype(np.uint8)
def solarize(image, threshold=128):
# For each pixel in the image, select the pixel
# if the value is less than the threshold.
# Otherwise, subtract 255 from the pixel.
return np.where(image < threshold, image, 255 - image)
def solarize_add(image, addition=0, threshold=128):
# For each pixel in the image less than threshold
# we add 'addition' amount to it and then clip the
# pixel value to be between 0 and 255. The value
# of 'addition' is between -128 and 128.
added_image = image.astype(np.int64) + addition
added_image = np.clip(added_image, a_min=0, a_max=255).astype(np.uint8)
return np.where(image < threshold, added_image, image)
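# A minimal sketch of solarize() and solarize_add(): pixels below the
# threshold are kept (or brightened), pixels at or above it are inverted
# (or left unchanged). Not called anywhere.
def _demo_solarize():
    img = np.array([[10, 200]], dtype=np.uint8)
    inv = solarize(img, threshold=128)           # [[10, 55]]  (255 - 200)
    brightened = solarize_add(img, addition=30)  # [[40, 200]] (10 + 30)
    return inv, brightened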
def color(image, factor):
"""use cv2 to deal"""
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
degenerate = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
return blend(degenerate, image, factor)
# refer to https://github.com/4uiiurz1/pytorch-auto-augment/blob/024b2eac4140c38df8342f09998e307234cafc80/auto_augment.py#L197
def contrast(img, factor):
img = ImageEnhance.Contrast(Image.fromarray(img)).enhance(factor)
return np.array(img)
def brightness(image, factor):
"""Equivalent of PIL Brightness."""
degenerate = np.zeros_like(image)
return blend(degenerate, image, factor)
def posterize(image, bits):
"""Equivalent of PIL Posterize."""
shift = 8 - bits
return np.left_shift(np.right_shift(image, shift), shift)
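# A minimal sketch of posterize(): the right/left shift pair keeps only the
# top `bits` bits of each channel. Not called anywhere.
def _demo_posterize():
    img = np.array([[200, 77]], dtype=np.uint8)
    out = posterize(img, bits=2)  # keep the 2 most-significant bits
    # 200 (0b11001000) -> 192 (0b11000000); 77 (0b01001101) -> 64
    return out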
def rotate(image, degrees, replace):
"""Rotates the image by degrees either clockwise or counterclockwise.
Args:
image: An image Tensor of type uint8.
degrees: Float, a scalar angle in degrees to rotate all images by. If
degrees is positive the image will be rotated clockwise otherwise it will
be rotated counterclockwise.
replace: A one or three value 1D tensor to fill empty pixels caused by
the rotate operation.
Returns:
The rotated version of image.
"""
image = wrap(image)
image = Image.fromarray(image)
image = image.rotate(degrees)
image = np.array(image, dtype=np.uint8)
return unwrap(image, replace)
def random_shift_bbox(image,
bbox,
pixel_scaling,
replace,
new_min_bbox_coords=None):
"""Move the bbox and the image content to a slightly new random location.
Args:
image: 3D uint8 Tensor.
bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
of type float that represents the normalized coordinates between 0 and 1.
The potential values for the new min corner of the bbox will be between
[old_min - pixel_scaling * bbox_height/2,
       old_min + pixel_scaling * bbox_height/2].
pixel_scaling: A float between 0 and 1 that specifies the pixel range
that the new bbox location will be sampled from.
replace: A one or three value 1D tensor to fill empty pixels.
new_min_bbox_coords: If not None, then this is a tuple that specifies the
(min_y, min_x) coordinates of the new bbox. Normally this is randomly
specified, but this allows it to be manually set. The coordinates are
the absolute coordinates between 0 and image height/width and are int32.
Returns:
The new image that will have the shifted bbox location in it along with
the new bbox that contains the new coordinates.
"""
# Obtains image height and width and create helper clip functions.
image_height, image_width = image.shape[0], image.shape[1]
image_height = float(image_height)
image_width = float(image_width)
def clip_y(val):
return np.clip(val, a_min=0, a_max=image_height - 1).astype(np.int32)
def clip_x(val):
return np.clip(val, a_min=0, a_max=image_width - 1).astype(np.int32)
# Convert bbox to pixel coordinates.
min_y = int(image_height * bbox[0])
min_x = int(image_width * bbox[1])
max_y = clip_y(image_height * bbox[2])
max_x = clip_x(image_width * bbox[3])
bbox_height, bbox_width = (max_y - min_y + 1, max_x - min_x + 1)
image_height = int(image_height)
image_width = int(image_width)
# Select the new min/max bbox ranges that are used for sampling the
# new min x/y coordinates of the shifted bbox.
minval_y = clip_y(min_y - np.int32(pixel_scaling * float(bbox_height) /
2.0))
maxval_y = clip_y(min_y + np.int32(pixel_scaling * float(bbox_height) /
2.0))
minval_x = clip_x(min_x - np.int32(pixel_scaling * float(bbox_width) / 2.0))
maxval_x = clip_x(min_x + np.int32(pixel_scaling * float(bbox_width) / 2.0))
# Sample and calculate the new unclipped min/max coordinates of the new bbox.
if new_min_bbox_coords is None:
unclipped_new_min_y = np.random.randint(
low=minval_y, high=maxval_y, dtype=np.int32)
unclipped_new_min_x = np.random.randint(
low=minval_x, high=maxval_x, dtype=np.int32)
else:
unclipped_new_min_y, unclipped_new_min_x = (
clip_y(new_min_bbox_coords[0]), clip_x(new_min_bbox_coords[1]))
unclipped_new_max_y = unclipped_new_min_y + bbox_height - 1
unclipped_new_max_x = unclipped_new_min_x + bbox_width - 1
# Determine if any of the new bbox was shifted outside the current image.
# This is used for determining if any of the original bbox content should be
# discarded.
new_min_y, new_min_x, new_max_y, new_max_x = (
clip_y(unclipped_new_min_y), clip_x(unclipped_new_min_x),
clip_y(unclipped_new_max_y), clip_x(unclipped_new_max_x))
shifted_min_y = (new_min_y - unclipped_new_min_y) + min_y
shifted_max_y = max_y - (unclipped_new_max_y - new_max_y)
shifted_min_x = (new_min_x - unclipped_new_min_x) + min_x
shifted_max_x = max_x - (unclipped_new_max_x - new_max_x)
# Create the new bbox tensor by converting pixel integer values to floats.
new_bbox = np.stack([
float(new_min_y) / float(image_height), float(new_min_x) /
float(image_width), float(new_max_y) / float(image_height),
float(new_max_x) / float(image_width)
])
# Copy the contents in the bbox and fill the old bbox location
# with gray (128).
bbox_content = image[shifted_min_y:shifted_max_y + 1, shifted_min_x:
shifted_max_x + 1, :]
def mask_and_add_image(min_y_, min_x_, max_y_, max_x_, mask, content_tensor,
image_):
"""Applies mask to bbox region in image then adds content_tensor to it."""
mask = np.pad(mask, [[min_y_, (image_height - 1) - max_y_],
[min_x_, (image_width - 1) - max_x_], [0, 0]],
'constant',
constant_values=1)
content_tensor = np.pad(content_tensor,
[[min_y_, (image_height - 1) - max_y_],
[min_x_, (image_width - 1) - max_x_], [0, 0]],
'constant',
constant_values=0)
return image_ * mask + content_tensor
# Zero out original bbox location.
mask = np.zeros_like(image)[min_y:max_y + 1, min_x:max_x + 1, :]
grey_tensor = np.zeros_like(mask) + replace[0]
image = mask_and_add_image(min_y, min_x, max_y, max_x, mask, grey_tensor,
image)
# Fill in bbox content to new bbox location.
mask = np.zeros_like(bbox_content)
image = mask_and_add_image(new_min_y, new_min_x, new_max_y, new_max_x, mask,
bbox_content, image)
return image.astype(np.uint8), new_bbox
def _clip_bbox(min_y, min_x, max_y, max_x):
"""Clip bounding box coordinates between 0 and 1.
Args:
min_y: Normalized bbox coordinate of type float between 0 and 1.
min_x: Normalized bbox coordinate of type float between 0 and 1.
max_y: Normalized bbox coordinate of type float between 0 and 1.
max_x: Normalized bbox coordinate of type float between 0 and 1.
Returns:
Clipped coordinate values between 0 and 1.
"""
min_y = np.clip(min_y, a_min=0, a_max=1.0)
min_x = np.clip(min_x, a_min=0, a_max=1.0)
max_y = np.clip(max_y, a_min=0, a_max=1.0)
max_x = np.clip(max_x, a_min=0, a_max=1.0)
return min_y, min_x, max_y, max_x
def _check_bbox_area(min_y, min_x, max_y, max_x, delta=0.05):
"""Adjusts bbox coordinates to make sure the area is > 0.
Args:
min_y: Normalized bbox coordinate of type float between 0 and 1.
min_x: Normalized bbox coordinate of type float between 0 and 1.
max_y: Normalized bbox coordinate of type float between 0 and 1.
max_x: Normalized bbox coordinate of type float between 0 and 1.
delta: Float, this is used to create a gap of size 2 * delta between
bbox min/max coordinates that are the same on the boundary.
This prevents the bbox from having an area of zero.
Returns:
Tuple of new bbox coordinates between 0 and 1 that will now have a
guaranteed area > 0.
"""
height = max_y - min_y
width = max_x - min_x
def _adjust_bbox_boundaries(min_coord, max_coord):
# Make sure max is never 0 and min is never 1.
max_coord = np.maximum(max_coord, 0.0 + delta)
min_coord = np.minimum(min_coord, 1.0 - delta)
return min_coord, max_coord
if _equal(height, 0):
min_y, max_y = _adjust_bbox_boundaries(min_y, max_y)
if _equal(width, 0):
min_x, max_x = _adjust_bbox_boundaries(min_x, max_x)
return min_y, min_x, max_y, max_x
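# Illustrative sketch (hypothetical helper, defined but never called): how
# _clip_bbox and _check_bbox_area compose on a degenerate box.
def _demo_check_bbox_area():
    # Clipping a box that pokes above the image collapses it to zero height.
    min_y, min_x, max_y, max_x = _clip_bbox(-0.2, 0.1, 0.0, 0.4)
    # _check_bbox_area then opens a delta-sized gap; with the default
    # delta=0.05 this yields (0.0, 0.1, 0.05, 0.4), so the area is > 0.
    return _check_bbox_area(min_y, min_x, max_y, max_x)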
def _scale_bbox_only_op_probability(prob):
"""Reduce the probability of the bbox-only operation.
Probability is reduced so that we do not distort the content of too many
bounding boxes that are close to each other. The value of 3.0 was a
hyperparameter chosen when designing the autoaugment algorithm and was found
empirically to work well.
Args:
prob: Float that is the probability of applying the bbox-only operation.
Returns:
Reduced probability.
"""
return prob / 3.0
def _apply_bbox_augmentation(image, bbox, augmentation_func, *args):
"""Applies augmentation_func to the subsection of image indicated by bbox.
Args:
image: 3D uint8 Tensor.
bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
of type float that represents the normalized coordinates between 0 and 1.
augmentation_func: Augmentation function that will be applied to the
subsection of image.
*args: Additional parameters that will be passed into augmentation_func
when it is called.
Returns:
A modified version of image, where the bbox location in the image will
have `augmentation_func` applied to it.
"""
image_height = image.shape[0]
image_width = image.shape[1]
min_y = int(image_height * bbox[0])
min_x = int(image_width * bbox[1])
max_y = int(image_height * bbox[2])
max_x = int(image_width * bbox[3])
# Clip to be sure the max values do not fall out of range.
max_y = np.minimum(max_y, image_height - 1)
max_x = np.minimum(max_x, image_width - 1)
# Get the sub-tensor that is the image within the bounding box region.
bbox_content = image[min_y:max_y + 1, min_x:max_x + 1, :]
# Apply the augmentation function to the bbox portion of the image.
augmented_bbox_content = augmentation_func(bbox_content, *args)
# Pad the augmented_bbox_content and the mask to match the shape of original
# image.
augmented_bbox_content = np.pad(
    augmented_bbox_content, [[min_y, (image_height - 1) - max_y],
                             [min_x, (image_width - 1) - max_x], [0, 0]],
    'constant',
    constant_values=0)  # zeros outside the bbox so the add below is a no-op there
# Create a mask that will be used to zero out a part of the original image.
mask_tensor = np.zeros_like(bbox_content)
mask_tensor = np.pad(mask_tensor,
[[min_y, (image_height - 1) - max_y],
[min_x, (image_width - 1) - max_x], [0, 0]],
'constant',
constant_values=1)
# Replace the old bbox content with the new augmented content.
image = image * mask_tensor + augmented_bbox_content
return image.astype(np.uint8)
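# Illustrative usage sketch (hypothetical helper, never invoked): applying an
# augmentation such as equalize (defined further below in this module) only
# inside a single box.
def _demo_apply_bbox_augmentation():
    image = np.random.randint(0, 256, size=(32, 32, 3), dtype=np.uint8)
    bbox = np.array([0.25, 0.25, 0.75, 0.75], dtype=np.float32)
    # Only the sub-image covered by bbox is equalized; the pad-and-mask trick
    # above leaves the rest of the image untouched.
    return _apply_bbox_augmentation(image, bbox, equalize)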
def _concat_bbox(bbox, bboxes):
"""Helper function that concates bbox to bboxes along the first dimension."""
# Note if all elements in bboxes are -1 (_INVALID_BOX), then this means
# we discard bboxes and start the bboxes Tensor with the current bbox.
bboxes_sum_check = np.sum(bboxes)
bbox = np.expand_dims(bbox, 0)
# This check will be true when it is an _INVALID_BOX
if _equal(bboxes_sum_check, -4):
bboxes = bbox
else:
bboxes = np.concatenate([bboxes, bbox], 0)
return bboxes
def _apply_bbox_augmentation_wrapper(image, bbox, new_bboxes, prob,
augmentation_func, func_changes_bbox,
*args):
"""Applies _apply_bbox_augmentation with probability prob.
Args:
image: 3D uint8 Tensor.
bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
of type float that represents the normalized coordinates between 0 and 1.
new_bboxes: 2D Tensor that is a list of the bboxes in the image after they
have been altered by aug_func. These will only be changed when
func_changes_bbox is set to true. Each bbox has 4 elements
(min_y, min_x, max_y, max_x) of type float that are the normalized
bbox coordinates between 0 and 1.
prob: Float that is the probability of applying _apply_bbox_augmentation.
augmentation_func: Augmentation function that will be applied to the
subsection of image.
func_changes_bbox: Boolean. Does augmentation_func return bbox in addition
to image.
*args: Additional parameters that will be passed into augmentation_func
when it is called.
Returns:
A tuple. First element is a modified version of image, where the bbox
location in the image will have augmentation_func applied to it if it is
chosen to be called with probability `prob`. The second element is a
Tensor of Tensors of length 4 that will contain the altered bbox after
applying augmentation_func.
"""
should_apply_op = (np.random.rand() + prob >= 1)
if func_changes_bbox:
if should_apply_op:
augmented_image, bbox = augmentation_func(image, bbox, *args)
else:
augmented_image, bbox = (image, bbox)
else:
if should_apply_op:
augmented_image = _apply_bbox_augmentation(image, bbox,
augmentation_func, *args)
else:
augmented_image = image
new_bboxes = _concat_bbox(bbox, new_bboxes)
return augmented_image.astype(np.uint8), new_bboxes
def _apply_multi_bbox_augmentation(image, bboxes, prob, aug_func,
func_changes_bbox, *args):
"""Applies aug_func to the image for each bbox in bboxes.
Args:
image: 3D uint8 Tensor.
bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
has 4 elements (min_y, min_x, max_y, max_x) of type float.
prob: Float that is the probability of applying aug_func to a specific
bounding box within the image.
aug_func: Augmentation function that will be applied to the
subsections of image indicated by the bbox values in bboxes.
func_changes_bbox: Boolean. Does augmentation_func return bbox in addition
to image.
*args: Additional parameters that will be passed into augmentation_func
when it is called.
Returns:
A modified version of image, where each bbox location in the image will
have augmentation_func applied to it if it is chosen to be called with
probability prob independently across all bboxes. Also the final
bboxes are returned that will be unchanged if func_changes_bbox is set to
false and if true, the new altered ones will be returned.
"""
# Will keep track of the new altered bboxes after aug_func is repeatedly
# applied. The -1 values are a dummy value and this first Tensor will be
# removed upon appending the first real bbox.
new_bboxes = np.array(_INVALID_BOX)
# If the bboxes are empty, then just give it _INVALID_BOX. The result
# will be thrown away.
bboxes = np.array((_INVALID_BOX)) if bboxes.size == 0 else bboxes
assert bboxes.shape[1] == 4, "bboxes.shape[1] must be 4!!!!"
# pylint:disable=g-long-lambda
# pylint:disable=line-too-long
wrapped_aug_func = lambda _image, bbox, _new_bboxes: _apply_bbox_augmentation_wrapper(_image, bbox, _new_bboxes, prob, aug_func, func_changes_bbox, *args)
# pylint:enable=g-long-lambda
# pylint:enable=line-too-long
# Setup the while_loop.
num_bboxes = bboxes.shape[0] # We loop until we go over all bboxes.
idx = 0 # Counter for the while loop.
# Conditional function when to end the loop once we go over all bboxes
# images_and_bboxes contain (_image, _new_bboxes)
def cond(_idx, _images_and_bboxes):
return _idx < num_bboxes
# Shuffle the bboxes so that the augmentation order is not deterministic if
# we are not changing the bboxes with aug_func.
# if not func_changes_bbox:
# print(bboxes)
# loop_bboxes = np.take(bboxes,np.random.permutation(bboxes.shape[0]),axis=0)
# print(loop_bboxes)
# else:
# loop_bboxes = bboxes
# we cannot shuffle the bboxes here because they do not carry class information
loop_bboxes = deepcopy(bboxes)
# Main function of while_loop where we repeatedly apply augmentation on the
# bboxes in the image.
# pylint:disable=g-long-lambda
body = lambda _idx, _images_and_bboxes: [
_idx + 1, wrapped_aug_func(_images_and_bboxes[0],
loop_bboxes[_idx],
_images_and_bboxes[1])]
while (cond(idx, (image, new_bboxes))):
idx, (image, new_bboxes) = body(idx, (image, new_bboxes))
# Either return the altered bboxes or the original ones depending on if
# we altered them in anyway.
if func_changes_bbox:
final_bboxes = new_bboxes
else:
final_bboxes = bboxes
return image, final_bboxes
def _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, aug_func,
func_changes_bbox, *args):
"""Checks to be sure num bboxes > 0 before calling inner function."""
num_bboxes = len(bboxes)
new_image = deepcopy(image)
new_bboxes = deepcopy(bboxes)
if num_bboxes != 0:
new_image, new_bboxes = _apply_multi_bbox_augmentation(
new_image, new_bboxes, prob, aug_func, func_changes_bbox, *args)
return new_image, new_bboxes
def rotate_only_bboxes(image, bboxes, prob, degrees, replace):
"""Apply rotate to each bbox in the image with probability prob."""
func_changes_bbox = False
prob = _scale_bbox_only_op_probability(prob)
return _apply_multi_bbox_augmentation_wrapper(
image, bboxes, prob, rotate, func_changes_bbox, degrees, replace)
def shear_x_only_bboxes(image, bboxes, prob, level, replace):
"""Apply shear_x to each bbox in the image with probability prob."""
func_changes_bbox = False
prob = _scale_bbox_only_op_probability(prob)
return _apply_multi_bbox_augmentation_wrapper(
image, bboxes, prob, shear_x, func_changes_bbox, level, replace)
def shear_y_only_bboxes(image, bboxes, prob, level, replace):
"""Apply shear_y to each bbox in the image with probability prob."""
func_changes_bbox = False
prob = _scale_bbox_only_op_probability(prob)
return _apply_multi_bbox_augmentation_wrapper(
image, bboxes, prob, shear_y, func_changes_bbox, level, replace)
def translate_x_only_bboxes(image, bboxes, prob, pixels, replace):
"""Apply translate_x to each bbox in the image with probability prob."""
func_changes_bbox = False
prob = _scale_bbox_only_op_probability(prob)
return _apply_multi_bbox_augmentation_wrapper(
image, bboxes, prob, translate_x, func_changes_bbox, pixels, replace)
def translate_y_only_bboxes(image, bboxes, prob, pixels, replace):
"""Apply translate_y to each bbox in the image with probability prob."""
func_changes_bbox = False
prob = _scale_bbox_only_op_probability(prob)
return _apply_multi_bbox_augmentation_wrapper(
image, bboxes, prob, translate_y, func_changes_bbox, pixels, replace)
def flip_only_bboxes(image, bboxes, prob):
"""Apply flip_lr to each bbox in the image with probability prob."""
func_changes_bbox = False
prob = _scale_bbox_only_op_probability(prob)
return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob,
np.fliplr, func_changes_bbox)
def solarize_only_bboxes(image, bboxes, prob, threshold):
"""Apply solarize to each bbox in the image with probability prob."""
func_changes_bbox = False
prob = _scale_bbox_only_op_probability(prob)
return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, solarize,
func_changes_bbox, threshold)
def equalize_only_bboxes(image, bboxes, prob):
"""Apply equalize to each bbox in the image with probability prob."""
func_changes_bbox = False
prob = _scale_bbox_only_op_probability(prob)
return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, equalize,
func_changes_bbox)
def cutout_only_bboxes(image, bboxes, prob, pad_size, replace):
"""Apply cutout to each bbox in the image with probability prob."""
func_changes_bbox = False
prob = _scale_bbox_only_op_probability(prob)
return _apply_multi_bbox_augmentation_wrapper(
image, bboxes, prob, cutout, func_changes_bbox, pad_size, replace)
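# The *_only_bboxes helpers above all share one calling convention; a minimal
# sketch (hypothetical helper, never called at import time):
def _demo_only_bboxes_ops():
    image = np.random.randint(0, 256, size=(32, 32, 3), dtype=np.uint8)
    bboxes = np.array([[0.2, 0.2, 0.6, 0.6]], dtype=np.float32)
    # prob is divided by 3 internally by _scale_bbox_only_op_probability, so
    # each box here is equalized with probability 0.3.
    return equalize_only_bboxes(image, bboxes, prob=0.9)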
def _rotate_bbox(bbox, image_height, image_width, degrees):
"""Rotates the bbox coordinated by degrees.
Args:
bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
of type float that represents the normalized coordinates between 0 and 1.
image_height: Int, height of the image.
image_width: Int, width of the image.
degrees: Float, a scalar angle in degrees to rotate all images by. If
degrees is positive the image will be rotated clockwise otherwise it will
be rotated counterclockwise.
Returns:
A tensor of the same shape as bbox, but now with the rotated coordinates.
"""
image_height, image_width = (float(image_height), float(image_width))
# Convert from degrees to radians.
degrees_to_radians = math.pi / 180.0
radians = degrees * degrees_to_radians
# Translate the bbox to the center of the image and turn the normalized 0-1
# coordinates to absolute pixel locations.
# Y coordinates are made negative as the y axis of images goes down with
# increasing pixel values, so we negate to make sure x axis and y axis points
# are in the traditionally positive direction.
min_y = -int(image_height * (bbox[0] - 0.5))
min_x = int(image_width * (bbox[1] - 0.5))
max_y = -int(image_height * (bbox[2] - 0.5))
max_x = int(image_width * (bbox[3] - 0.5))
coordinates = np.stack([[min_y, min_x], [min_y, max_x], [max_y, min_x],
[max_y, max_x]]).astype(np.float32)
# Rotate the coordinates according to the rotation matrix clockwise if
# radians is positive, else negative
rotation_matrix = np.stack([[math.cos(radians), math.sin(radians)],
[-math.sin(radians), math.cos(radians)]])
new_coords = np.matmul(rotation_matrix,
np.transpose(coordinates)).astype(np.int32)
# Find min/max values and convert them back to normalized 0-1 floats.
min_y = -(float(np.max(new_coords[0, :])) / image_height - 0.5)
min_x = float(np.min(new_coords[1, :])) / image_width + 0.5
max_y = -(float(np.min(new_coords[0, :])) / image_height - 0.5)
max_x = float(np.max(new_coords[1, :])) / image_width + 0.5
# Clip the bboxes to be sure they fall between [0, 1].
min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
return np.stack([min_y, min_x, max_y, max_x])
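# Worked sketch for _rotate_bbox (hypothetical helper, never called): a tall
# box rotated 90 degrees about the image center becomes a wide box.
def _demo_rotate_bbox():
    bbox = np.array([0.25, 0.4, 0.75, 0.6], dtype=np.float32)  # h=0.5, w=0.2
    return _rotate_bbox(bbox, image_height=100, image_width=100, degrees=90.)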
def rotate_with_bboxes(image, bboxes, degrees, replace):
# Rotate the image.
image = rotate(image, degrees, replace)
# Convert bbox coordinates to pixel values.
image_height, image_width = image.shape[:2]
# pylint:disable=g-long-lambda
wrapped_rotate_bbox = lambda bbox: _rotate_bbox(bbox, image_height, image_width, degrees)
# pylint:enable=g-long-lambda
new_bboxes = np.zeros_like(bboxes)
for idx in range(len(bboxes)):
new_bboxes[idx] = wrapped_rotate_bbox(bboxes[idx])
return image, new_bboxes
def translate_x(image, pixels, replace):
"""Equivalent of PIL Translate in X dimension."""
image = Image.fromarray(wrap(image))
image = image.transform(image.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0))
return unwrap(np.array(image), replace)
def translate_y(image, pixels, replace):
"""Equivalent of PIL Translate in Y dimension."""
image = Image.fromarray(wrap(image))
image = image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels))
return unwrap(np.array(image), replace)
def _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal):
"""Shifts the bbox coordinates by pixels.
Args:
bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
of type float that represents the normalized coordinates between 0 and 1.
image_height: Int, height of the image.
image_width: Int, width of the image.
pixels: An int. How many pixels to shift the bbox.
shift_horizontal: Boolean. If true then shift in X dimension else shift in
Y dimension.
Returns:
A tensor of the same shape as bbox, but now with the shifted coordinates.
"""
pixels = int(pixels)
# Convert bbox to integer pixel locations.
min_y = int(float(image_height) * bbox[0])
min_x = int(float(image_width) * bbox[1])
max_y = int(float(image_height) * bbox[2])
max_x = int(float(image_width) * bbox[3])
if shift_horizontal:
min_x = np.maximum(0, min_x - pixels)
max_x = np.minimum(image_width, max_x - pixels)
else:
min_y = np.maximum(0, min_y - pixels)
max_y = np.minimum(image_height, max_y - pixels)
# Convert bbox back to floats.
min_y = float(min_y) / float(image_height)
min_x = float(min_x) / float(image_width)
max_y = float(max_y) / float(image_height)
max_x = float(max_x) / float(image_width)
# Clip the bboxes to be sure they fall between [0, 1].
min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
return np.stack([min_y, min_x, max_y, max_x])
def translate_bbox(image, bboxes, pixels, replace, shift_horizontal):
"""Equivalent of PIL Translate in X/Y dimension that shifts image and bbox.
Args:
image: 3D uint8 Tensor.
bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
has 4 elements (min_y, min_x, max_y, max_x) of type float with values
between [0, 1].
pixels: An int. How many pixels to shift the image and bboxes
replace: A one or three value 1D tensor to fill empty pixels.
shift_horizontal: Boolean. If true then shift in X dimension else shift in
Y dimension.
Returns:
A tuple containing a 3D uint8 Tensor that will be the result of translating
image by pixels. The second element of the tuple is bboxes, where now
the coordinates will be shifted to reflect the shifted image.
"""
if shift_horizontal:
image = translate_x(image, pixels, replace)
else:
image = translate_y(image, pixels, replace)
# Convert bbox coordinates to pixel values.
image_height, image_width = image.shape[0], image.shape[1]
# pylint:disable=g-long-lambda
wrapped_shift_bbox = lambda bbox: _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal)
# pylint:enable=g-long-lambda
new_bboxes = deepcopy(bboxes)
num_bboxes = len(bboxes)
for idx in range(num_bboxes):
new_bboxes[idx] = wrapped_shift_bbox(bboxes[idx])
return image.astype(np.uint8), new_bboxes
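# Usage sketch for translate_bbox (hypothetical helper, never called): shift
# image content and boxes together along X, filling revealed pixels with gray.
def _demo_translate_bbox():
    image = np.full((64, 64, 3), 200, dtype=np.uint8)
    bboxes = np.array([[0.25, 0.25, 0.75, 0.75]], dtype=np.float32)
    return translate_bbox(image, bboxes, pixels=10,
                          replace=[128, 128, 128], shift_horizontal=True)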
def shear_x(image, level, replace):
"""Equivalent of PIL Shearing in X dimension."""
# Shear parallel to x axis is a projective transform
# with a matrix form of:
# [1 level
# 0 1].
image = Image.fromarray(wrap(image))
image = image.transform(image.size, Image.AFFINE, (1, level, 0, 0, 1, 0))
return unwrap(np.array(image), replace)
def shear_y(image, level, replace):
"""Equivalent of PIL Shearing in Y dimension."""
# Shear parallel to y axis is a projective transform
# with a matrix form of:
# [1 0
# level 1].
image = Image.fromarray(wrap(image))
image = image.transform(image.size, Image.AFFINE, (1, 0, 0, level, 1, 0))
return unwrap(np.array(image), replace)
def _shear_bbox(bbox, image_height, image_width, level, shear_horizontal):
"""Shifts the bbox according to how the image was sheared.
Args:
bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
of type float that represents the normalized coordinates between 0 and 1.
image_height: Int, height of the image.
image_width: Int, width of the image.
level: Float. How much to shear the image.
shear_horizontal: If true then shear in X dimension else shear in
the Y dimension.
Returns:
A tensor of the same shape as bbox, but now with the shifted coordinates.
"""
image_height, image_width = (float(image_height), float(image_width))
# Change bbox coordinates to be pixels.
min_y = int(image_height * bbox[0])
min_x = int(image_width * bbox[1])
max_y = int(image_height * bbox[2])
max_x = int(image_width * bbox[3])
coordinates = np.stack(
[[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]])
coordinates = coordinates.astype(np.float32)
# Shear the coordinates according to the translation matrix.
if shear_horizontal:
translation_matrix = np.stack([[1, 0], [-level, 1]])
else:
translation_matrix = np.stack([[1, -level], [0, 1]])
translation_matrix = translation_matrix.astype(np.float32)
new_coords = np.matmul(translation_matrix,
np.transpose(coordinates)).astype(np.int32)
# Find min/max values and convert them back to floats.
min_y = float(np.min(new_coords[0, :])) / image_height
min_x = float(np.min(new_coords[1, :])) / image_width
max_y = float(np.max(new_coords[0, :])) / image_height
max_x = float(np.max(new_coords[1, :])) / image_width
# Clip the bboxes to be sure they fall between [0, 1].
min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
return np.stack([min_y, min_x, max_y, max_x])
def shear_with_bboxes(image, bboxes, level, replace, shear_horizontal):
"""Applies Shear Transformation to the image and shifts the bboxes.
Args:
image: 3D uint8 Tensor.
bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
has 4 elements (min_y, min_x, max_y, max_x) of type float with values
between [0, 1].
level: Float. How much to shear the image. This value will be between
-0.3 to 0.3.
replace: A one or three value 1D tensor to fill empty pixels.
shear_horizontal: Boolean. If true then shear in X dimension else shear in
the Y dimension.
Returns:
A tuple containing a 3D uint8 Tensor that will be the result of shearing
image by level. The second element of the tuple is bboxes, where now
the coordinates will be shifted to reflect the sheared image.
"""
if shear_horizontal:
image = shear_x(image, level, replace)
else:
image = shear_y(image, level, replace)
# Convert bbox coordinates to pixel values.
image_height, image_width = image.shape[:2]
# pylint:disable=g-long-lambda
wrapped_shear_bbox = lambda bbox: _shear_bbox(bbox, image_height, image_width, level, shear_horizontal)
# pylint:enable=g-long-lambda
new_bboxes = deepcopy(bboxes)
num_bboxes = len(bboxes)
for idx in range(num_bboxes):
new_bboxes[idx] = wrapped_shear_bbox(bboxes[idx])
return image.astype(np.uint8), new_bboxes
def autocontrast(image):
"""Implements Autocontrast function from PIL.
Args:
image: A 3D uint8 tensor.
Returns:
The image after it has had autocontrast applied to it and will be of type
uint8.
"""
def scale_channel(image):
"""Scale the 2D image using the autocontrast rule."""
# A possibly cheaper version can be done using cumsum/unique_with_counts
# over the histogram values, rather than iterating over the entire image
# to compute mins and maxes.
lo = float(np.min(image))
hi = float(np.max(image))
# Scale the image, making the lowest value 0 and the highest value 255.
def scale_values(im):
scale = 255.0 / (hi - lo)
offset = -lo * scale
im = im.astype(np.float32) * scale + offset
im = np.clip(im, a_min=0, a_max=255.0)
return im.astype(np.uint8)
result = scale_values(image) if hi > lo else image
return result
# Assumes RGB for now. Scales each channel independently
# and then stacks the result.
s1 = scale_channel(image[:, :, 0])
s2 = scale_channel(image[:, :, 1])
s3 = scale_channel(image[:, :, 2])
image = np.stack([s1, s2, s3], 2)
return image
def sharpness(image, factor):
"""Implements Sharpness function from PIL."""
orig_image = image
image = image.astype(np.float32)
# Apply the PIL SMOOTH kernel via a 2D convolution.
kernel = np.array([[1, 1, 1], [1, 5, 1], [1, 1, 1]], dtype=np.float32) / 13.
result = cv2.filter2D(image, -1, kernel).astype(np.uint8)
# Blend the final result.
return blend(result, orig_image, factor)
def equalize(image):
"""Implements Equalize function from PIL using."""
def scale_channel(im, c):
"""Scale the data in the channel to implement equalize."""
im = im[:, :, c].astype(np.int32)
# Compute the histogram of the image channel.
histo, _ = np.histogram(im, range=[0, 255], bins=256)
# For the purposes of computing the step, filter out the nonzeros.
nonzero = np.where(np.not_equal(histo, 0))
nonzero_histo = np.reshape(np.take(histo, nonzero), [-1])
step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255
def build_lut(histo, step):
# Compute the cumulative sum, shifting by step // 2
# and then normalization by step.
lut = (np.cumsum(histo) + (step // 2)) // step
# Shift lut, prepending with 0.
lut = np.concatenate([[0], lut[:-1]], 0)
# Clip the counts to be in range. This is done
# in the C code for image.point.
return np.clip(lut, a_min=0, a_max=255).astype(np.uint8)
# If step is zero, return the original image. Otherwise, build
# lut from the full histogram and step and then index from it.
if step == 0:
result = im
else:
result = np.take(build_lut(histo, step), im)
return result.astype(np.uint8)
# Assumes RGB for now. Scales each channel independently
# and then stacks the result.
s1 = scale_channel(image, 0)
s2 = scale_channel(image, 1)
s3 = scale_channel(image, 2)
image = np.stack([s1, s2, s3], 2)
return image
def wrap(image):
"""Returns 'image' with an extra channel set to all 1s."""
shape = image.shape
extended_channel = 255 * np.ones([shape[0], shape[1], 1], image.dtype)
extended = np.concatenate([image, extended_channel], 2).astype(image.dtype)
return extended
def unwrap(image, replace):
"""Unwraps an image produced by wrap.
Where there is a 0 in the last channel for every spatial position,
the rest of the three channels in that spatial dimension are grayed
(set to 128). Operations like translate and shear on a wrapped
Tensor will leave 0s in empty locations. Some transformations look
at the intensity of values to do preprocessing, and we want these
empty pixels to assume the 'average' value, rather than pure black.
Args:
image: A 3D Image Tensor with 4 channels.
replace: A one or three value 1D tensor to fill empty pixels.
Returns:
image: A 3D image Tensor with 3 channels.
"""
image_shape = image.shape
# Flatten the spatial dimensions.
flattened_image = np.reshape(image, [-1, image_shape[2]])
# Find all pixels where the last channel is zero.
alpha_channel = flattened_image[:, 3]
replace = np.concatenate([replace, np.ones([1], image.dtype)], 0)
# Where they are zero, fill them in with 'replace'.
alpha_channel = np.reshape(alpha_channel, (-1, 1))
alpha_channel = np.tile(alpha_channel, reps=(1, flattened_image.shape[1]))
flattened_image = np.where(
np.equal(alpha_channel, 0),
np.ones_like(
flattened_image, dtype=image.dtype) * replace,
flattened_image)
image = np.reshape(flattened_image, image_shape)
image = image[:, :, :3]
return image.astype(np.uint8)
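# Sketch of the wrap -> transform -> unwrap pattern used by the geometric ops
# in this module (hypothetical helper, never called): wrap() appends an
# all-255 channel, a PIL transform leaves zeros where no source pixel lands,
# and unwrap() fills exactly those positions with `replace`.
def _demo_wrap_unwrap():
    image = np.zeros((8, 8, 3), dtype=np.uint8)
    wrapped = Image.fromarray(wrap(image))  # 4-channel image
    shifted = wrapped.transform(wrapped.size, Image.AFFINE,
                                (1, 0, 3, 0, 1, 0))
    return unwrap(np.array(shifted), replace=[128, 128, 128])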
def _cutout_inside_bbox(image, bbox, pad_fraction):
"""Generates cutout mask and the mean pixel value of the bbox.
First a location is randomly chosen within the image as the center where the
cutout mask will be applied. Note this can be towards the boundaries of the
image, so the full cutout mask may not be applied.
Args:
image: 3D uint8 Tensor.
bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
of type float that represents the normalized coordinates between 0 and 1.
pad_fraction: Float that specifies how large the cutout mask should be in
reference to the size of the original bbox. If pad_fraction is 0.25,
then the cutout mask will be of shape
(0.25 * bbox height, 0.25 * bbox width).
Returns:
A tuple. First element is a tensor of the same shape as image where each
element is either a 1 or 0 that is used to determine where the image
will have cutout applied. The second element is the mean of the pixels
in the image where the bbox is located. Mask values are in {0, 1}.
"""
image_height, image_width = image.shape[0], image.shape[1]
# Transform from shape [1, 4] to [4].
bbox = np.squeeze(bbox)
min_y = int(float(image_height) * bbox[0])
min_x = int(float(image_width) * bbox[1])
max_y = int(float(image_height) * bbox[2])
max_x = int(float(image_width) * bbox[3])
# Calculate the mean pixel values in the bounding box, which will be used
# to fill the cutout region.
mean = np.mean(image[min_y:max_y + 1, min_x:max_x + 1], axis=(0, 1))
# Cutout mask will be size pad_size_height * 2 by pad_size_width * 2 if the
# region lies entirely within the bbox.
box_height = max_y - min_y + 1
box_width = max_x - min_x + 1
pad_size_height = int(pad_fraction * (box_height / 2))
pad_size_width = int(pad_fraction * (box_width / 2))
# Sample the center location in the image where the zero mask will be applied.
cutout_center_height = np.random.randint(min_y, max_y + 1, dtype=np.int32)
cutout_center_width = np.random.randint(min_x, max_x + 1, dtype=np.int32)
lower_pad = np.maximum(0, cutout_center_height - pad_size_height)
upper_pad = np.maximum(
0, image_height - cutout_center_height - pad_size_height)
left_pad = np.maximum(0, cutout_center_width - pad_size_width)
right_pad = np.maximum(0,
image_width - cutout_center_width - pad_size_width)
cutout_shape = [
image_height - (lower_pad + upper_pad),
image_width - (left_pad + right_pad)
]
padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]]
mask = np.pad(np.zeros(
cutout_shape, dtype=image.dtype),
padding_dims,
'constant',
constant_values=1)
mask = np.expand_dims(mask, 2)
mask = np.tile(mask, [1, 1, 3])
return mask, mean
def bbox_cutout(image, bboxes, pad_fraction, replace_with_mean):
"""Applies cutout to the image according to bbox information.
This is a cutout variant that uses bbox information to make more informed
decisions on where to place the cutout mask.
Args:
image: 3D uint8 Tensor.
bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
has 4 elements (min_y, min_x, max_y, max_x) of type float with values
between [0, 1].
pad_fraction: Float that specifies how large the cutout mask should be in
reference to the size of the original bbox. If pad_fraction is 0.25,
then the cutout mask will be of shape
(0.25 * bbox height, 0.25 * bbox width).
replace_with_mean: Boolean that specifies what value should be filled in
where the cutout mask is applied. Since the incoming image will be of
uint8 and will not have had any mean normalization applied, by default
we set the value to be 128. If replace_with_mean is True then we find
the mean pixel values across the channel dimension and use those to fill
in where the cutout mask is applied.
Returns:
A tuple. First element is a tensor of the same shape as image that has
cutout applied to it. Second element is the bboxes that were passed in
that will be unchanged.
"""
def apply_bbox_cutout(image, bboxes, pad_fraction):
"""Applies cutout to a single bounding box within image."""
# Choose a single bounding box to apply cutout to.
random_index = np.random.randint(0, bboxes.shape[0], dtype=np.int32)
# Select the corresponding bbox and apply cutout.
chosen_bbox = np.take(bboxes, random_index, axis=0)
mask, mean = _cutout_inside_bbox(image, chosen_bbox, pad_fraction)
# When applying cutout we either set the pixel value to 128 or to the mean
# value inside the bbox.
replace = mean if replace_with_mean else [128] * 3
# Apply the cutout mask to the image. Where the mask is 0 we fill it with
# `replace`.
image = np.where(
np.equal(mask, 0),
np.ones_like(
image, dtype=image.dtype) * replace,
image).astype(image.dtype)
return image
# Check to see if there are boxes; if so, apply bbox cutout.
if len(bboxes) != 0:
image = apply_bbox_cutout(image, bboxes, pad_fraction)
return image, bboxes
NAME_TO_FUNC = {
'AutoContrast': autocontrast,
'Equalize': equalize,
'Posterize': posterize,
'Solarize': solarize,
'SolarizeAdd': solarize_add,
'Color': color,
'Contrast': contrast,
'Brightness': brightness,
'Sharpness': sharpness,
'Cutout': cutout,
'BBox_Cutout': bbox_cutout,
'Rotate_BBox': rotate_with_bboxes,
# pylint:disable=g-long-lambda
'TranslateX_BBox': lambda image, bboxes, pixels, replace: translate_bbox(
image, bboxes, pixels, replace, shift_horizontal=True),
'TranslateY_BBox': lambda image, bboxes, pixels, replace: translate_bbox(
image, bboxes, pixels, replace, shift_horizontal=False),
'ShearX_BBox': lambda image, bboxes, level, replace: shear_with_bboxes(
image, bboxes, level, replace, shear_horizontal=True),
'ShearY_BBox': lambda image, bboxes, level, replace: shear_with_bboxes(
image, bboxes, level, replace, shear_horizontal=False),
# pylint:enable=g-long-lambda
'Rotate_Only_BBoxes': rotate_only_bboxes,
'ShearX_Only_BBoxes': shear_x_only_bboxes,
'ShearY_Only_BBoxes': shear_y_only_bboxes,
'TranslateX_Only_BBoxes': translate_x_only_bboxes,
'TranslateY_Only_BBoxes': translate_y_only_bboxes,
'Flip_Only_BBoxes': flip_only_bboxes,
'Solarize_Only_BBoxes': solarize_only_bboxes,
'Equalize_Only_BBoxes': equalize_only_bboxes,
'Cutout_Only_BBoxes': cutout_only_bboxes,
}
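# Dispatch sketch (hypothetical helper, never called): each NAME_TO_FUNC
# entry takes the image (and, for the *_BBox ops, bboxes) first, followed by
# op-specific arguments, so a policy can be executed uniformly by name.
def _demo_name_to_func():
    image = np.zeros((32, 32, 3), dtype=np.uint8)
    bboxes = np.array([[0.2, 0.2, 0.8, 0.8]], dtype=np.float32)
    return NAME_TO_FUNC['TranslateX_BBox'](image, bboxes, 5, [128, 128, 128])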
def _randomly_negate_tensor(tensor):
"""With 50% prob turn the tensor negative."""
should_flip = np.floor(np.random.rand() + 0.5) >= 1
final_tensor = tensor if should_flip else -tensor
return final_tensor
def _rotate_level_to_arg(level):
level = (level / _MAX_LEVEL) * 30.
level = _randomly_negate_tensor(level)
return (level, )
def _shrink_level_to_arg(level):
"""Converts level to ratio by which we shrink the image content."""
if level == 0:
return (1.0, ) # if level is zero, do not shrink the image
# Maximum shrinking ratio is 2.9.
level = 2. / (_MAX_LEVEL / level) + 0.9
return (level, )
def _enhance_level_to_arg(level):
return ((level / _MAX_LEVEL) * 1.8 + 0.1, )
def _shear_level_to_arg(level):
level = (level / _MAX_LEVEL) * 0.3
# Flip level to negative with 50% chance.
level = _randomly_negate_tensor(level)
return (level, )
def _translate_level_to_arg(level, translate_const):
level = (level / _MAX_LEVEL) * float(translate_const)
# Flip level to negative with 50% chance.
level = _randomly_negate_tensor(level)
return (level, )
def _bbox_cutout_level_to_arg(level, hparams):
cutout_pad_fraction = (level /
_MAX_LEVEL) * 0.75 # hparams.cutout_max_pad_fraction
return (cutout_pad_fraction, False) # hparams.cutout_bbox_replace_with_mean
def level_to_arg(hparams):
return {
'AutoContrast': lambda level: (),
'Equalize': lambda level: (),
'Posterize': lambda level: (int((level / _MAX_LEVEL) * 4), ),
'Solarize': lambda level: (int((level / _MAX_LEVEL) * 256), ),
'SolarizeAdd': lambda level: (int((level / _MAX_LEVEL) * 110), ),
'Color': _enhance_level_to_arg,
'Contrast': _enhance_level_to_arg,
'Brightness': _enhance_level_to_arg,
'Sharpness': _enhance_level_to_arg,
'Cutout':
lambda level: (int((level / _MAX_LEVEL) * 100), ), # hparams.cutout_const=100
# pylint:disable=g-long-lambda
'BBox_Cutout': lambda level: _bbox_cutout_level_to_arg(level, hparams),
'TranslateX_BBox':
lambda level: _translate_level_to_arg(level, 250), # hparams.translate_const=250
'TranslateY_BBox':
lambda level: _translate_level_to_arg(level, 250), # hparams.translate_const=250
# pylint:enable=g-long-lambda
'ShearX_BBox': _shear_level_to_arg,
'ShearY_BBox': _shear_level_to_arg,
'Rotate_BBox': _rotate_level_to_arg,
'Rotate_Only_BBoxes': _rotate_level_to_arg,
'ShearX_Only_BBoxes': _shear_level_to_arg,
'ShearY_Only_BBoxes': _shear_level_to_arg,
# pylint:disable=g-long-lambda
'TranslateX_Only_BBoxes':
lambda level: _translate_level_to_arg(level, 120), # hparams.translate_bbox_const
'TranslateY_Only_BBoxes':
lambda level: _translate_level_to_arg(level, 120), # hparams.translate_bbox_const
# pylint:enable=g-long-lambda
'Flip_Only_BBoxes': lambda level: (),
'Solarize_Only_BBoxes':
lambda level: (int((level / _MAX_LEVEL) * 256), ),
'Equalize_Only_BBoxes': lambda level: (),
# pylint:disable=g-long-lambda
'Cutout_Only_BBoxes':
lambda level: (int((level / _MAX_LEVEL) * 50), ), # hparams.cutout_bbox_const
# pylint:enable=g-long-lambda
}
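# Illustration of the level-to-argument mapping (hypothetical helper, never
# called; assumes the module-level _MAX_LEVEL constant used above):
def _demo_level_to_arg():
    args_by_name = level_to_arg(hparams={})
    # At the maximum level, Rotate_BBox draws +/-30 degrees and Solarize gets
    # a threshold of 256.
    return (args_by_name['Rotate_BBox'](_MAX_LEVEL),
            args_by_name['Solarize'](_MAX_LEVEL))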
def bbox_wrapper(func):
"""Adds a bboxes function argument to func and returns unchanged bboxes."""
def wrapper(images, bboxes, *args, **kwargs):
return (func(images, *args, **kwargs), bboxes)
return wrapper
def _parse_policy_info(name, prob, level, replace_value, augmentation_hparams):
"""Return the function that corresponds to `name` and update `level` param."""
func = NAME_TO_FUNC[name]
args = level_to_arg(augmentation_hparams)[name](level)
# Check to see if prob is passed into function. This is used for operations
# where we alter bboxes independently.
# pytype:disable=wrong-arg-types
if 'prob' in inspect.getfullargspec(func)[0]:
args = tuple([prob] + list(args))
# pytype:enable=wrong-arg-types
# Add in replace arg if it is required for the function that is being called.
if 'replace' in inspect.getfullargspec(func)[0]:
# Make sure replace is the final argument
assert 'replace' == inspect.getfullargspec(func)[0][-1]
args = tuple(list(args) + [replace_value])
# Add bboxes as the second positional argument for the function if it does
# not already exist.
if 'bboxes' not in inspect.getfullargspec(func)[0]:
func = bbox_wrapper(func)
return (func, prob, args)
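# Sketch of how a single (name, prob, level) policy tuple is resolved
# (hypothetical helper, never called):
def _demo_parse_policy_info():
    func, prob, args = _parse_policy_info('TranslateX_BBox', 0.6, 4,
                                          [128, 128, 128], {})
    # TranslateX_BBox has no 'prob' parameter but does end in 'replace', so
    # args becomes (pixels, replace_value) and prob is applied by the caller.
    return func, prob, args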
def _apply_func_with_prob(func, image, args, prob, bboxes):
"""Apply `func` to image w/ `args` as input with probability `prob`."""
assert isinstance(args, tuple)
assert 'bboxes' == inspect.getfullargspec(func)[0][1]
# If prob is a function argument, then this randomness is being handled
# inside the function, so make sure it is always called.
if 'prob' in inspect.getfullargspec(func)[0]:
prob = 1.0
# Apply the function with probability `prob`.
should_apply_op = np.floor(np.random.rand() + prob) >= 1
if should_apply_op:
augmented_image, augmented_bboxes = func(image, bboxes, *args)
else:
augmented_image, augmented_bboxes = (image, bboxes)
return augmented_image, augmented_bboxes
def select_and_apply_random_policy(policies, image, bboxes):
"""Select a random policy from `policies` and apply it to `image`."""
policy_to_select = np.random.randint(0, len(policies), dtype=np.int32)
# policy_to_select = 6 # for test
for (i, policy) in enumerate(policies):
if i == policy_to_select:
image, bboxes = policy(image, bboxes)
return (image, bboxes)
def build_and_apply_nas_policy(policies, image, bboxes, augmentation_hparams):
"""Build a policy from the given policies passed in and apply to image.
Args:
policies: list of lists of tuples in the form `(func, prob, level)`, `func`
is a string name of the augmentation function, `prob` is the probability
of applying the `func` operation, `level` is the input argument for
`func`.
image: numpy array that the resulting policy will be applied to.
bboxes: numpy array of shape [N, 4] of normalized boxes
(min_y, min_x, max_y, max_x) with values in [0, 1].
augmentation_hparams: Hparams associated with the NAS learned policy.
Returns:
A version of image that now has data augmentation applied to it based on
the `policies` passed into the function. Additionally, returns bboxes if
a value for them is passed in that is not None
"""
replace_value = [128, 128, 128]
# func is the string name of the augmentation function, prob is the
# probability of applying the operation and level is the parameter
# associated with the function. tf_policies are functions that take in an
# image and return an augmented image.
tf_policies = []
for policy in policies:
tf_policy = []
# Link string name to the correct python function and make sure the correct
# argument is passed into that function.
for policy_info in policy:
policy_info = list(
policy_info) + [replace_value, augmentation_hparams]
tf_policy.append(_parse_policy_info(*policy_info))
# Now build the tf policy that will apply the augmentation procedure
# on image.
def make_final_policy(tf_policy_):
def final_policy(image_, bboxes_):
for func, prob, args in tf_policy_:
image_, bboxes_ = _apply_func_with_prob(func, image_, args,
prob, bboxes_)
return image_, bboxes_
return final_policy
tf_policies.append(make_final_policy(tf_policy))
augmented_images, augmented_bboxes = select_and_apply_random_policy(
tf_policies, image, bboxes)
# If no bounding boxes were specified, then just return the images.
return (augmented_images, augmented_bboxes)
# TODO(barretzoph): Add in ArXiv link once paper is out.
def distort_image_with_autoaugment(image, bboxes, augmentation_name):
"""Applies the AutoAugment policy to `image` and `bboxes`.
Args:
image: `Tensor` of shape [height, width, 3] representing an image.
bboxes: `Tensor` of shape [N, 4] representing ground truth boxes that are
normalized between [0, 1].
augmentation_name: The name of the AutoAugment policy to use. The available
options are `v0`, `v1`, `v2`, `v3` and `test`. `v0` is the policy used for
all of the results in the paper and was found to achieve the best results
on the COCO dataset. `v1`, `v2` and `v3` are additional good policies
found on the COCO dataset that have slight variation in what operations
were used during the search procedure along with how many operations are
applied in parallel to a single image (2 vs 3).
Returns:
A tuple containing the augmented versions of `image` and `bboxes`.
"""
available_policies = {
'v0': policy_v0,
'v1': policy_v1,
'v2': policy_v2,
'v3': policy_v3,
'test': policy_vtest
}
if augmentation_name not in available_policies:
raise ValueError('Invalid augmentation_name: {}'.format(
augmentation_name))
policy = available_policies[augmentation_name]()
augmentation_hparams = {}
return build_and_apply_nas_policy(policy, image, bboxes,
augmentation_hparams)
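# Hypothetical end-to-end usage of this module (never called at import time):
def _demo_distort_image_with_autoaugment():
    image = np.random.randint(0, 256, size=(64, 64, 3), dtype=np.uint8)
    bboxes = np.array([[0.1, 0.1, 0.9, 0.9]], dtype=np.float32)
    return distort_image_with_autoaugment(image, bboxes, 'test')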
================================================
FILE: ppdet/data/transform/batch_operators.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import typing
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
import cv2
import copy
import math
import numpy as np
from .operators import register_op, BaseOperator, Resize
from .op_helper import jaccard_overlap, gaussian2D, gaussian_radius, draw_umich_gaussian
from .atss_assigner import ATSSAssigner
from scipy import ndimage
from ppdet.modeling import bbox_utils
from ppdet.utils.logger import setup_logger
from ppdet.modeling.keypoint_utils import get_affine_transform, affine_transform
logger = setup_logger(__name__)
__all__ = [
'PadBatch', 'BatchRandomResize', 'Gt2YoloTarget', 'Gt2FCOSTarget',
'Gt2TTFTarget', 'Gt2Solov2Target', 'Gt2SparseTarget', 'PadMaskBatch',
'Gt2GFLTarget', 'Gt2CenterNetTarget', 'Gt2CenterTrackTarget', 'PadGT',
'PadRGT', 'BatchRandomResizeForSSOD'
]
@register_op
class PadBatch(BaseOperator):
"""
Pad a batch of samples so that image height and width are divisible by a stride.
The layout of each image should be 'CHW'.
Args:
pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure
height and width is divisible by `pad_to_stride`.
"""
def __init__(self, pad_to_stride=0):
super(PadBatch, self).__init__()
self.pad_to_stride = pad_to_stride
def __call__(self, samples, context=None):
"""
Args:
samples (list): a batch of samples, each of which is a dict.
"""
coarsest_stride = self.pad_to_stride
# multi scale input is nested list
if isinstance(samples,
typing.Sequence) and len(samples) > 0 and isinstance(
samples[0], typing.Sequence):
inner_samples = samples[0]
else:
inner_samples = samples
max_shape = np.array(
[data['image'].shape for data in inner_samples]).max(axis=0)
if coarsest_stride > 0:
max_shape[1] = int(
np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride)
max_shape[2] = int(
np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride)
for data in inner_samples:
im = data['image']
im_c, im_h, im_w = im.shape[:]
padding_im = np.zeros(
(im_c, max_shape[1], max_shape[2]), dtype=np.float32)
padding_im[:, :im_h, :im_w] = im
data['image'] = padding_im
if 'semantic' in data and data['semantic'] is not None:
semantic = data['semantic']
padding_sem = np.zeros(
(1, max_shape[1], max_shape[2]), dtype=np.float32)
padding_sem[:, :im_h, :im_w] = semantic
data['semantic'] = padding_sem
if 'gt_segm' in data and data['gt_segm'] is not None:
gt_segm = data['gt_segm']
padding_segm = np.zeros(
(gt_segm.shape[0], max_shape[1], max_shape[2]),
dtype=np.uint8)
padding_segm[:, :im_h, :im_w] = gt_segm
data['gt_segm'] = padding_segm
return samples
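# Usage sketch for PadBatch (hypothetical helper, never called): with
# pad_to_stride=32, both images below are zero-padded to (3, 800, 1344),
# since 800 is already a multiple of 32 and ceil(1333 / 32) * 32 == 1344.
def _demo_pad_batch():
    samples = [{'image': np.zeros((3, 800, 1333), dtype=np.float32)},
               {'image': np.zeros((3, 640, 900), dtype=np.float32)}]
    return PadBatch(pad_to_stride=32)(samples)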
@register_op
class BatchRandomResize(BaseOperator):
"""
Resize a batch of images to a randomly selected target size; the
interpolation method may also be randomly selected.
Args:
target_size (int, list, tuple): image target size; if random_size is True, it must be a list or tuple
keep_ratio (bool): whether to keep the aspect ratio, default True
interp (int): the interpolation method
random_size (bool): whether to randomly select a target size for the image
random_interp (bool): whether to randomly select an interpolation method
"""
def __init__(self,
target_size,
keep_ratio,
interp=cv2.INTER_NEAREST,
random_size=True,
random_interp=False):
super(BatchRandomResize, self).__init__()
self.keep_ratio = keep_ratio
self.interps = [
cv2.INTER_NEAREST,
cv2.INTER_LINEAR,
cv2.INTER_AREA,
cv2.INTER_CUBIC,
cv2.INTER_LANCZOS4,
]
self.interp = interp
assert isinstance(target_size, (
int, Sequence)), "target_size must be int, list or tuple"
if random_size and not isinstance(target_size, list):
raise TypeError(
"Type of target_size is invalid when random_size is True. Must be List, now is {}".
format(type(target_size)))
self.target_size = target_size
self.random_size = random_size
self.random_interp = random_interp
def __call__(self, samples, context=None):
if self.random_size:
index = np.random.choice(len(self.target_size))
target_size = self.target_size[index]
else:
target_size = self.target_size
if self.random_interp:
interp = np.random.choice(self.interps)
else:
interp = self.interp
resizer = Resize(target_size, keep_ratio=self.keep_ratio, interp=interp)
return resizer(samples, context=context)
@register_op
class Gt2YoloTarget(BaseOperator):
__shared__ = ['num_classes']
"""
Generate YOLOv3 targets from ground truth data; this operator is only used
in fine-grained YOLOv3 loss mode
"""
def __init__(self,
anchors,
anchor_masks,
downsample_ratios,
num_classes=80,
iou_thresh=1.):
super(Gt2YoloTarget, self).__init__()
self.anchors = anchors
self.anchor_masks = anchor_masks
self.downsample_ratios = downsample_ratios
self.num_classes = num_classes
self.iou_thresh = iou_thresh
def __call__(self, samples, context=None):
assert len(self.anchor_masks) == len(self.downsample_ratios), \
"anchor_masks', and 'downsample_ratios' should have same length."
h, w = samples[0]['image'].shape[1:3]
an_hw = np.array(self.anchors) / np.array([[w, h]])
for sample in samples:
gt_bbox = sample['gt_bbox']
gt_class = sample['gt_class']
if 'gt_score' not in sample:
sample['gt_score'] = np.ones(
(gt_bbox.shape[0], 1), dtype=np.float32)
gt_score = sample['gt_score']
for i, (
mask, downsample_ratio
) in enumerate(zip(self.anchor_masks, self.downsample_ratios)):
grid_h = int(h / downsample_ratio)
grid_w = int(w / downsample_ratio)
target = np.zeros(
(len(mask), 6 + self.num_classes, grid_h, grid_w),
dtype=np.float32)
for b in range(gt_bbox.shape[0]):
gx, gy, gw, gh = gt_bbox[b, :]
cls = gt_class[b]
score = gt_score[b]
if gw <= 0. or gh <= 0. or score <= 0.:
continue
# find best match anchor index
best_iou = 0.
best_idx = -1
for an_idx in range(an_hw.shape[0]):
iou = jaccard_overlap(
[0., 0., gw, gh],
[0., 0., an_hw[an_idx, 0], an_hw[an_idx, 1]])
if iou > best_iou:
best_iou = iou
best_idx = an_idx
gi = int(gx * grid_w)
gj = int(gy * grid_h)
# gt box should be regressed in this layer if the best matching
# anchor index is in this layer's anchor mask
if best_idx in mask:
best_n = mask.index(best_idx)
# x, y, w, h, scale
target[best_n, 0, gj, gi] = gx * grid_w - gi
target[best_n, 1, gj, gi] = gy * grid_h - gj
target[best_n, 2, gj, gi] = np.log(
gw * w / self.anchors[best_idx][0])
target[best_n, 3, gj, gi] = np.log(
gh * h / self.anchors[best_idx][1])
target[best_n, 4, gj, gi] = 2.0 - gw * gh
# objectness record gt_score
target[best_n, 5, gj, gi] = score
# classification
target[best_n, 6 + cls, gj, gi] = 1.
# For non-matched anchors, calculate the target if the iou
# between anchor and gt is larger than iou_thresh
if self.iou_thresh < 1:
for idx, mask_i in enumerate(mask):
if mask_i == best_idx: continue
iou = jaccard_overlap(
[0., 0., gw, gh],
[0., 0., an_hw[mask_i, 0], an_hw[mask_i, 1]])
if iou > self.iou_thresh and target[idx, 5, gj,
gi] == 0.:
# x, y, w, h, scale
target[idx, 0, gj, gi] = gx * grid_w - gi
target[idx, 1, gj, gi] = gy * grid_h - gj
target[idx, 2, gj, gi] = np.log(
gw * w / self.anchors[mask_i][0])
target[idx, 3, gj, gi] = np.log(
gh * h / self.anchors[mask_i][1])
target[idx, 4, gj, gi] = 2.0 - gw * gh
# objectness record gt_score
target[idx, 5, gj, gi] = score
# classification
target[idx, 6 + cls, gj, gi] = 1.
sample['target{}'.format(i)] = target
# remove useless gt_class and gt_score after target calculated
sample.pop('gt_class')
sample.pop('gt_score')
return samples
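# Target layout sketch (informal): for each FPN level i, sample['target{i}']
# has shape [len(anchor_mask), 6 + num_classes, grid_h, grid_w]; channels 0-3
# hold the x/y offsets and log-scale w/h regression targets, channel 4 the
# 2 - w*h scale weight, channel 5 the objectness score, and channels 6+ the
# one-hot class assignment.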
@register_op
class Gt2FCOSTarget(BaseOperator):
"""
Generate FCOS targets from ground truth data
"""
def __init__(self,
object_sizes_boundary,
center_sampling_radius,
downsample_ratios,
num_shift=0.5,
multiply_strides_reg_targets=False,
norm_reg_targets=True):
super(Gt2FCOSTarget, self).__init__()
self.center_sampling_radius = center_sampling_radius
self.downsample_ratios = downsample_ratios
self.INF = np.inf
self.object_sizes_boundary = [-1] + object_sizes_boundary + [self.INF]
object_sizes_of_interest = []
for i in range(len(self.object_sizes_boundary) - 1):
object_sizes_of_interest.append([
self.object_sizes_boundary[i], self.object_sizes_boundary[i + 1]
])
self.object_sizes_of_interest = object_sizes_of_interest
self.num_shift = num_shift
self.multiply_strides_reg_targets = multiply_strides_reg_targets
self.norm_reg_targets = norm_reg_targets
def _compute_points(self, w, h):
"""
compute the corresponding points in each feature map
:param w: image width
:param h: image height
:return: points from all feature maps
"""
locations = []
for stride in self.downsample_ratios:
shift_x = np.arange(0, w, stride).astype(np.float32)
shift_y = np.arange(0, h, stride).astype(np.float32)
shift_x, shift_y = np.meshgrid(shift_x, shift_y)
shift_x = shift_x.flatten()
shift_y = shift_y.flatten()
location = np.stack(
[shift_x, shift_y], axis=1) + stride * self.num_shift
locations.append(location)
num_points_each_level = [len(location) for location in locations]
locations = np.concatenate(locations, axis=0)
return locations, num_points_each_level
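# Worked example (informal): for an 8x8 input with stride 4 and the default
# num_shift=0.5, this level contributes points (2, 2), (6, 2), (2, 6) and
# (6, 6) -- the center of each stride cell.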
def _convert_xywh2xyxy(self, gt_bbox, w, h):
"""
convert the bounding box from style xywh to xyxy
:param gt_bbox: bounding boxes normalized into [0, 1]
:param w: image width
:param h: image height
:return: bounding boxes in xyxy style
"""
bboxes = gt_bbox.copy()
bboxes[:, [0, 2]] = bboxes[:, [0, 2]] * w
bboxes[:, [1, 3]] = bboxes[:, [1, 3]] * h
bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2]
bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3]
return bboxes
def _check_inside_boxes_limited(self, gt_bbox, xs, ys,
num_points_each_level):
"""
check whether each point is within the clipped (center-sampled) boxes
:param gt_bbox: bounding boxes
:param xs: horizontal coordinates of points
:param ys: vertical coordinates of points
:param num_points_each_level: number of points on each feature level
:return: mask of whether each point is within a gt box
"""
bboxes = np.reshape(
gt_bbox, newshape=[1, gt_bbox.shape[0], gt_bbox.shape[1]])
bboxes = np.tile(bboxes, reps=[xs.shape[0], 1, 1])
ct_x = (bboxes[:, :, 0] + bboxes[:, :, 2]) / 2
ct_y = (bboxes[:, :, 1] + bboxes[:, :, 3]) / 2
beg = 0
clipped_box = bboxes.copy()
for lvl, stride in enumerate(self.downsample_ratios):
end = beg + num_points_each_level[lvl]
stride_exp = self.center_sampling_radius * stride
clipped_box[beg:end, :, 0] = np.maximum(
bboxes[beg:end, :, 0], ct_x[beg:end, :] - stride_exp)
clipped_box[beg:end, :, 1] = np.maximum(
bboxes[beg:end, :, 1], ct_y[beg:end, :] - stride_exp)
clipped_box[beg:end, :, 2] = np.minimum(
bboxes[beg:end, :, 2], ct_x[beg:end, :] + stride_exp)
clipped_box[beg:end, :, 3] = np.minimum(
bboxes[beg:end, :, 3], ct_y[beg:end, :] + stride_exp)
beg = end
l_res = xs - clipped_box[:, :, 0]
r_res = clipped_box[:, :, 2] - xs
t_res = ys - clipped_box[:, :, 1]
b_res = clipped_box[:, :, 3] - ys
clipped_box_reg_targets = np.stack([l_res, t_res, r_res, b_res], axis=2)
inside_gt_box = np.min(clipped_box_reg_targets, axis=2) > 0
return inside_gt_box
def __call__(self, samples, context=None):
assert len(self.object_sizes_of_interest) == len(self.downsample_ratios), \
"object_sizes_of_interest', and 'downsample_ratios' should have same length."
for sample in samples:
im = sample['image']
bboxes = sample['gt_bbox']
gt_class = sample['gt_class']
# calculate the locations
h, w = im.shape[1:3]
points, num_points_each_level = self._compute_points(w, h)
object_scale_exp = []
for i, num_pts in enumerate(num_points_each_level):
object_scale_exp.append(
np.tile(
np.array([self.object_sizes_of_interest[i]]),
reps=[num_pts, 1]))
object_scale_exp = np.concatenate(object_scale_exp, axis=0)
gt_area = (bboxes[:, 2] - bboxes[:, 0]) * (
bboxes[:, 3] - bboxes[:, 1])
xs, ys = points[:, 0], points[:, 1]
xs = np.reshape(xs, newshape=[xs.shape[0], 1])
xs = np.tile(xs, reps=[1, bboxes.shape[0]])
ys = np.reshape(ys, newshape=[ys.shape[0], 1])
ys = np.tile(ys, reps=[1, bboxes.shape[0]])
l_res = xs - bboxes[:, 0]
r_res = bboxes[:, 2] - xs
t_res = ys - bboxes[:, 1]
b_res = bboxes[:, 3] - ys
reg_targets = np.stack([l_res, t_res, r_res, b_res], axis=2)
if self.center_sampling_radius > 0:
is_inside_box = self._check_inside_boxes_limited(
bboxes, xs, ys, num_points_each_level)
else:
is_inside_box = np.min(reg_targets, axis=2) > 0
# check if the targets are inside the corresponding level
max_reg_targets = np.max(reg_targets, axis=2)
lower_bound = np.tile(
np.expand_dims(
object_scale_exp[:, 0], axis=1),
reps=[1, max_reg_targets.shape[1]])
high_bound = np.tile(
np.expand_dims(
object_scale_exp[:, 1], axis=1),
reps=[1, max_reg_targets.shape[1]])
is_match_current_level = \
(max_reg_targets > lower_bound) & \
(max_reg_targets < high_bound)
points2gtarea = np.tile(
np.expand_dims(
gt_area, axis=0), reps=[xs.shape[0], 1])
points2gtarea[is_inside_box == 0] = self.INF
points2gtarea[is_match_current_level == 0] = self.INF
points2min_area = points2gtarea.min(axis=1)
points2min_area_ind = points2gtarea.argmin(axis=1)
labels = gt_class[points2min_area_ind] + 1
labels[points2min_area == self.INF] = 0
reg_targets = reg_targets[range(xs.shape[0]), points2min_area_ind]
ctn_targets = np.sqrt((reg_targets[:, [0, 2]].min(axis=1) / \
reg_targets[:, [0, 2]].max(axis=1)) * \
(reg_targets[:, [1, 3]].min(axis=1) / \
reg_targets[:, [1, 3]].max(axis=1))).astype(np.float32)
ctn_targets = np.reshape(
ctn_targets, newshape=[ctn_targets.shape[0], 1])
ctn_targets[labels <= 0] = 0
pos_ind = np.nonzero(labels != 0)
reg_targets_pos = reg_targets[pos_ind[0], :]
split_sections = []
beg = 0
for lvl in range(len(num_points_each_level)):
end = beg + num_points_each_level[lvl]
split_sections.append(end)
beg = end
labels_by_level = np.split(labels, split_sections, axis=0)
reg_targets_by_level = np.split(reg_targets, split_sections, axis=0)
ctn_targets_by_level = np.split(ctn_targets, split_sections, axis=0)
for lvl in range(len(self.downsample_ratios)):
grid_w = int(np.ceil(w / self.downsample_ratios[lvl]))
grid_h = int(np.ceil(h / self.downsample_ratios[lvl]))
if self.norm_reg_targets:
if self.multiply_strides_reg_targets:
sample['reg_target{}'.format(lvl)] = np.reshape(
reg_targets_by_level[lvl],
newshape=[grid_h, grid_w, 4])
else:
sample['reg_target{}'.format(lvl)] = \
np.reshape(
reg_targets_by_level[lvl] / \
self.downsample_ratios[lvl],
newshape=[grid_h, grid_w, 4])
else:
sample['reg_target{}'.format(lvl)] = np.reshape(
reg_targets_by_level[lvl],
newshape=[grid_h, grid_w, 4])
sample['labels{}'.format(lvl)] = np.reshape(
labels_by_level[lvl], newshape=[grid_h, grid_w, 1])
sample['centerness{}'.format(lvl)] = np.reshape(
ctn_targets_by_level[lvl], newshape=[grid_h, grid_w, 1])
sample.pop('is_crowd', None)
sample.pop('difficult', None)
sample.pop('gt_class', None)
sample.pop('gt_bbox', None)
return samples
@register_op
class Gt2GFLTarget(BaseOperator):
__shared__ = ['num_classes']
"""
Generate GFocal loss targets from ground truth data
"""
def __init__(self,
num_classes=80,
downsample_ratios=[8, 16, 32, 64, 128],
grid_cell_scale=4,
cell_offset=0,
compute_vlr_region=False):
super(Gt2GFLTarget, self).__init__()
self.num_classes = num_classes
self.downsample_ratios = downsample_ratios
self.grid_cell_scale = grid_cell_scale
self.cell_offset = cell_offset
self.compute_vlr_region = compute_vlr_region
self.assigner = ATSSAssigner()
def get_grid_cells(self, featmap_size, scale, stride, offset=0):
"""
Generate grid cells of a feature map for target assignment.
Args:
featmap_size: Size of a single level feature map.
scale: Grid cell scale.
stride: Down sample stride of the feature map.
offset: Offset of grid cells.
return:
Grid_cells xyxy position. Size should be [feat_w * feat_h, 4]
"""
cell_size = stride * scale
h, w = featmap_size
x_range = (np.arange(w, dtype=np.float32) + offset) * stride
y_range = (np.arange(h, dtype=np.float32) + offset) * stride
x, y = np.meshgrid(x_range, y_range)
y = y.flatten()
x = x.flatten()
grid_cells = np.stack(
[
x - 0.5 * cell_size, y - 0.5 * cell_size, x + 0.5 * cell_size,
y + 0.5 * cell_size
],
axis=-1)
return grid_cells
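# Worked example (informal): with stride=8, scale=4 and offset=0, cell_size
# is 32, so the grid cell at feature location (0, 0) spans [-16, -16, 16, 16]
# in input-image pixels.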
def get_sample(self, assign_gt_inds, gt_bboxes):
pos_inds = np.unique(np.nonzero(assign_gt_inds > 0)[0])
neg_inds = np.unique(np.nonzero(assign_gt_inds == 0)[0])
pos_assigned_gt_inds = assign_gt_inds[pos_inds] - 1
if gt_bboxes.size == 0:
# hack for index error case
assert pos_assigned_gt_inds.size == 0
pos_gt_bboxes = np.empty_like(gt_bboxes).reshape(-1, 4)
else:
if len(gt_bboxes.shape) < 2:
                # np.ndarray.resize modifies the array in place and returns
                # None; reshape is what is intended here
                gt_bboxes = gt_bboxes.reshape(-1, 4)
pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :]
return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds
def __call__(self, samples, context=None):
assert len(samples) > 0
batch_size = len(samples)
# get grid cells of image
h, w = samples[0]['image'].shape[1:3]
multi_level_grid_cells = []
for stride in self.downsample_ratios:
featmap_size = (int(math.ceil(h / stride)),
int(math.ceil(w / stride)))
multi_level_grid_cells.append(
self.get_grid_cells(featmap_size, self.grid_cell_scale, stride,
self.cell_offset))
mlvl_grid_cells_list = [
multi_level_grid_cells for i in range(batch_size)
]
# pixel cell number of multi-level feature maps
num_level_cells = [
grid_cells.shape[0] for grid_cells in mlvl_grid_cells_list[0]
]
num_level_cells_list = [num_level_cells] * batch_size
        # concat all level cells into a single array
for i in range(batch_size):
mlvl_grid_cells_list[i] = np.concatenate(mlvl_grid_cells_list[i])
# target assign on all images
for sample, grid_cells, num_level_cells in zip(
samples, mlvl_grid_cells_list, num_level_cells_list):
gt_bboxes = sample['gt_bbox']
gt_labels = sample['gt_class'].squeeze()
if gt_labels.size == 1:
gt_labels = np.array([gt_labels]).astype(np.int32)
gt_bboxes_ignore = None
assign_gt_inds, _ = self.assigner(grid_cells, num_level_cells,
gt_bboxes, gt_bboxes_ignore,
gt_labels)
if self.compute_vlr_region:
vlr_region = self.assigner.get_vlr_region(
grid_cells, num_level_cells, gt_bboxes, gt_bboxes_ignore,
gt_labels)
sample['vlr_regions'] = vlr_region
pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds = self.get_sample(
assign_gt_inds, gt_bboxes)
num_cells = grid_cells.shape[0]
bbox_targets = np.zeros_like(grid_cells)
bbox_weights = np.zeros_like(grid_cells)
labels = np.ones([num_cells], dtype=np.int64) * self.num_classes
label_weights = np.zeros([num_cells], dtype=np.float32)
if len(pos_inds) > 0:
pos_bbox_targets = pos_gt_bboxes
bbox_targets[pos_inds, :] = pos_bbox_targets
bbox_weights[pos_inds, :] = 1.0
if not np.any(gt_labels):
labels[pos_inds] = 0
else:
labels[pos_inds] = gt_labels[pos_assigned_gt_inds]
label_weights[pos_inds] = 1.0
if len(neg_inds) > 0:
label_weights[neg_inds] = 1.0
sample['grid_cells'] = grid_cells
sample['labels'] = labels
sample['label_weights'] = label_weights
sample['bbox_targets'] = bbox_targets
sample['pos_num'] = max(pos_inds.size, 1)
sample.pop('is_crowd', None)
sample.pop('difficult', None)
sample.pop('gt_class', None)
sample.pop('gt_bbox', None)
sample.pop('gt_score', None)
return samples
@register_op
class Gt2TTFTarget(BaseOperator):
    """
    Gt2TTFTarget
    Generate TTFNet targets from ground-truth data
    Args:
        num_classes(int): the number of classes.
        down_ratio(int): the down ratio from image to heatmap, 4 by default.
        alpha(float): the alpha parameter used to generate the gaussian target,
            0.54 by default.
    """
    __shared__ = ['num_classes']
def __init__(self, num_classes=80, down_ratio=4, alpha=0.54):
super(Gt2TTFTarget, self).__init__()
self.down_ratio = down_ratio
self.num_classes = num_classes
self.alpha = alpha
def __call__(self, samples, context=None):
output_size = samples[0]['image'].shape[1]
feat_size = output_size // self.down_ratio
for sample in samples:
heatmap = np.zeros(
(self.num_classes, feat_size, feat_size), dtype='float32')
box_target = np.ones(
(4, feat_size, feat_size), dtype='float32') * -1
reg_weight = np.zeros((1, feat_size, feat_size), dtype='float32')
gt_bbox = sample['gt_bbox']
gt_class = sample['gt_class']
bbox_w = gt_bbox[:, 2] - gt_bbox[:, 0] + 1
bbox_h = gt_bbox[:, 3] - gt_bbox[:, 1] + 1
area = bbox_w * bbox_h
boxes_areas_log = np.log(area)
boxes_ind = np.argsort(boxes_areas_log, axis=0)[::-1]
boxes_area_topk_log = boxes_areas_log[boxes_ind]
gt_bbox = gt_bbox[boxes_ind]
gt_class = gt_class[boxes_ind]
feat_gt_bbox = gt_bbox / self.down_ratio
feat_gt_bbox = np.clip(feat_gt_bbox, 0, feat_size - 1)
feat_hs, feat_ws = (feat_gt_bbox[:, 3] - feat_gt_bbox[:, 1],
feat_gt_bbox[:, 2] - feat_gt_bbox[:, 0])
ct_inds = np.stack(
[(gt_bbox[:, 0] + gt_bbox[:, 2]) / 2,
(gt_bbox[:, 1] + gt_bbox[:, 3]) / 2],
axis=1) / self.down_ratio
h_radiuses_alpha = (feat_hs / 2. * self.alpha).astype('int32')
w_radiuses_alpha = (feat_ws / 2. * self.alpha).astype('int32')
for k in range(len(gt_bbox)):
cls_id = gt_class[k]
fake_heatmap = np.zeros((feat_size, feat_size), dtype='float32')
self.draw_truncate_gaussian(fake_heatmap, ct_inds[k],
h_radiuses_alpha[k],
w_radiuses_alpha[k])
heatmap[cls_id] = np.maximum(heatmap[cls_id], fake_heatmap)
box_target_inds = fake_heatmap > 0
box_target[:, box_target_inds] = gt_bbox[k][:, None]
local_heatmap = fake_heatmap[box_target_inds]
ct_div = np.sum(local_heatmap)
local_heatmap *= boxes_area_topk_log[k]
reg_weight[0, box_target_inds] = local_heatmap / ct_div
sample['ttf_heatmap'] = heatmap
sample['ttf_box_target'] = box_target
sample['ttf_reg_weight'] = reg_weight
sample.pop('is_crowd', None)
sample.pop('difficult', None)
sample.pop('gt_class', None)
sample.pop('gt_bbox', None)
sample.pop('gt_score', None)
return samples
def draw_truncate_gaussian(self, heatmap, center, h_radius, w_radius):
h, w = 2 * h_radius + 1, 2 * w_radius + 1
sigma_x = w / 6
sigma_y = h / 6
gaussian = gaussian2D((h, w), sigma_x, sigma_y)
x, y = int(center[0]), int(center[1])
height, width = heatmap.shape[0:2]
left, right = min(x, w_radius), min(width - x, w_radius + 1)
top, bottom = min(y, h_radius), min(height - y, h_radius + 1)
masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
masked_gaussian = gaussian[h_radius - top:h_radius + bottom, w_radius -
left:w_radius + right]
if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0:
heatmap[y - top:y + bottom, x - left:x + right] = np.maximum(
masked_heatmap, masked_gaussian)
return heatmap
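# A hedged usage sketch for draw_truncate_gaussian above (assuming the usual
# op_helper gaussian2D kernel imported earlier in this file, which peaks at 1):
# drawing into an empty heatmap should place a peak of 1.0 at the rounded
# center and taper off within the given radii.
#   >>> op = Gt2TTFTarget()
#   >>> hm = np.zeros((11, 11), dtype='float32')
#   >>> _ = op.draw_truncate_gaussian(hm, (5, 5), h_radius=2, w_radius=2)
#   >>> float(hm[5, 5])
#   1.0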
@register_op
class Gt2Solov2Target(BaseOperator):
"""Assign mask target and labels in SOLOv2 network.
The code of this function is based on:
https://github.com/WXinlong/SOLO/blob/master/mmdet/models/anchor_heads/solov2_head.py#L271
Args:
num_grids (list): The list of feature map grids size.
scale_ranges (list): The list of mask boundary range.
coord_sigma (float): The coefficient of coordinate area length.
sampling_ratio (float): The ratio of down sampling.
"""
def __init__(self,
num_grids=[40, 36, 24, 16, 12],
scale_ranges=[[1, 96], [48, 192], [96, 384], [192, 768],
[384, 2048]],
coord_sigma=0.2,
sampling_ratio=4.0):
super(Gt2Solov2Target, self).__init__()
self.num_grids = num_grids
self.scale_ranges = scale_ranges
self.coord_sigma = coord_sigma
self.sampling_ratio = sampling_ratio
def _scale_size(self, im, scale):
resized_img = cv2.resize(
im, None, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
return resized_img
def __call__(self, samples, context=None):
sample_id = 0
max_ins_num = [0] * len(self.num_grids)
for sample in samples:
gt_bboxes_raw = sample['gt_bbox']
gt_labels_raw = sample['gt_class'] + 1
im_c, im_h, im_w = sample['image'].shape[:]
gt_masks_raw = sample['gt_segm'].astype(np.uint8)
mask_feat_size = [
int(im_h / self.sampling_ratio), int(im_w / self.sampling_ratio)
]
gt_areas = np.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) *
(gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1]))
ins_ind_label_list = []
idx = 0
for (lower_bound, upper_bound), num_grid \
in zip(self.scale_ranges, self.num_grids):
hit_indices = ((gt_areas >= lower_bound) &
(gt_areas <= upper_bound)).nonzero()[0]
num_ins = len(hit_indices)
ins_label = []
grid_order = []
cate_label = np.zeros([num_grid, num_grid], dtype=np.int64)
ins_ind_label = np.zeros([num_grid**2], dtype=np.bool_)
if num_ins == 0:
ins_label = np.zeros(
[1, mask_feat_size[0], mask_feat_size[1]],
dtype=np.uint8)
ins_ind_label_list.append(ins_ind_label)
sample['cate_label{}'.format(idx)] = cate_label.flatten()
sample['ins_label{}'.format(idx)] = ins_label
sample['grid_order{}'.format(idx)] = np.asarray(
[sample_id * num_grid * num_grid + 0], dtype=np.int32)
idx += 1
continue
gt_bboxes = gt_bboxes_raw[hit_indices]
gt_labels = gt_labels_raw[hit_indices]
gt_masks = gt_masks_raw[hit_indices, ...]
half_ws = 0.5 * (
gt_bboxes[:, 2] - gt_bboxes[:, 0]) * self.coord_sigma
half_hs = 0.5 * (
gt_bboxes[:, 3] - gt_bboxes[:, 1]) * self.coord_sigma
for seg_mask, gt_label, half_h, half_w in zip(
gt_masks, gt_labels, half_hs, half_ws):
if seg_mask.sum() == 0:
continue
# mass center
upsampled_size = (mask_feat_size[0] * 4,
mask_feat_size[1] * 4)
center_h, center_w = ndimage.measurements.center_of_mass(
seg_mask)
coord_w = int(
(center_w / upsampled_size[1]) // (1. / num_grid))
coord_h = int(
(center_h / upsampled_size[0]) // (1. / num_grid))
# left, top, right, down
top_box = max(0,
int(((center_h - half_h) / upsampled_size[0])
// (1. / num_grid)))
down_box = min(num_grid - 1,
int(((center_h + half_h) / upsampled_size[0])
// (1. / num_grid)))
left_box = max(0,
int(((center_w - half_w) / upsampled_size[1])
// (1. / num_grid)))
right_box = min(num_grid - 1,
int(((center_w + half_w) /
upsampled_size[1]) // (1. / num_grid)))
top = max(top_box, coord_h - 1)
down = min(down_box, coord_h + 1)
left = max(coord_w - 1, left_box)
right = min(right_box, coord_w + 1)
cate_label[top:(down + 1), left:(right + 1)] = gt_label
seg_mask = self._scale_size(
seg_mask, scale=1. / self.sampling_ratio)
for i in range(top, down + 1):
for j in range(left, right + 1):
label = int(i * num_grid + j)
cur_ins_label = np.zeros(
[mask_feat_size[0], mask_feat_size[1]],
dtype=np.uint8)
cur_ins_label[:seg_mask.shape[0], :seg_mask.shape[
1]] = seg_mask
ins_label.append(cur_ins_label)
ins_ind_label[label] = True
grid_order.append(sample_id * num_grid * num_grid +
label)
if ins_label == []:
ins_label = np.zeros(
[1, mask_feat_size[0], mask_feat_size[1]],
dtype=np.uint8)
ins_ind_label_list.append(ins_ind_label)
sample['cate_label{}'.format(idx)] = cate_label.flatten()
sample['ins_label{}'.format(idx)] = ins_label
sample['grid_order{}'.format(idx)] = np.asarray(
[sample_id * num_grid * num_grid + 0], dtype=np.int32)
else:
ins_label = np.stack(ins_label, axis=0)
ins_ind_label_list.append(ins_ind_label)
sample['cate_label{}'.format(idx)] = cate_label.flatten()
sample['ins_label{}'.format(idx)] = ins_label
sample['grid_order{}'.format(idx)] = np.asarray(
grid_order, dtype=np.int32)
assert len(grid_order) > 0
max_ins_num[idx] = max(
max_ins_num[idx],
sample['ins_label{}'.format(idx)].shape[0])
idx += 1
ins_ind_labels = np.concatenate([
ins_ind_labels_level_img
for ins_ind_labels_level_img in ins_ind_label_list
])
fg_num = np.sum(ins_ind_labels)
sample['fg_num'] = fg_num
sample_id += 1
sample.pop('is_crowd')
sample.pop('gt_class')
sample.pop('gt_bbox')
sample.pop('gt_poly')
sample.pop('gt_segm')
# padding batch
for data in samples:
for idx in range(len(self.num_grids)):
gt_ins_data = np.zeros(
[
max_ins_num[idx],
data['ins_label{}'.format(idx)].shape[1],
data['ins_label{}'.format(idx)].shape[2]
],
dtype=np.uint8)
gt_ins_data[0:data['ins_label{}'.format(idx)].shape[
0], :, :] = data['ins_label{}'.format(idx)]
gt_grid_order = np.zeros([max_ins_num[idx]], dtype=np.int32)
gt_grid_order[0:data['grid_order{}'.format(idx)].shape[
0]] = data['grid_order{}'.format(idx)]
data['ins_label{}'.format(idx)] = gt_ins_data
data['grid_order{}'.format(idx)] = gt_grid_order
return samples
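# A worked example (hypothetical numbers) of the center-to-grid mapping used
# in Gt2Solov2Target above: with num_grid = 16 and an upsampled width of 800,
# a mass center at x = 400 falls in grid column
# int((400 / 800) // (1. / 16)) = 8, i.e. the normalized center coordinate is
# quantized into one of num_grid bins; the same mapping bounds the positive
# region via the half-width/half-height boxes.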
@register_op
class Gt2SparseTarget(BaseOperator):
def __init__(self, use_padding_shape=False):
super(Gt2SparseTarget, self).__init__()
self.use_padding_shape = use_padding_shape
def __call__(self, samples, context=None):
for sample in samples:
ori_h, ori_w = sample['h'], sample['w']
if self.use_padding_shape:
h, w = sample["image"].shape[1:3]
if "scale_factor" in sample:
sf_w, sf_h = sample["scale_factor"][1], sample[
"scale_factor"][0]
sample["scale_factor_whwh"] = np.array(
[sf_w, sf_h, sf_w, sf_h], dtype=np.float32)
else:
sample["scale_factor_whwh"] = np.array(
[1.0, 1.0, 1.0, 1.0], dtype=np.float32)
else:
h, w = round(sample['im_shape'][0]), round(sample['im_shape'][
1])
sample["scale_factor_whwh"] = np.array(
[w / ori_w, h / ori_h, w / ori_w, h / ori_h],
dtype=np.float32)
sample["img_whwh"] = np.array([w, h, w, h], dtype=np.float32)
sample["ori_shape"] = np.array([ori_h, ori_w], dtype=np.int32)
return samples
@register_op
class PadMaskBatch(BaseOperator):
"""
Pad a batch of samples so that they can be divisible by a stride.
The layout of each image should be 'CHW'.
Args:
pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure
height and width is divisible by `pad_to_stride`.
return_pad_mask (bool): If `return_pad_mask = True`, return
`pad_mask` for transformer.
"""
def __init__(self, pad_to_stride=0, return_pad_mask=True):
super(PadMaskBatch, self).__init__()
self.pad_to_stride = pad_to_stride
self.return_pad_mask = return_pad_mask
def __call__(self, samples, context=None):
"""
Args:
samples (list): a batch of sample, each is dict.
"""
coarsest_stride = self.pad_to_stride
max_shape = np.array([data['image'].shape for data in samples]).max(
axis=0)
if coarsest_stride > 0:
max_shape[1] = int(
np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride)
max_shape[2] = int(
np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride)
for data in samples:
im = data['image']
im_c, im_h, im_w = im.shape[:]
padding_im = np.zeros(
(im_c, max_shape[1], max_shape[2]), dtype=np.float32)
padding_im[:, :im_h, :im_w] = im.astype(np.float32)
data['image'] = padding_im
if 'semantic' in data and data['semantic'] is not None:
semantic = data['semantic']
padding_sem = np.zeros(
(1, max_shape[1], max_shape[2]), dtype=np.float32)
padding_sem[:, :im_h, :im_w] = semantic
data['semantic'] = padding_sem
if 'gt_segm' in data and data['gt_segm'] is not None:
gt_segm = data['gt_segm']
padding_segm = np.zeros(
(gt_segm.shape[0], max_shape[1], max_shape[2]),
dtype=np.uint8)
padding_segm[:, :im_h, :im_w] = gt_segm
data['gt_segm'] = padding_segm
if self.return_pad_mask:
padding_mask = np.zeros(
(max_shape[1], max_shape[2]), dtype=np.float32)
padding_mask[:im_h, :im_w] = 1.
data['pad_mask'] = padding_mask
return samples
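# A brief shape sketch for PadMaskBatch (hypothetical input): ones in
# `pad_mask` mark valid pixels and zeros mark padding.
#   >>> op = PadMaskBatch(pad_to_stride=32, return_pad_mask=True)
#   >>> out = op([{'image': np.zeros((3, 20, 30), dtype=np.float32)}])
#   >>> out[0]['image'].shape, out[0]['pad_mask'].shape
#   ((3, 32, 32), (32, 32))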
@register_op
class Gt2CenterNetTarget(BaseOperator):
    """Gt2CenterNetTarget
    Generate CenterNet targets from ground-truth data
    Args:
        down_ratio (int): The down sample ratio between output feature and
            input image.
        num_classes (int): The number of classes, 80 by default.
        max_objs (int): The maximum number of objects detected, 128 by default.
    """
    __shared__ = ['num_classes']
def __init__(self, num_classes=80, down_ratio=4, max_objs=128):
super(Gt2CenterNetTarget, self).__init__()
self.nc = num_classes
self.down_ratio = down_ratio
self.max_objs = max_objs
def __call__(self, sample, context=None):
input_h, input_w = sample['image'].shape[1:]
output_h = input_h // self.down_ratio
output_w = input_w // self.down_ratio
gt_bbox = sample['gt_bbox']
gt_class = sample['gt_class']
hm = np.zeros((self.nc, output_h, output_w), dtype=np.float32)
wh = np.zeros((self.max_objs, 2), dtype=np.float32)
reg = np.zeros((self.max_objs, 2), dtype=np.float32)
ind = np.zeros((self.max_objs), dtype=np.int64)
reg_mask = np.zeros((self.max_objs), dtype=np.int32)
cat_spec_wh = np.zeros((self.max_objs, self.nc * 2), dtype=np.float32)
cat_spec_mask = np.zeros((self.max_objs, self.nc * 2), dtype=np.int32)
trans_output = get_affine_transform(
center=sample['center'],
input_size=[sample['scale'], sample['scale']],
rot=0,
output_size=[output_w, output_h])
gt_det = []
for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)):
cls = int(cls)
bbox[:2] = affine_transform(bbox[:2], trans_output)
bbox[2:] = affine_transform(bbox[2:], trans_output)
bbox_amodal = copy.deepcopy(bbox)
bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)
bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)
h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
if h > 0 and w > 0:
radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)
radius = max(0, int(radius))
ct = np.array(
[(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
dtype=np.float32)
ct_int = ct.astype(np.int32)
# get hm,wh,reg,ind,ind_mask
draw_umich_gaussian(hm[cls], ct_int, radius)
wh[i] = 1. * w, 1. * h
reg[i] = ct - ct_int
ind[i] = ct_int[1] * output_w + ct_int[0]
reg_mask[i] = 1
cat_spec_wh[i, cls * 2:cls * 2 + 2] = wh[i]
cat_spec_mask[i, cls * 2:cls * 2 + 2] = 1
gt_det.append([
ct[0] - w / 2, ct[1] - h / 2, ct[0] + w / 2, ct[1] + h / 2,
1, cls
])
sample.pop('gt_bbox', None)
sample.pop('gt_class', None)
sample.pop('center', None)
sample.pop('scale', None)
sample.pop('is_crowd', None)
sample.pop('difficult', None)
sample['index'] = ind
sample['index_mask'] = reg_mask
sample['heatmap'] = hm
sample['size'] = wh
sample['offset'] = reg
return sample
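# A short worked example of the index encoding used above: the flattened
# position of a center on the output map is ind = y * output_w + x, and
# `offset` keeps the sub-pixel remainder ct - ct_int. E.g. with output_w = 128
# and a center at (10.7, 4.2): ct_int = (10, 4), ind = 4 * 128 + 10 = 522,
# offset ≈ (0.7, 0.2).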
@register_op
class PadGT(BaseOperator):
"""
    Pad zeros to `gt_class`, `gt_bbox`, `gt_score`... so that every sample
    carries num_max_boxes entries, where num_max_boxes is the largest
    ground-truth count in the batch (at least `minimum_gtnum`).
    Args:
        return_gt_mask (bool): If True, return `pad_gt_mask`;
            1 means a real bbox, 0 means a padded one.
        pad_img (bool): If True, also pad every image in the batch to the
            largest image shape.
        minimum_gtnum (int): Lower bound for num_max_boxes, 0 by default.
        only_origin_box (bool): If True, pad `origin_gt_class`/`origin_gt_bbox`
            instead of `gt_class`/`gt_bbox`.
"""
    def __init__(self,
                 return_gt_mask=True,
                 pad_img=False,
                 minimum_gtnum=0,
                 only_origin_box=False):
super(PadGT, self).__init__()
self.return_gt_mask = return_gt_mask
self.pad_img = pad_img
self.minimum_gtnum = minimum_gtnum
self.only_origin_box = only_origin_box
def _impad(self,
img: np.ndarray,
*,
shape=None,
padding=None,
pad_val=0,
padding_mode='constant') -> np.ndarray:
"""Pad the given image to a certain shape or pad on all sides with
specified padding mode and padding value.
Args:
img (ndarray): Image to be padded.
shape (tuple[int]): Expected padding shape (h, w). Default: None.
padding (int or tuple[int]): Padding on each border. If a single int is
provided this is used to pad all borders. If tuple of length 2 is
provided this is the padding on left/right and top/bottom
respectively. If a tuple of length 4 is provided this is the
padding for the left, top, right and bottom borders respectively.
Default: None. Note that `shape` and `padding` can not be both
set.
pad_val (Number | Sequence[Number]): Values to be filled in padding
areas when padding_mode is 'constant'. Default: 0.
padding_mode (str): Type of padding. Should be: constant, edge,
reflect or symmetric. Default: constant.
- constant: pads with a constant value, this value is specified
with pad_val.
- edge: pads with the last value at the edge of the image.
- reflect: pads with reflection of image without repeating the last
value on the edge. For example, padding [1, 2, 3, 4] with 2
elements on both sides in reflect mode will result in
[3, 2, 1, 2, 3, 4, 3, 2].
- symmetric: pads with reflection of image repeating the last value
on the edge. For example, padding [1, 2, 3, 4] with 2 elements on
both sides in symmetric mode will result in
[2, 1, 1, 2, 3, 4, 4, 3]
Returns:
ndarray: The padded image.
"""
assert (shape is not None) ^ (padding is not None)
if shape is not None:
width = max(shape[1] - img.shape[1], 0)
height = max(shape[0] - img.shape[0], 0)
padding = (0, 0, int(width), int(height))
# check pad_val
import numbers
if isinstance(pad_val, tuple):
assert len(pad_val) == img.shape[-1]
elif not isinstance(pad_val, numbers.Number):
            raise TypeError('pad_val must be an int or a tuple. '
                            f'But received {type(pad_val)}')
# check padding
if isinstance(padding, tuple) and len(padding) in [2, 4]:
if len(padding) == 2:
padding = (padding[0], padding[1], padding[0], padding[1])
elif isinstance(padding, numbers.Number):
padding = (padding, padding, padding, padding)
else:
            raise ValueError('Padding must be an int or a 2- or 4-element '
                             f'tuple. But received {padding}')
# check padding mode
assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']
border_type = {
'constant': cv2.BORDER_CONSTANT,
'edge': cv2.BORDER_REPLICATE,
'reflect': cv2.BORDER_REFLECT_101,
'symmetric': cv2.BORDER_REFLECT
}
img = cv2.copyMakeBorder(
img,
padding[1],
padding[3],
padding[0],
padding[2],
border_type[padding_mode],
value=pad_val)
return img
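    # A hedged usage sketch for _impad with shape-based padding (hypothetical
    # input): in this mode padding is applied only on the right and bottom.
    #   >>> op = PadGT()
    #   >>> img = np.zeros((20, 30, 3), dtype=np.uint8)
    #   >>> op._impad(img, shape=(32, 32)).shape  # 12 rows, 2 cols added
    #   (32, 32, 3)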
def checkmaxshape(self, samples):
maxh, maxw = 0, 0
for sample in samples:
h, w = sample['im_shape']
if h > maxh:
maxh = h
if w > maxw:
maxw = w
return (maxh, maxw)
def __call__(self, samples, context=None):
num_max_boxes = max([len(s['gt_bbox']) for s in samples])
num_max_boxes = max(self.minimum_gtnum, num_max_boxes)
if self.pad_img:
maxshape = self.checkmaxshape(samples)
if self.only_origin_box:
for sample in samples:
if self.pad_img:
img = sample['image']
padimg = self._impad(img, shape=maxshape)
sample['image'] = padimg
if self.return_gt_mask:
sample['pad_origin_gt_mask'] = np.zeros(
(num_max_boxes, 1), dtype=np.float32)
if num_max_boxes == 0:
continue
num_gt = len(sample['origin_gt_bbox'])
pad_origin_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32)
pad_origin_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32)
if num_gt > 0:
pad_origin_gt_class[:num_gt] = sample['origin_gt_class']
pad_origin_gt_bbox[:num_gt] = sample['origin_gt_bbox']
sample['origin_gt_class'] = pad_origin_gt_class
sample['origin_gt_bbox'] = pad_origin_gt_bbox
if 'pad_origin_gt_mask' in sample:
sample['pad_origin_gt_mask'][:num_gt] = 1
else:
for sample in samples:
if self.pad_img:
img = sample['image']
padimg = self._impad(img, shape=maxshape)
sample['image'] = padimg
if self.return_gt_mask:
sample['pad_gt_mask'] = np.zeros(
(num_max_boxes, 1), dtype=np.float32)
if num_max_boxes == 0:
continue
num_gt = len(sample['gt_bbox'])
pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32)
pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32)
if num_gt > 0:
pad_gt_class[:num_gt] = sample['gt_class']
pad_gt_bbox[:num_gt] = sample['gt_bbox']
sample['gt_class'] = pad_gt_class
sample['gt_bbox'] = pad_gt_bbox
# pad_gt_mask
if 'pad_gt_mask' in sample:
sample['pad_gt_mask'][:num_gt] = 1
# gt_score
if 'gt_score' in sample:
pad_gt_score = np.zeros((num_max_boxes, 1), dtype=np.float32)
if num_gt > 0:
pad_gt_score[:num_gt] = sample['gt_score']
sample['gt_score'] = pad_gt_score
if 'is_crowd' in sample:
pad_is_crowd = np.zeros((num_max_boxes, 1), dtype=np.int32)
if num_gt > 0:
pad_is_crowd[:num_gt] = sample['is_crowd']
sample['is_crowd'] = pad_is_crowd
if 'difficult' in sample:
pad_diff = np.zeros((num_max_boxes, 1), dtype=np.int32)
if num_gt > 0:
pad_diff[:num_gt] = sample['difficult']
sample['difficult'] = pad_diff
if 'gt_joints' in sample:
num_joints = sample['gt_joints'].shape[1]
pad_gt_joints = np.zeros(
(num_max_boxes, num_joints, 3), dtype=np.float32)
if num_gt > 0:
pad_gt_joints[:num_gt] = sample['gt_joints']
sample['gt_joints'] = pad_gt_joints
if 'gt_areas' in sample:
pad_gt_areas = np.zeros((num_max_boxes, 1), dtype=np.float32)
if num_gt > 0:
pad_gt_areas[:num_gt, 0] = sample['gt_areas']
sample['gt_areas'] = pad_gt_areas
# gt_segm
if 'gt_segm' in sample:
pad_gt_segm = np.zeros(
(num_max_boxes, *sample['gt_segm'].shape[-2:]),
dtype=np.uint8)
if num_gt > 0:
pad_gt_segm[:num_gt] = sample['gt_segm']
sample['gt_segm'] = pad_gt_segm.astype(np.float32)
return samples
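# A brief sketch of the padding contract above (hypothetical batch): if one
# sample has 3 boxes and another has 1, both leave PadGT with gt_bbox of shape
# (3, 4) and gt_class of shape (3, 1); pad_gt_mask is 1.0 for real boxes and
# 0.0 for padded rows, so downstream losses can mask the padding out.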
@register_op
class PadRGT(BaseOperator):
"""
    Pad zeros to `gt_class`, `gt_bbox`, `gt_score`... so that every sample
    carries num_max_boxes entries, where num_max_boxes is the largest
    ground-truth count in the batch.
    Args:
        return_gt_mask (bool): If True, return `pad_gt_mask`;
            1 means a real bbox, 0 means a padded one.
"""
def __init__(self, return_gt_mask=True):
super(PadRGT, self).__init__()
self.return_gt_mask = return_gt_mask
def pad_field(self, sample, field, num_gt):
name, shape, dtype = field
if name in sample:
pad_v = np.zeros(shape, dtype=dtype)
if num_gt > 0:
pad_v[:num_gt] = sample[name]
sample[name] = pad_v
def __call__(self, samples, context=None):
num_max_boxes = max([len(s['gt_bbox']) for s in samples])
for sample in samples:
if self.return_gt_mask:
sample['pad_gt_mask'] = np.zeros(
(num_max_boxes, 1), dtype=np.float32)
if num_max_boxes == 0:
continue
num_gt = len(sample['gt_bbox'])
pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32)
pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32)
if num_gt > 0:
pad_gt_class[:num_gt] = sample['gt_class']
pad_gt_bbox[:num_gt] = sample['gt_bbox']
sample['gt_class'] = pad_gt_class
sample['gt_bbox'] = pad_gt_bbox
# pad_gt_mask
if 'pad_gt_mask' in sample:
sample['pad_gt_mask'][:num_gt] = 1
# gt_score
names = ['gt_score', 'is_crowd', 'difficult', 'gt_poly', 'gt_rbox']
dims = [1, 1, 1, 8, 5]
dtypes = [np.float32, np.int32, np.int32, np.float32, np.float32]
for name, dim, dtype in zip(names, dims, dtypes):
self.pad_field(sample, [name, (num_max_boxes, dim), dtype],
num_gt)
return samples
@register_op
class Gt2CenterTrackTarget(BaseOperator):
    """Gt2CenterTrackTarget
    Generate CenterTrack targets from ground-truth data
    Args:
        num_classes (int): The number of classes, 1 by default.
        down_ratio (int): The down sample ratio between output feature and
            input image.
        max_objs (int): The maximum number of objects detected, 256 by default.
    """
    __shared__ = ['num_classes']
def __init__(self,
num_classes=1,
down_ratio=4,
max_objs=256,
hm_disturb=0.05,
lost_disturb=0.4,
fp_disturb=0.1,
pre_hm=True,
add_tracking=True,
add_ltrb_amodal=True):
super(Gt2CenterTrackTarget, self).__init__()
self.nc = num_classes
self.down_ratio = down_ratio
self.max_objs = max_objs
self.hm_disturb = hm_disturb
self.lost_disturb = lost_disturb
self.fp_disturb = fp_disturb
self.pre_hm = pre_hm
self.add_tracking = add_tracking
self.add_ltrb_amodal = add_ltrb_amodal
def _get_pre_dets(self, input_h, input_w, trans_input_pre, gt_bbox_pre,
gt_class_pre, gt_track_id_pre):
hm_h, hm_w = input_h, input_w
        return_hm = self.pre_hm
        pre_hm = np.zeros(
            (1, hm_h, hm_w), dtype=np.float32) if return_hm else None
pre_cts, track_ids = [], []
for i, (
bbox, cls, track_id
) in enumerate(zip(gt_bbox_pre, gt_class_pre, gt_track_id_pre)):
cls = int(cls)
bbox[:2] = affine_transform(bbox[:2], trans_input_pre)
bbox[2:] = affine_transform(bbox[2:], trans_input_pre)
bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, hm_w - 1)
bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, hm_h - 1)
h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
max_rad = 1
if (h > 0 and w > 0):
radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)
radius = max(0, int(radius))
max_rad = max(max_rad, radius)
ct = np.array(
[(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
dtype=np.float32)
ct0 = ct.copy()
conf = 1
ct[0] = ct[0] + np.random.randn() * self.hm_disturb * w
ct[1] = ct[1] + np.random.randn() * self.hm_disturb * h
conf = 1 if np.random.rand() > self.lost_disturb else 0
ct_int = ct.astype(np.int32)
if conf == 0:
pre_cts.append(ct / self.down_ratio)
else:
pre_cts.append(ct0 / self.down_ratio)
track_ids.append(track_id)
                if return_hm:
                    draw_umich_gaussian(pre_hm[0], ct_int, radius, k=conf)
                if np.random.rand() < self.fp_disturb and return_hm:
ct2 = ct0.copy()
# Hard code heatmap disturb ratio, haven't tried other numbers.
ct2[0] = ct2[0] + np.random.randn() * 0.05 * w
ct2[1] = ct2[1] + np.random.randn() * 0.05 * h
ct2_int = ct2.astype(np.int32)
draw_umich_gaussian(pre_hm[0], ct2_int, radius, k=conf)
return pre_hm, pre_cts, track_ids
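    # Summary of the disturbance scheme above (CenterTrack-style augmentation):
    # each previous-frame center is jittered by hm_disturb, treated as lost
    # with probability lost_disturb (conf == 0, so its peak is drawn with zero
    # amplitude in the prior heatmap), and an extra false-positive peak is
    # stamped near the true center with probability fp_disturb.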
def __call__(self, sample, context=None):
input_h, input_w = sample['image'].shape[1:]
output_h = input_h // self.down_ratio
output_w = input_w // self.down_ratio
gt_bbox = sample['gt_bbox']
gt_class = sample['gt_class']
# init
hm = np.zeros((self.nc, output_h, output_w), dtype=np.float32)
wh = np.zeros((self.max_objs, 2), dtype=np.float32)
reg = np.zeros((self.max_objs, 2), dtype=np.float32)
ind = np.zeros((self.max_objs), dtype=np.int64)
reg_mask = np.zeros((self.max_objs), dtype=np.int32)
if self.add_tracking:
tr = np.zeros((self.max_objs, 2), dtype=np.float32)
if self.add_ltrb_amodal:
ltrb_amodal = np.zeros((self.max_objs, 4), dtype=np.float32)
trans_output = get_affine_transform(
center=sample['center'],
input_size=[sample['scale'], sample['scale']],
rot=0,
output_size=[output_w, output_h])
pre_hm, pre_cts, track_ids = self._get_pre_dets(
input_h, input_w, sample['trans_input'], sample['pre_gt_bbox'],
sample['pre_gt_class'], sample['pre_gt_track_id'])
for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)):
cls = int(cls)
rect = np.array(
[[bbox[0], bbox[1]], [bbox[0], bbox[3]], [bbox[2], bbox[3]],
[bbox[2], bbox[1]]],
dtype=np.float32)
for t in range(4):
rect[t] = affine_transform(rect[t], trans_output)
bbox[:2] = rect[:, 0].min(), rect[:, 1].min()
bbox[2:] = rect[:, 0].max(), rect[:, 1].max()
bbox_amodal = copy.deepcopy(bbox)
bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)
bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)
h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
if h > 0 and w > 0:
radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)
radius = max(0, int(radius))
ct = np.array(
[(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
dtype=np.float32)
ct_int = ct.astype(np.int32)
# get hm,wh,reg,ind,ind_mask
draw_umich_gaussian(hm[cls], ct_int, radius)
wh[i] = 1. * w, 1. * h
reg[i] = ct - ct_int
ind[i] = ct_int[1] * output_w + ct_int[0]
reg_mask[i] = 1
if self.add_tracking:
if sample['gt_track_id'][i] in track_ids:
pre_ct = pre_cts[track_ids.index(sample['gt_track_id'][
i])]
tr[i] = pre_ct - ct_int
if self.add_ltrb_amodal:
ltrb_amodal[i] = \
bbox_amodal[0] - ct_int[0], bbox_amodal[1] - ct_int[1], \
bbox_amodal[2] - ct_int[0], bbox_amodal[3] - ct_int[1]
new_sample = {'image': sample['image']}
new_sample['index'] = ind
new_sample['index_mask'] = reg_mask
new_sample['heatmap'] = hm
new_sample['size'] = wh
new_sample['offset'] = reg
if self.add_tracking:
new_sample['tracking'] = tr
if self.add_ltrb_amodal:
new_sample['ltrb_amodal'] = ltrb_amodal
new_sample['pre_image'] = sample['pre_image']
new_sample['pre_hm'] = pre_hm
del sample
return new_sample
@register_op
class BatchRandomResizeForSSOD(BaseOperator):
"""
    Resize images in a batch to a target size, optionally choosing the target
    size and interpolation method at random.
    Args:
        target_size (int, list, tuple): image target size; if random_size is
            True, it must be a list or tuple
        keep_ratio (bool): whether to keep the aspect ratio, True by default
        interp (int): the interpolation method
        random_size (bool): whether to randomly select the target size
        random_interp (bool): whether to randomly select the interpolation method
"""
def __init__(self,
target_size,
keep_ratio,
interp=cv2.INTER_NEAREST,
random_size=True,
random_interp=False):
super(BatchRandomResizeForSSOD, self).__init__()
self.keep_ratio = keep_ratio
self.interps = [
cv2.INTER_NEAREST,
cv2.INTER_LINEAR,
cv2.INTER_AREA,
cv2.INTER_CUBIC,
cv2.INTER_LANCZOS4,
]
self.interp = interp
assert isinstance(target_size, (
int, Sequence)), "target_size must be int, list or tuple"
if random_size and not isinstance(target_size, list):
raise TypeError(
"Type of target_size is invalid when random_size is True. Must be List, now is {}".
format(type(target_size)))
self.target_size = target_size
self.random_size = random_size
self.random_interp = random_interp
def __call__(self, samples, context=None):
        if self.random_size:
            index = np.random.choice(len(self.target_size))
            target_size = self.target_size[index]
        else:
            # define index so the returned pair stays well-formed when
            # random_size is False
            index = None
            target_size = self.target_size
if context is not None:
target_size = self.target_size[context]
if self.random_interp:
interp = np.random.choice(self.interps)
else:
interp = self.interp
resizer = Resize(target_size, keep_ratio=self.keep_ratio, interp=interp)
return [resizer(samples, context=context), index]
================================================
FILE: ppdet/data/transform/culane_operators.py
================================================
import numpy as np
import imgaug.augmenters as iaa
from .operators import BaseOperator, register_op
from ppdet.utils.logger import setup_logger
from ppdet.data.culane_utils import linestrings_to_lanes, transform_annotation
logger = setup_logger(__name__)
__all__ = [
"CULaneTrainProcess", "CULaneDataProcess", "HorizontalFlip",
"ChannelShuffle", "CULaneAffine", "CULaneResize", "OneOfBlur",
"MultiplyAndAddToBrightness", "AddToHueAndSaturation"
]
def trainTransforms(img_h, img_w):
transforms = [{
'name': 'Resize',
'parameters': dict(size=dict(
height=img_h, width=img_w)),
'p': 1.0
}, {
'name': 'HorizontalFlip',
'parameters': dict(p=1.0),
'p': 0.5
}, {
'name': 'ChannelShuffle',
'parameters': dict(p=1.0),
'p': 0.1
}, {
'name': 'MultiplyAndAddToBrightness',
'parameters': dict(
mul=(0.85, 1.15), add=(-10, 10)),
'p': 0.6
}, {
'name': 'AddToHueAndSaturation',
'parameters': dict(value=(-10, 10)),
'p': 0.7
}, {
'name': 'OneOf',
'transforms': [
dict(
name='MotionBlur', parameters=dict(k=(3, 5))), dict(
name='MedianBlur', parameters=dict(k=(3, 5)))
],
'p': 0.2
}, {
'name': 'Affine',
'parameters': dict(
translate_percent=dict(
x=(-0.1, 0.1), y=(-0.1, 0.1)),
rotate=(-10, 10),
scale=(0.8, 1.2)),
'p': 0.7
}, {
'name': 'Resize',
'parameters': dict(size=dict(
height=img_h, width=img_w)),
'p': 1.0
}]
return transforms
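# A minimal sketch of how one entry of the list above is turned into an
# imgaug augmenter (this mirrors what CULaneTrainProcess below does):
#   >>> import imgaug.augmenters as iaa
#   >>> aug = {'name': 'HorizontalFlip', 'parameters': dict(p=1.0), 'p': 0.5}
#   >>> t = iaa.Sometimes(p=aug['p'],
#   ...                   then_list=getattr(iaa, aug['name'])(**aug['parameters']))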
@register_op
class CULaneTrainProcess(BaseOperator):
def __init__(self, img_w, img_h):
super(CULaneTrainProcess, self).__init__()
self.img_w = img_w
self.img_h = img_h
self.transforms = trainTransforms(self.img_h, self.img_w)
if self.transforms is not None:
img_transforms = []
for aug in self.transforms:
p = aug['p']
if aug['name'] != 'OneOf':
img_transforms.append(
iaa.Sometimes(
p=p,
then_list=getattr(iaa, aug['name'])(**aug[
'parameters'])))
else:
img_transforms.append(
iaa.Sometimes(
p=p,
then_list=iaa.OneOf([
getattr(iaa, aug_['name'])(**aug_['parameters'])
for aug_ in aug['transforms']
])))
else:
img_transforms = []
self.iaa_transform = iaa.Sequential(img_transforms)
def apply(self, sample, context=None):
img, line_strings, seg = self.iaa_transform(
image=sample['image'],
line_strings=sample['lanes'],
segmentation_maps=sample['mask'])
sample['image'] = img
sample['lanes'] = line_strings
sample['mask'] = seg
return sample
@register_op
class CULaneDataProcess(BaseOperator):
def __init__(self, img_w, img_h, num_points, max_lanes):
super(CULaneDataProcess, self).__init__()
self.img_w = img_w
self.img_h = img_h
self.num_points = num_points
self.n_offsets = num_points
self.n_strips = num_points - 1
self.strip_size = self.img_h / self.n_strips
self.max_lanes = max_lanes
self.offsets_ys = np.arange(self.img_h, -1, -self.strip_size)
def apply(self, sample, context=None):
data = {}
line_strings = sample['lanes']
line_strings.clip_out_of_image_()
new_anno = {'lanes': linestrings_to_lanes(line_strings)}
for i in range(30):
try:
annos = transform_annotation(
self.img_w, self.img_h, self.max_lanes, self.n_offsets,
self.offsets_ys, self.n_strips, self.strip_size, new_anno)
label = annos['label']
lane_endpoints = annos['lane_endpoints']
break
            except Exception:
if (i + 1) == 30:
logger.critical('Transform annotation failed 30 times :(')
exit()
sample['image'] = sample['image'].astype(np.float32) / 255.
data['image'] = sample['image'].transpose(2, 0, 1)
data['lane_line'] = label
data['seg'] = sample['seg']
data['full_img_path'] = sample['full_img_path']
data['img_name'] = sample['img_name']
data['im_id'] = sample['im_id']
if 'mask' in sample.keys():
data['seg'] = sample['mask'].get_arr()
data['im_shape'] = np.array([self.img_w, self.img_h], dtype=np.float32)
data['scale_factor'] = np.array([1., 1.], dtype=np.float32)
return data
@register_op
class CULaneResize(BaseOperator):
def __init__(self, img_h, img_w, prob=0.5):
super(CULaneResize, self).__init__()
self.img_h = img_h
self.img_w = img_w
self.prob = prob
def apply(self, sample, context=None):
transform = iaa.Sometimes(self.prob,
iaa.Resize({
"height": self.img_h,
"width": self.img_w
}))
if 'mask' in sample.keys():
img, line_strings, seg = transform(
image=sample['image'],
line_strings=sample['lanes'],
segmentation_maps=sample['mask'])
sample['image'] = img
sample['lanes'] = line_strings
sample['mask'] = seg
else:
img, line_strings = transform(
image=sample['image'].copy().astype(np.uint8),
line_strings=sample['lanes'])
sample['image'] = img
sample['lanes'] = line_strings
return sample
@register_op
class HorizontalFlip(BaseOperator):
def __init__(self, prob=0.5):
super(HorizontalFlip, self).__init__()
self.prob = prob
def apply(self, sample, context=None):
transform = iaa.Sometimes(self.prob, iaa.HorizontalFlip(1.0))
if 'mask' in sample.keys():
img, line_strings, seg = transform(
image=sample['image'],
line_strings=sample['lanes'],
segmentation_maps=sample['mask'])
sample['image'] = img
sample['lanes'] = line_strings
sample['mask'] = seg
else:
img, line_strings = transform(
image=sample['image'], line_strings=sample['lanes'])
sample['image'] = img
sample['lanes'] = line_strings
return sample
@register_op
class ChannelShuffle(BaseOperator):
def __init__(self, prob=0.1):
super(ChannelShuffle, self).__init__()
self.prob = prob
def apply(self, sample, context=None):
transform = iaa.Sometimes(self.prob, iaa.ChannelShuffle(1.0))
if 'mask' in sample.keys():
img, line_strings, seg = transform(
image=sample['image'],
line_strings=sample['lanes'],
segmentation_maps=sample['mask'])
sample['image'] = img
sample['lanes'] = line_strings
sample['mask'] = seg
else:
img, line_strings = transform(
image=sample['image'], line_strings=sample['lanes'])
sample['image'] = img
sample['lanes'] = line_strings
return sample
@register_op
class MultiplyAndAddToBrightness(BaseOperator):
def __init__(self, mul=(0.85, 1.15), add=(-10, 10), prob=0.5):
super(MultiplyAndAddToBrightness, self).__init__()
self.mul = tuple(mul)
self.add = tuple(add)
self.prob = prob
def apply(self, sample, context=None):
transform = iaa.Sometimes(
self.prob,
iaa.MultiplyAndAddToBrightness(
mul=self.mul, add=self.add))
if 'mask' in sample.keys():
img, line_strings, seg = transform(
image=sample['image'],
line_strings=sample['lanes'],
segmentation_maps=sample['mask'])
sample['image'] = img
sample['lanes'] = line_strings
sample['mask'] = seg
else:
img, line_strings = transform(
image=sample['image'], line_strings=sample['lanes'])
sample['image'] = img
sample['lanes'] = line_strings
return sample
@register_op
class AddToHueAndSaturation(BaseOperator):
def __init__(self, value=(-10, 10), prob=0.5):
super(AddToHueAndSaturation, self).__init__()
self.value = tuple(value)
self.prob = prob
def apply(self, sample, context=None):
transform = iaa.Sometimes(
self.prob, iaa.AddToHueAndSaturation(value=self.value))
if 'mask' in sample.keys():
img, line_strings, seg = transform(
image=sample['image'],
line_strings=sample['lanes'],
segmentation_maps=sample['mask'])
sample['image'] = img
sample['lanes'] = line_strings
sample['mask'] = seg
else:
img, line_strings = transform(
image=sample['image'], line_strings=sample['lanes'])
sample['image'] = img
sample['lanes'] = line_strings
return sample
@register_op
class OneOfBlur(BaseOperator):
def __init__(self, MotionBlur_k=(3, 5), MedianBlur_k=(3, 5), prob=0.5):
super(OneOfBlur, self).__init__()
self.MotionBlur_k = tuple(MotionBlur_k)
self.MedianBlur_k = tuple(MedianBlur_k)
self.prob = prob
def apply(self, sample, context=None):
transform = iaa.Sometimes(
self.prob,
iaa.OneOf([
iaa.MotionBlur(k=self.MotionBlur_k),
iaa.MedianBlur(k=self.MedianBlur_k)
]))
if 'mask' in sample.keys():
img, line_strings, seg = transform(
image=sample['image'],
line_strings=sample['lanes'],
segmentation_maps=sample['mask'])
sample['image'] = img
sample['lanes'] = line_strings
sample['mask'] = seg
else:
img, line_strings = transform(
image=sample['image'], line_strings=sample['lanes'])
sample['image'] = img
sample['lanes'] = line_strings
return sample
@register_op
class CULaneAffine(BaseOperator):
def __init__(self,
translate_percent_x=(-0.1, 0.1),
translate_percent_y=(-0.1, 0.1),
rotate=(3, 5),
scale=(0.8, 1.2),
prob=0.5):
super(CULaneAffine, self).__init__()
self.translate_percent = {
'x': tuple(translate_percent_x),
'y': tuple(translate_percent_y)
}
self.rotate = tuple(rotate)
self.scale = tuple(scale)
self.prob = prob
def apply(self, sample, context=None):
transform = iaa.Sometimes(
self.prob,
iaa.Affine(
translate_percent=self.translate_percent,
rotate=self.rotate,
scale=self.scale))
if 'mask' in sample.keys():
img, line_strings, seg = transform(
image=sample['image'],
line_strings=sample['lanes'],
segmentation_maps=sample['mask'])
sample['image'] = img
sample['lanes'] = line_strings
sample['mask'] = seg
else:
img, line_strings = transform(
image=sample['image'], line_strings=sample['lanes'])
sample['image'] = img
sample['lanes'] = line_strings
return sample
================================================
FILE: ppdet/data/transform/gridmask_utils.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The code is based on:
# https://github.com/dvlab-research/GridMask/blob/master/detection_grid/maskrcnn_benchmark/data/transforms/grid.py
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import numpy as np
from PIL import Image
class Gridmask(object):
def __init__(self,
use_h=True,
use_w=True,
rotate=1,
offset=False,
ratio=0.5,
mode=1,
prob=0.7,
upper_iter=360000):
super(Gridmask, self).__init__()
self.use_h = use_h
self.use_w = use_w
self.rotate = rotate
self.offset = offset
self.ratio = ratio
self.mode = mode
self.prob = prob
self.st_prob = prob
self.upper_iter = upper_iter
def __call__(self, x, curr_iter):
self.prob = self.st_prob * min(1, 1.0 * curr_iter / self.upper_iter)
if np.random.rand() > self.prob:
return x
h, w, _ = x.shape
hh = int(1.5 * h)
ww = int(1.5 * w)
d = np.random.randint(2, h)
self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1)
mask = np.ones((hh, ww), np.float32)
st_h = np.random.randint(d)
st_w = np.random.randint(d)
if self.use_h:
for i in range(hh // d):
s = d * i + st_h
t = min(s + self.l, hh)
mask[s:t, :] *= 0
if self.use_w:
for i in range(ww // d):
s = d * i + st_w
t = min(s + self.l, ww)
mask[:, s:t] *= 0
r = np.random.randint(self.rotate)
mask = Image.fromarray(np.uint8(mask))
mask = mask.rotate(r)
mask = np.asarray(mask)
mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) // 2
+ w].astype(np.float32)
if self.mode == 1:
mask = 1 - mask
mask = np.expand_dims(mask, axis=-1)
if self.offset:
offset = (2 * (np.random.rand(h, w) - 0.5)).astype(np.float32)
x = (x * mask + offset * (1 - mask)).astype(x.dtype)
else:
x = (x * mask).astype(x.dtype)
return x
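# A hedged usage sketch for Gridmask: the effective probability ramps linearly
# from 0 to `prob` over `upper_iter` iterations, so masking is rare early in
# training.
#   >>> gm = Gridmask(prob=0.7, upper_iter=360000)
#   >>> img = np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8)
#   >>> out = gm(img, curr_iter=180000)  # effective prob = 0.35 at this point
#   >>> out.shape
#   (32, 32, 3)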
================================================
FILE: ppdet/data/transform/keypoint_operators.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# function:
# operators to process sample,
# eg: decode/resize/crop image
from __future__ import absolute_import
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
import cv2
import numpy as np
import math
import copy
from ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints, get_affine_transform, affine_transform, get_warp_matrix
from ppdet.core.workspace import serializable
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
registered_ops = []
__all__ = [
'RandomAffine', 'KeyPointFlip', 'TagGenerate', 'ToHeatmaps',
'NormalizePermute', 'EvalAffine', 'RandomFlipHalfBodyTransform',
'TopDownRandomFlip', 'TopDownRandomShiftBboxCenter', 'TopDownGetRandomScaleRotation',
'TopDownAffine', 'ToHeatmapsTopDown', 'ToHeatmapsTopDown_DARK',
'ToHeatmapsTopDown_UDP', 'TopDownEvalAffine',
'AugmentationbyInformantionDropping', 'SinglePoseAffine', 'NoiseJitter',
'FlipPose', 'PETR_Resize'
]
def register_keypointop(cls):
return serializable(cls)
@register_keypointop
class KeyPointFlip(object):
"""Get the fliped image by flip_prob. flip the coords also
the left coords and right coords should exchange while flip, for the right keypoint will be left keypoint after image fliped
Args:
flip_permutation (list[17]): the left-right exchange order list corresponding to [0,1,2,...,16]
hmsize (list[2]): output heatmap's shape list of different scale outputs of higherhrnet
flip_prob (float): the ratio whether to flip the image
records(dict): the dict contained the image, mask and coords
Returns:
records(dict): contain the image, mask and coords after tranformed
"""
def __init__(self, flip_permutation, hmsize=None, flip_prob=0.5):
super(KeyPointFlip, self).__init__()
assert isinstance(flip_permutation, Sequence)
self.flip_permutation = flip_permutation
self.flip_prob = flip_prob
self.hmsize = hmsize
def _flipjoints(self, records, sizelst):
'''
records['gt_joints'] is Sequence in higherhrnet
'''
if not ('gt_joints' in records and len(records['gt_joints']) > 0):
return records
kpts_lst = records['gt_joints']
if isinstance(kpts_lst, Sequence):
for idx, hmsize in enumerate(sizelst):
if kpts_lst[idx].ndim == 3:
kpts_lst[idx] = kpts_lst[idx][:, self.flip_permutation]
else:
kpts_lst[idx] = kpts_lst[idx][self.flip_permutation]
kpts_lst[idx][..., 0] = hmsize - kpts_lst[idx][..., 0]
else:
hmsize = sizelst[0]
if kpts_lst.ndim == 3:
kpts_lst = kpts_lst[:, self.flip_permutation]
else:
kpts_lst = kpts_lst[self.flip_permutation]
kpts_lst[..., 0] = hmsize - kpts_lst[..., 0]
records['gt_joints'] = kpts_lst
return records
def _flipmask(self, records, sizelst):
        if 'mask' not in records:
return records
mask_lst = records['mask']
for idx, hmsize in enumerate(sizelst):
if len(mask_lst) > idx:
mask_lst[idx] = mask_lst[idx][:, ::-1]
records['mask'] = mask_lst
return records
def _flipbbox(self, records, sizelst):
if not 'gt_bbox' in records:
return records
bboxes = records['gt_bbox']
hmsize = sizelst[0]
bboxes[:, 0::2] = hmsize - bboxes[:, 0::2][:, ::-1]
bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, hmsize)
records['gt_bbox'] = bboxes
return records
def __call__(self, records):
flip = np.random.random() < self.flip_prob
if flip:
image = records['image']
image = image[:, ::-1]
records['image'] = image
if self.hmsize is None:
sizelst = [image.shape[1]]
else:
sizelst = self.hmsize
self._flipjoints(records, sizelst)
self._flipmask(records, sizelst)
self._flipbbox(records, sizelst)
return records
@register_keypointop
class RandomAffine(object):
"""apply affine transform to image, mask and coords
to achieve the rotate, scale and shift effect for training image
Args:
max_degree (float): the max abslute rotate degree to apply, transform range is [-max_degree, max_degree]
max_scale (list[2]): the scale range to apply, transform range is [min, max]
max_shift (float): the max abslute shift ratio to apply, transform range is [-max_shift*imagesize, max_shift*imagesize]
hmsize (list[2]): output heatmap's shape list of different scale outputs of higherhrnet
trainsize (list[2]): the standard length used to train, the 'scale_type' of [h,w] will be resize to trainsize for standard
scale_type (str): the length of [h,w] to used for trainsize, chosed between 'short' and 'long'
records(dict): the dict contained the image, mask and coords
Returns:
records(dict): contain the image, mask and coords after tranformed
"""
def __init__(self,
max_degree=30,
scale=[0.75, 1.5],
max_shift=0.2,
hmsize=None,
trainsize=[512, 512],
scale_type='short',
boldervalue=[114, 114, 114]):
super(RandomAffine, self).__init__()
self.max_degree = max_degree
self.min_scale = scale[0]
self.max_scale = scale[1]
self.max_shift = max_shift
self.hmsize = hmsize
self.trainsize = trainsize
self.scale_type = scale_type
self.boldervalue = boldervalue
def _get_affine_matrix_old(self, center, scale, res, rot=0):
"""Generate transformation matrix."""
h = scale
t = np.zeros((3, 3), dtype=np.float32)
t[0, 0] = float(res[1]) / h
t[1, 1] = float(res[0]) / h
t[0, 2] = res[1] * (-float(center[0]) / h + .5)
t[1, 2] = res[0] * (-float(center[1]) / h + .5)
t[2, 2] = 1
if rot != 0:
rot = -rot # To match direction of rotation from cropping
rot_mat = np.zeros((3, 3), dtype=np.float32)
rot_rad = rot * np.pi / 180
sn, cs = np.sin(rot_rad), np.cos(rot_rad)
rot_mat[0, :2] = [cs, -sn]
rot_mat[1, :2] = [sn, cs]
rot_mat[2, 2] = 1
# Need to rotate around center
t_mat = np.eye(3)
t_mat[0, 2] = -res[1] / 2
t_mat[1, 2] = -res[0] / 2
t_inv = t_mat.copy()
t_inv[:2, 2] *= -1
t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t)))
return t
def _get_affine_matrix(self, center, scale, res, rot=0):
"""Generate transformation matrix."""
w, h = scale
t = np.zeros((3, 3), dtype=np.float32)
t[0, 0] = float(res[0]) / w
t[1, 1] = float(res[1]) / h
t[0, 2] = res[0] * (-float(center[0]) / w + .5)
t[1, 2] = res[1] * (-float(center[1]) / h + .5)
t[2, 2] = 1
if rot != 0:
rot = -rot # To match direction of rotation from cropping
rot_mat = np.zeros((3, 3), dtype=np.float32)
rot_rad = rot * np.pi / 180
sn, cs = np.sin(rot_rad), np.cos(rot_rad)
rot_mat[0, :2] = [cs, -sn]
rot_mat[1, :2] = [sn, cs]
rot_mat[2, 2] = 1
# Need to rotate around center
t_mat = np.eye(3)
t_mat[0, 2] = -res[0] / 2
t_mat[1, 2] = -res[1] / 2
t_inv = t_mat.copy()
t_inv[:2, 2] *= -1
t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t)))
return t
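    # A quick property check (hypothetical numbers) for the matrix built above:
    # it maps `center` to the middle of `res` and scales by res / scale, so
    # with center=(100, 50), scale=(200, 100), res=(64, 64):
    #   >>> t = RandomAffine()._get_affine_matrix((100, 50), (200, 100), (64, 64))
    #   >>> np.dot(t, [100, 50, 1])[:2]  # the center lands at res / 2
    #   array([32., 32.])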
def _affine_joints_mask(self,
degree,
center,
roi_size,
dsize,
keypoints=None,
heatmap_mask=None,
gt_bbox=None):
kpts = None
mask = None
bbox = None
mask_affine_mat = self._get_affine_matrix(center, roi_size, dsize,
degree)[:2]
if heatmap_mask is not None:
mask = cv2.warpAffine(heatmap_mask, mask_affine_mat, dsize)
mask = ((mask / 255) > 0.5).astype(np.float32)
if keypoints is not None:
kpts = copy.deepcopy(keypoints)
kpts[..., 0:2] = warp_affine_joints(kpts[..., 0:2].copy(),
mask_affine_mat)
kpts[(kpts[..., 0]) > dsize[0], :] = 0
kpts[(kpts[..., 1]) > dsize[1], :] = 0
kpts[(kpts[..., 0]) < 0, :] = 0
kpts[(kpts[..., 1]) < 0, :] = 0
if gt_bbox is not None:
temp_bbox = gt_bbox[:, [0, 3, 2, 1]]
cat_bbox = np.concatenate((gt_bbox, temp_bbox), axis=-1)
gt_bbox_warped = warp_affine_joints(cat_bbox, mask_affine_mat)
bbox = np.zeros_like(gt_bbox)
bbox[:, 0] = gt_bbox_warped[:, 0::2].min(1).clip(0, dsize[0])
bbox[:, 2] = gt_bbox_warped[:, 0::2].max(1).clip(0, dsize[0])
bbox[:, 1] = gt_bbox_warped[:, 1::2].min(1).clip(0, dsize[1])
bbox[:, 3] = gt_bbox_warped[:, 1::2].max(1).clip(0, dsize[1])
return kpts, mask, bbox
def __call__(self, records):
image = records['image']
shape = np.array(image.shape[:2][::-1])
keypoints = None
heatmap_mask = None
gt_bbox = None
if 'gt_joints' in records:
keypoints = records['gt_joints']
if 'mask' in records:
heatmap_mask = records['mask']
heatmap_mask *= 255
if 'gt_bbox' in records:
gt_bbox = records['gt_bbox']
degree = (np.random.random() * 2 - 1) * self.max_degree
        center = np.array(shape) / 2
aug_scale = np.random.random() * (self.max_scale - self.min_scale
) + self.min_scale
if self.scale_type == 'long':
scale = np.array([max(shape[0], shape[1]) / 1.0] * 2)
elif self.scale_type == 'short':
scale = np.array([min(shape[0], shape[1]) / 1.0] * 2)
elif self.scale_type == 'wh':
scale = shape
else:
raise ValueError('Unknown scale type: {}'.format(self.scale_type))
roi_size = aug_scale * scale
dx = int(0)
dy = int(0)
if self.max_shift > 0:
dx = np.random.randint(-self.max_shift * roi_size[0],
self.max_shift * roi_size[0])
            # draw the y-shift from the roi's y extent
            dy = np.random.randint(-self.max_shift * roi_size[1],
                                   self.max_shift * roi_size[1])
center += np.array([dx, dy])
input_size = 2 * center
        if self.trainsize != -1:
            dsize = self.trainsize
            imgshape = tuple(dsize)
        else:
            dsize = scale
            imgshape = tuple(shape.tolist())
image_affine_mat = self._get_affine_matrix(center, roi_size, dsize,
degree)[:2]
image = cv2.warpAffine(
image,
image_affine_mat,
imgshape,
flags=cv2.INTER_LINEAR,
borderValue=self.boldervalue)
if self.hmsize is None:
kpts, mask, gt_bbox = self._affine_joints_mask(
degree, center, roi_size, dsize, keypoints, heatmap_mask,
gt_bbox)
records['image'] = image
if kpts is not None: records['gt_joints'] = kpts
if mask is not None: records['mask'] = mask
if gt_bbox is not None: records['gt_bbox'] = gt_bbox
return records
kpts_lst = []
mask_lst = []
for hmsize in self.hmsize:
kpts, mask, gt_bbox = self._affine_joints_mask(
degree, center, roi_size, [hmsize, hmsize], keypoints,
heatmap_mask, gt_bbox)
kpts_lst.append(kpts)
mask_lst.append(mask)
records['image'] = image
if 'gt_joints' in records:
records['gt_joints'] = kpts_lst
if 'mask' in records:
records['mask'] = mask_lst
if 'gt_bbox' in records:
records['gt_bbox'] = gt_bbox
return records
@register_keypointop
class EvalAffine(object):
"""apply affine transform to image
resize the short of [h,w] to standard size for eval
Args:
size (int): the standard length used to train, the 'short' of [h,w] will be resize to trainsize for standard
records(dict): the dict contained the image, mask and coords
Returns:
records(dict): contain the image, mask and coords after tranformed
"""
def __init__(self, size, stride=64):
super(EvalAffine, self).__init__()
self.size = size
self.stride = stride
def __call__(self, records):
image = records['image']
mask = records['mask'] if 'mask' in records else None
s = self.size
h, w, _ = image.shape
trans, size_resized = get_affine_mat_kernel(h, w, s, inv=False)
image_resized = cv2.warpAffine(image, trans, size_resized)
if mask is not None:
mask = cv2.warpAffine(mask, trans, size_resized)
records['mask'] = mask
if 'gt_joints' in records:
del records['gt_joints']
records['image'] = image_resized
records['scale_factor'] = self.size / min(h, w)
return records
@register_keypointop
class NormalizePermute(object):
def __init__(self,
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.120, 57.375],
is_scale=True):
super(NormalizePermute, self).__init__()
self.mean = mean
self.std = std
self.is_scale = is_scale
def __call__(self, records):
image = records['image']
image = image.astype(np.float32)
if self.is_scale:
image /= 255.
image = image.transpose((2, 0, 1))
mean = np.array(self.mean, dtype=np.float32)
std = np.array(self.std, dtype=np.float32)
invstd = 1. / std
for v, m, s in zip(image, mean, invstd):
v.__isub__(m).__imul__(s)
records['image'] = image
return records
@register_keypointop
class TagGenerate(object):
"""record gt coords for aeloss to sample coords value in tagmaps
Args:
num_joints (int): the keypoint numbers of dataset to train
num_people (int): maxmum people to support for sample aeloss
records(dict): the dict contained the image, mask and coords
Returns:
records(dict): contain the gt coords used in tagmap
"""
def __init__(self, num_joints, max_people=30):
super(TagGenerate, self).__init__()
self.max_people = max_people
self.num_joints = num_joints
def __call__(self, records):
kpts_lst = records['gt_joints']
kpts = kpts_lst[0]
tagmap = np.zeros((self.max_people, self.num_joints, 4), dtype=np.int64)
inds = np.where(kpts[..., 2] > 0)
p, j = inds[0], inds[1]
visible = kpts[inds]
        # tagmap is [max_people, num_joints, 4]; the last dim stores (j, y, x, valid)
tagmap[p, j, 0] = j
tagmap[p, j, 1] = visible[..., 1] # y
tagmap[p, j, 2] = visible[..., 0] # x
tagmap[p, j, 3] = 1
records['tagmap'] = tagmap
del records['gt_joints']
return records
@register_keypointop
class ToHeatmaps(object):
"""to generate the gaussin heatmaps of keypoint for heatmap loss
Args:
num_joints (int): the keypoint numbers of dataset to train
hmsize (list[2]): output heatmap's shape list of different scale outputs of higherhrnet
sigma (float): the std of gaussin kernel genereted
records(dict): the dict contained the image, mask and coords
Returns:
records(dict): contain the heatmaps used to heatmaploss
"""
def __init__(self, num_joints, hmsize, sigma=None):
super(ToHeatmaps, self).__init__()
self.num_joints = num_joints
self.hmsize = np.array(hmsize)
if sigma is None:
sigma = hmsize[0] // 64
self.sigma = sigma
r = 6 * sigma + 3
x = np.arange(0, r, 1, np.float32)
y = x[:, None]
x0, y0 = 3 * sigma + 1, 3 * sigma + 1
self.gaussian = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2))
def __call__(self, records):
kpts_lst = records['gt_joints']
mask_lst = records['mask']
for idx, hmsize in enumerate(self.hmsize):
mask = mask_lst[idx]
kpts = kpts_lst[idx]
heatmaps = np.zeros((self.num_joints, hmsize, hmsize))
inds = np.where(kpts[..., 2] > 0)
visible = kpts[inds].astype(np.int64)[..., :2]
ul = np.round(visible - 3 * self.sigma - 1)
br = np.round(visible + 3 * self.sigma + 2)
sul = np.maximum(0, -ul)
sbr = np.minimum(hmsize, br) - ul
dul = np.clip(ul, 0, hmsize - 1)
dbr = np.clip(br, 0, hmsize)
for i in range(len(visible)):
if visible[i][0] < 0 or visible[i][1] < 0 or visible[i][
0] >= hmsize or visible[i][1] >= hmsize:
continue
dx1, dy1 = dul[i]
dx2, dy2 = dbr[i]
sx1, sy1 = sul[i]
sx2, sy2 = sbr[i]
heatmaps[inds[1][i], dy1:dy2, dx1:dx2] = np.maximum(
self.gaussian[sy1:sy2, sx1:sx2],
heatmaps[inds[1][i], dy1:dy2, dx1:dx2])
records['heatmap_gt{}x'.format(idx + 1)] = heatmaps
records['mask_{}x'.format(idx + 1)] = mask
del records['mask']
return records
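# A small numeric sketch (hypothetical helper, illustrative sigma) of the
# Gaussian kernel that ToHeatmaps precomputes: a square patch with side
# 6*sigma+3 whose peak value is 1 at the center (3*sigma+1, 3*sigma+1).
def _demo_to_heatmaps_kernel(sigma=2):
    r = 6 * sigma + 3
    x = np.arange(0, r, 1, np.float32)
    y = x[:, None]
    x0 = y0 = 3 * sigma + 1
    g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2))
    return g.shape, g[y0, x0]  # ((15, 15), 1.0) for sigma=2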
@register_keypointop
class RandomFlipHalfBodyTransform(object):
"""apply data augment to image and coords
to achieve the flip, scale, rotate and half body transform effect for training image
Args:
trainsize (list):[w, h], Image target size
upper_body_ids (list): The upper body joint ids
flip_pairs (list): The left-right joints exchange order list
pixel_std (int): The pixel std of the scale
scale (float): The scale factor to transform the image
rot (int): The rotate factor to transform the image
num_joints_half_body (int): The joints threshold of the half body transform
prob_half_body (float): The threshold of the half body transform
flip (bool): Whether to flip the image
Returns:
records(dict): contain the image and coords after tranformed
"""
def __init__(self,
trainsize,
upper_body_ids,
flip_pairs,
pixel_std,
scale=0.35,
rot=40,
num_joints_half_body=8,
prob_half_body=0.3,
flip=True,
rot_prob=0.6):
super(RandomFlipHalfBodyTransform, self).__init__()
self.trainsize = trainsize
self.upper_body_ids = upper_body_ids
self.flip_pairs = flip_pairs
self.pixel_std = pixel_std
self.scale = scale
self.rot = rot
self.num_joints_half_body = num_joints_half_body
self.prob_half_body = prob_half_body
self.flip = flip
self.aspect_ratio = trainsize[0] * 1.0 / trainsize[1]
self.rot_prob = rot_prob
def halfbody_transform(self, joints, joints_vis):
upper_joints = []
lower_joints = []
for joint_id in range(joints.shape[0]):
if joints_vis[joint_id][0] > 0:
if joint_id in self.upper_body_ids:
upper_joints.append(joints[joint_id])
else:
lower_joints.append(joints[joint_id])
        # use a uniform sample for the coin flip (randn would skew the probability)
        if np.random.rand() < 0.5 and len(upper_joints) > 2:
selected_joints = upper_joints
else:
selected_joints = lower_joints if len(
lower_joints) > 2 else upper_joints
if len(selected_joints) < 2:
return None, None
selected_joints = np.array(selected_joints, dtype=np.float32)
center = selected_joints.mean(axis=0)[:2]
left_top = np.amin(selected_joints, axis=0)
right_bottom = np.amax(selected_joints, axis=0)
w = right_bottom[0] - left_top[0]
h = right_bottom[1] - left_top[1]
if w > self.aspect_ratio * h:
h = w * 1.0 / self.aspect_ratio
elif w < self.aspect_ratio * h:
w = h * self.aspect_ratio
scale = np.array(
[w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
dtype=np.float32)
scale = scale * 1.5
return center, scale
def flip_joints(self, joints, joints_vis, width, matched_parts):
joints[:, 0] = width - joints[:, 0] - 1
for pair in matched_parts:
joints[pair[0], :], joints[pair[1], :] = \
joints[pair[1], :], joints[pair[0], :].copy()
joints_vis[pair[0], :], joints_vis[pair[1], :] = \
joints_vis[pair[1], :], joints_vis[pair[0], :].copy()
return joints * joints_vis, joints_vis
def __call__(self, records):
image = records['image']
joints = records['gt_joints']
joints_vis = records['joints_vis']
c = records['center']
s = records['scale']
r = 0
if (np.sum(joints_vis[:, 0]) > self.num_joints_half_body and
np.random.rand() < self.prob_half_body):
c_half_body, s_half_body = self.halfbody_transform(joints,
joints_vis)
if c_half_body is not None and s_half_body is not None:
c, s = c_half_body, s_half_body
sf = self.scale
rf = self.rot
s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
r = np.clip(np.random.randn() * rf, -rf * 2,
rf * 2) if np.random.random() <= self.rot_prob else 0
if self.flip and np.random.random() <= 0.5:
image = image[:, ::-1, :]
joints, joints_vis = self.flip_joints(
joints, joints_vis, image.shape[1], self.flip_pairs)
c[0] = image.shape[1] - c[0] - 1
records['image'] = image
records['gt_joints'] = joints
records['joints_vis'] = joints_vis
records['center'] = c
records['scale'] = s
records['rotate'] = r
return records
@register_keypointop
class AugmentationbyInformantionDropping(object):
"""AID: Augmentation by Informantion Dropping. Please refer
to https://arxiv.org/abs/2008.07139
Args:
prob_cutout (float): The probability of the Cutout augmentation.
offset_factor (float): Offset factor of cutout center.
num_patch (int): Number of patches to be cutout.
records(dict): the dict contained the image and coords
Returns:
records (dict): contain the image and coords after tranformed
"""
def __init__(self,
trainsize,
prob_cutout=0.0,
offset_factor=0.2,
num_patch=1):
self.prob_cutout = prob_cutout
self.offset_factor = offset_factor
self.num_patch = num_patch
self.trainsize = trainsize
def _cutout(self, img, joints, joints_vis):
height, width, _ = img.shape
img = img.reshape((height * width, -1))
feat_x_int = np.arange(0, width)
feat_y_int = np.arange(0, height)
feat_x_int, feat_y_int = np.meshgrid(feat_x_int, feat_y_int)
feat_x_int = feat_x_int.reshape((-1, ))
feat_y_int = feat_y_int.reshape((-1, ))
for _ in range(self.num_patch):
vis_idx, _ = np.where(joints_vis > 0)
occlusion_joint_id = np.random.choice(vis_idx)
center = joints[occlusion_joint_id, 0:2]
offset = np.random.randn(2) * self.trainsize[0] * self.offset_factor
center = center + offset
radius = np.random.uniform(0.1, 0.2) * self.trainsize[0]
x_offset = (center[0] - feat_x_int) / radius
y_offset = (center[1] - feat_y_int) / radius
dis = x_offset**2 + y_offset**2
keep_pos = np.where((dis <= 1) & (dis >= 0))[0]
img[keep_pos, :] = 0
img = img.reshape((height, width, -1))
return img
def __call__(self, records):
img = records['image']
joints = records['gt_joints']
joints_vis = records['joints_vis']
if np.random.rand() < self.prob_cutout:
img = self._cutout(img, joints, joints_vis)
records['image'] = img
return records
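# A sketch (hypothetical helper) of the cutout geometry used by the AID
# transform above: pixels whose squared distance to the patch center, measured
# in units of the sampled radius, is <= 1 fall inside the occluded disc.
def _demo_aid_mask(height=8, width=8, center=(4, 4), radius=2.0):
    xs, ys = np.meshgrid(np.arange(width), np.arange(height))
    dis = ((center[0] - xs) / radius)**2 + ((center[1] - ys) / radius)**2
    return dis <= 1  # boolean disc mask, True where pixels would be zeroed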
@register_keypointop
class TopDownRandomFlip(object):
"""Data augmentation with random image flip.
Args:
        flip_perm (list[tuple]): Pairs of keypoints which are mirrored
(for example, left ear and right ear).
flip_prob (float): Probability of flip.
"""
def __init__(self, flip_perm=[], flip_prob=0.5):
self.flip_perm = flip_perm
self.flip_prob = flip_prob
def flip_joints(self, joints_3d, joints_3d_visible, img_width, flip_pairs):
assert len(joints_3d) == len(joints_3d_visible)
assert img_width > 0
joints_3d_flipped = joints_3d.copy()
joints_3d_visible_flipped = joints_3d_visible.copy()
# Swap left-right parts
for left, right in flip_pairs:
joints_3d_flipped[left, :] = joints_3d[right, :]
joints_3d_flipped[right, :] = joints_3d[left, :]
joints_3d_visible_flipped[left, :] = joints_3d_visible[right, :]
joints_3d_visible_flipped[right, :] = joints_3d_visible[left, :]
# Flip horizontally
joints_3d_flipped[:, 0] = img_width - 1 - joints_3d_flipped[:, 0]
joints_3d_flipped = joints_3d_flipped * (joints_3d_visible_flipped > 0)
return joints_3d_flipped, joints_3d_visible_flipped
def __call__(self, results):
"""Perform data augmentation with random image flip."""
        # skip with probability (1 - flip_prob); flip otherwise
        if np.random.rand() > self.flip_prob:
            return results
img = results['image']
joints_3d = results['gt_joints']
joints_3d_visible = results['joints_vis']
center = results['center']
        # flip the image(s) and remap the joints and center accordingly
        if not isinstance(img, list):
            img = img[:, ::-1, :]
            joints_3d, joints_3d_visible = self.flip_joints(
                joints_3d, joints_3d_visible, img.shape[1], self.flip_perm)
            center[0] = img.shape[1] - center[0] - 1
        else:
            img = [i[:, ::-1, :] for i in img]
            joints_3d, joints_3d_visible = self.flip_joints(
                joints_3d, joints_3d_visible, img[0].shape[1], self.flip_perm)
            center[0] = img[0].shape[1] - center[0] - 1
results['image'] = img
results['gt_joints'] = joints_3d
results['joints_vis'] = joints_3d_visible
results['center'] = center
return results
@register_keypointop
class TopDownRandomShiftBboxCenter(object):
"""Random shift the bbox center.
Args:
shift_factor (float): The factor to control the shift range, which is
scale*pixel_std*scale_factor. Default: 0.16
shift_prob (float): Probability of applying random shift. Default: 0.3
"""
def __init__(self, shift_factor=0.16, shift_prob=0.3):
self.shift_factor = shift_factor
self.shift_prob = shift_prob
def __call__(self, results):
center = results['center']
scale = results['scale']
if np.random.rand() < self.shift_prob:
center += np.random.uniform(
-1, 1, 2) * self.shift_factor * scale * 200.0
results['center'] = center
return results
@register_keypointop
class TopDownGetRandomScaleRotation(object):
"""Data augmentation with random scaling & rotating.
Args:
rot_factor (int): Rotating to ``[-2*rot_factor, 2*rot_factor]``.
scale_factor (float): Scaling to ``[1-scale_factor, 1+scale_factor]``.
rot_prob (float): Probability of random rotation.
"""
def __init__(self, rot_factor=40, scale_factor=0.5, rot_prob=0.6):
self.rot_factor = rot_factor
self.scale_factor = scale_factor
self.rot_prob = rot_prob
def __call__(self, results):
"""Perform data augmentation with random scaling & rotating."""
s = results['scale']
sf = self.scale_factor
rf = self.rot_factor
s_factor = np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
s = s * s_factor
r_factor = np.clip(np.random.randn() * rf, -rf * 2, rf * 2)
r = r_factor if np.random.rand() <= self.rot_prob else 0
results['scale'] = s
results['rotate'] = r
return results
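# A sketch (hypothetical helper) of the sampling ranges used by
# TopDownGetRandomScaleRotation: the scale factor is a clipped normal around 1
# inside [1 - sf, 1 + sf]; the rotation is a clipped normal inside
# [-2 * rf, 2 * rf], applied with probability rot_prob.
def _demo_scale_rot_sampling(sf=0.5, rf=40):
    s_factor = np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
    r_factor = np.clip(np.random.randn() * rf, -rf * 2, rf * 2)
    assert 1 - sf <= s_factor <= 1 + sf and -2 * rf <= r_factor <= 2 * rf
    return s_factor, r_factor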
@register_keypointop
class TopDownAffine(object):
"""apply affine transform to image and coords
Args:
trainsize (list): [w, h], the standard size used to train
use_udp (bool): whether to use Unbiased Data Processing.
records(dict): the dict contained the image and coords
Returns:
records (dict): contain the image and coords after tranformed
"""
def __init__(self, trainsize, use_udp=False):
self.trainsize = trainsize
self.use_udp = use_udp
def __call__(self, records):
image = records['image']
joints = records['gt_joints']
joints_vis = records['joints_vis']
rot = records['rotate'] if "rotate" in records else 0
if self.use_udp:
trans = get_warp_matrix(
rot, records['center'] * 2.0,
[self.trainsize[0] - 1.0, self.trainsize[1] - 1.0],
records['scale'] * 200.0)
image = cv2.warpAffine(
image,
trans, (int(self.trainsize[0]), int(self.trainsize[1])),
flags=cv2.INTER_LINEAR)
joints[:, 0:2] = warp_affine_joints(joints[:, 0:2].copy(), trans)
else:
trans = get_affine_transform(records['center'], records['scale'] *
200, rot, self.trainsize)
image = cv2.warpAffine(
image,
trans, (int(self.trainsize[0]), int(self.trainsize[1])),
flags=cv2.INTER_LINEAR)
for i in range(joints.shape[0]):
if joints_vis[i, 0] > 0.0:
joints[i, 0:2] = affine_transform(joints[i, 0:2], trans)
records['image'] = image
records['gt_joints'] = joints
return records
@register_keypointop
class SinglePoseAffine(object):
"""apply affine transform to image and coords
Args:
trainsize (list): [w, h], the standard size used to train
use_udp (bool): whether to use Unbiased Data Processing.
records(dict): the dict contained the image and coords
Returns:
records (dict): contain the image and coords after tranformed
"""
def __init__(self,
trainsize,
rotate=[1.0, 30],
scale=[1.0, 0.25],
use_udp=False):
self.trainsize = trainsize
self.use_udp = use_udp
self.rot_prob = rotate[0]
self.rot_range = rotate[1]
self.scale_prob = scale[0]
self.scale_ratio = scale[1]
def __call__(self, records):
image = records['image']
        if 'joints_2d' in records:
            joints = records['joints_2d']
            joints_vis = records.get('joints_vis', np.ones((len(joints), 1)))
rot = 0
s = 1.
if np.random.random() < self.rot_prob:
rot = np.clip(np.random.randn() * self.rot_range,
-self.rot_range * 2, self.rot_range * 2)
if np.random.random() < self.scale_prob:
s = np.clip(np.random.randn() * self.scale_ratio + 1,
1 - self.scale_ratio, 1 + self.scale_ratio)
if self.use_udp:
trans = get_warp_matrix(
rot,
np.array(records['bbox_center']) * 2.0,
[self.trainsize[0] - 1.0, self.trainsize[1] - 1.0],
records['bbox_scale'] * 200.0 * s)
image = cv2.warpAffine(
image,
trans, (int(self.trainsize[0]), int(self.trainsize[1])),
flags=cv2.INTER_LINEAR)
if 'joints_2d' in records:
joints[:, 0:2] = warp_affine_joints(joints[:, 0:2].copy(),
trans)
else:
trans = get_affine_transform(
np.array(records['bbox_center']),
records['bbox_scale'] * s * 200, rot, self.trainsize)
image = cv2.warpAffine(
image,
trans, (int(self.trainsize[0]), int(self.trainsize[1])),
flags=cv2.INTER_LINEAR)
if 'joints_2d' in records:
for i in range(len(joints)):
if joints_vis[i, 0] > 0.0:
joints[i, 0:2] = affine_transform(joints[i, 0:2], trans)
if 'joints_3d' in records:
pose3d = records['joints_3d']
if not rot == 0:
trans_3djoints = np.eye(3)
rot_rad = -rot * np.pi / 180
sn, cs = np.sin(rot_rad), np.cos(rot_rad)
trans_3djoints[0, :2] = [cs, -sn]
trans_3djoints[1, :2] = [sn, cs]
pose3d[:, :3] = np.einsum('ij,kj->ki', trans_3djoints,
pose3d[:, :3])
records['joints_3d'] = pose3d
records['image'] = image
if 'joints_2d' in records:
records['joints_2d'] = joints
return records
@register_keypointop
class NoiseJitter(object):
"""apply NoiseJitter to image
Args:
noise_factor (float): the noise factor ratio used to generate the jitter
Returns:
records (dict): contain the image and coords after tranformed
"""
def __init__(self, noise_factor=0.4):
self.noise_factor = noise_factor
def __call__(self, records):
self.pn = np.random.uniform(1 - self.noise_factor,
1 + self.noise_factor, 3)
rgb_img = records['image']
rgb_img[:, :, 0] = np.minimum(
255.0, np.maximum(0.0, rgb_img[:, :, 0] * self.pn[0]))
rgb_img[:, :, 1] = np.minimum(
255.0, np.maximum(0.0, rgb_img[:, :, 1] * self.pn[1]))
rgb_img[:, :, 2] = np.minimum(
255.0, np.maximum(0.0, rgb_img[:, :, 2] * self.pn[2]))
records['image'] = rgb_img
return records
@register_keypointop
class FlipPose(object):
"""random apply flip to image
Args:
noise_factor (float): the noise factor ratio used to generate the jitter
Returns:
records (dict): contain the image and coords after tranformed
"""
def __init__(self, flip_prob=0.5, img_res=224, num_joints=14):
        self.flip_prob = flip_prob
self.img_res = img_res
if num_joints == 24:
self.perm = [
5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13, 14, 15, 16, 17,
18, 19, 21, 20, 23, 22
]
elif num_joints == 14:
self.perm = [5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13]
        else:
            raise ValueError(
                "unsupported num_joints in FlipPose: {}".format(num_joints))
def __call__(self, records):
        if np.random.random() < self.flip_prob:
img = records['image']
img = np.fliplr(img)
if 'joints_2d' in records:
joints_2d = records['joints_2d']
joints_2d = joints_2d[self.perm]
joints_2d[:, 0] = self.img_res - joints_2d[:, 0]
records['joints_2d'] = joints_2d
if 'joints_3d' in records:
joints_3d = records['joints_3d']
joints_3d = joints_3d[self.perm]
joints_3d[:, 0] = -joints_3d[:, 0]
records['joints_3d'] = joints_3d
records['image'] = img
return records
@register_keypointop
class TopDownEvalAffine(object):
"""apply affine transform to image and coords
Args:
trainsize (list): [w, h], the standard size used to train
use_udp (bool): whether to use Unbiased Data Processing.
records(dict): the dict contained the image and coords
Returns:
records (dict): contain the image and coords after tranformed
"""
def __init__(self, trainsize, use_udp=False):
self.trainsize = trainsize
self.use_udp = use_udp
def __call__(self, records):
image = records['image']
rot = 0
imshape = records['im_shape'][::-1]
center = imshape / 2.
scale = imshape
if self.use_udp:
trans = get_warp_matrix(
rot, center * 2.0,
[self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], scale)
image = cv2.warpAffine(
image,
trans, (int(self.trainsize[0]), int(self.trainsize[1])),
flags=cv2.INTER_LINEAR)
else:
trans = get_affine_transform(center, scale, rot, self.trainsize)
image = cv2.warpAffine(
image,
trans, (int(self.trainsize[0]), int(self.trainsize[1])),
flags=cv2.INTER_LINEAR)
records['image'] = image
return records
@register_keypointop
class ToHeatmapsTopDown(object):
"""to generate the gaussin heatmaps of keypoint for heatmap loss
Args:
hmsize (list): [w, h] output heatmap's size
sigma (float): the std of gaussin kernel genereted
records(dict): the dict contained the image and coords
Returns:
records (dict): contain the heatmaps used to heatmaploss
"""
def __init__(self, hmsize, sigma):
super(ToHeatmapsTopDown, self).__init__()
self.hmsize = np.array(hmsize)
self.sigma = sigma
def __call__(self, records):
"""refer to
https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
Copyright (c) Microsoft, under the MIT License.
"""
joints = records['gt_joints']
joints_vis = records['joints_vis']
num_joints = joints.shape[0]
image_size = np.array(
[records['image'].shape[1], records['image'].shape[0]])
target_weight = np.ones((num_joints, 1), dtype=np.float32)
target_weight[:, 0] = joints_vis[:, 0]
target = np.zeros(
(num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32)
tmp_size = self.sigma * 3
feat_stride = image_size / self.hmsize
for joint_id in range(num_joints):
mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5)
mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5)
# Check that any part of the gaussian is in-bounds
ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[
0] < 0 or br[1] < 0:
# If not, just return the image as is
target_weight[joint_id] = 0
continue
            # Generate gaussian
size = 2 * tmp_size + 1
x = np.arange(0, size, 1, np.float32)
y = x[:, np.newaxis]
x0 = y0 = size // 2
# The gaussian is not normalized, we want the center value to equal 1
g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * self.sigma**2))
# Usable gaussian range
g_x = max(0, -ul[0]), min(br[0], self.hmsize[0]) - ul[0]
g_y = max(0, -ul[1]), min(br[1], self.hmsize[1]) - ul[1]
# Image range
img_x = max(0, ul[0]), min(br[0], self.hmsize[0])
img_y = max(0, ul[1]), min(br[1], self.hmsize[1])
v = target_weight[joint_id]
if v > 0.5:
target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[
0]:g_y[1], g_x[0]:g_x[1]]
records['target'] = target
records['target_weight'] = target_weight
del records['gt_joints'], records['joints_vis']
return records
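# A minimal end-to-end sketch of ToHeatmapsTopDown (illustrative shapes and
# values; the helper name is hypothetical): the heatmap peak of a visible joint
# lands at approximately joint_xy / feat_stride.
def _demo_heatmaps_topdown():
    rec = {
        'image': np.zeros((256, 192, 3), dtype=np.float32),  # (h, w, c)
        'gt_joints': np.array([[96., 128.]], dtype=np.float32),
        'joints_vis': np.ones((1, 1), dtype=np.float32),
    }
    out = ToHeatmapsTopDown(hmsize=[48, 64], sigma=2)(rec)
    hm = out['target'][0]  # shape (64, 48)
    y, x = np.unravel_index(hm.argmax(), hm.shape)
    return x, y  # (24, 32) == (96 / 4, 128 / 4)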
@register_keypointop
class ToHeatmapsTopDown_DARK(object):
"""to generate the gaussin heatmaps of keypoint for heatmap loss
Args:
hmsize (list): [w, h] output heatmap's size
sigma (float): the std of gaussin kernel genereted
records(dict): the dict contained the image and coords
Returns:
records (dict): contain the heatmaps used to heatmaploss
"""
def __init__(self, hmsize, sigma):
super(ToHeatmapsTopDown_DARK, self).__init__()
self.hmsize = np.array(hmsize)
self.sigma = sigma
def __call__(self, records):
joints = records['gt_joints']
joints_vis = records['joints_vis']
num_joints = joints.shape[0]
image_size = np.array(
[records['image'].shape[1], records['image'].shape[0]])
target_weight = np.ones((num_joints, 1), dtype=np.float32)
target_weight[:, 0] = joints_vis[:, 0]
target = np.zeros(
(num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32)
tmp_size = self.sigma * 3
feat_stride = image_size / self.hmsize
for joint_id in range(num_joints):
mu_x = joints[joint_id][0] / feat_stride[0]
mu_y = joints[joint_id][1] / feat_stride[1]
# Check that any part of the gaussian is in-bounds
ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[
0] < 0 or br[1] < 0:
# If not, just return the image as is
target_weight[joint_id] = 0
continue
x = np.arange(0, self.hmsize[0], 1, np.float32)
y = np.arange(0, self.hmsize[1], 1, np.float32)
y = y[:, np.newaxis]
v = target_weight[joint_id]
if v > 0.5:
target[joint_id] = np.exp(-(
(x - mu_x)**2 + (y - mu_y)**2) / (2 * self.sigma**2))
records['target'] = target
records['target_weight'] = target_weight
del records['gt_joints'], records['joints_vis']
return records
@register_keypointop
class ToHeatmapsTopDown_UDP(object):
"""This code is based on:
https://github.com/HuangJunJie2017/UDP-Pose/blob/master/deep-high-resolution-net.pytorch/lib/dataset/JointsDataset.py
to generate the gaussian heatmaps of keypoint for heatmap loss.
ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing
for Human Pose Estimation (CVPR 2020).
Args:
hmsize (list): [w, h] output heatmap's size
        sigma (float): the std of the generated Gaussian kernel
        records (dict): the dict containing the image and coords
Returns:
records (dict): contain the heatmaps used to heatmaploss
"""
def __init__(self, hmsize, sigma):
super(ToHeatmapsTopDown_UDP, self).__init__()
self.hmsize = np.array(hmsize)
self.sigma = sigma
def __call__(self, records):
joints = records['gt_joints']
joints_vis = records['joints_vis']
num_joints = joints.shape[0]
image_size = np.array(
[records['image'].shape[1], records['image'].shape[0]])
target_weight = np.ones((num_joints, 1), dtype=np.float32)
target_weight[:, 0] = joints_vis[:, 0]
target = np.zeros(
(num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32)
tmp_size = self.sigma * 3
size = 2 * tmp_size + 1
x = np.arange(0, size, 1, np.float32)
y = x[:, None]
feat_stride = (image_size - 1.0) / (self.hmsize - 1.0)
for joint_id in range(num_joints):
mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5)
mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5)
# Check that any part of the gaussian is in-bounds
ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[
0] < 0 or br[1] < 0:
# If not, just return the image as is
target_weight[joint_id] = 0
continue
mu_x_ac = joints[joint_id][0] / feat_stride[0]
mu_y_ac = joints[joint_id][1] / feat_stride[1]
x0 = y0 = size // 2
x0 += mu_x_ac - mu_x
y0 += mu_y_ac - mu_y
g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * self.sigma**2))
# Usable gaussian range
g_x = max(0, -ul[0]), min(br[0], self.hmsize[0]) - ul[0]
g_y = max(0, -ul[1]), min(br[1], self.hmsize[1]) - ul[1]
# Image range
img_x = max(0, ul[0]), min(br[0], self.hmsize[0])
img_y = max(0, ul[1]), min(br[1], self.hmsize[1])
v = target_weight[joint_id]
if v > 0.5:
target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[
0]:g_y[1], g_x[0]:g_x[1]]
records['target'] = target
records['target_weight'] = target_weight
del records['gt_joints'], records['joints_vis']
return records
from typing import Optional, Tuple, Union, List
import numbers
def _scale_size(
size: Tuple[int, int],
scale: Union[float, int, tuple], ) -> Tuple[int, int]:
"""Rescale a size by a ratio.
Args:
size (tuple[int]): (w, h).
scale (float | tuple(float)): Scaling factor.
Returns:
tuple[int]: scaled size.
"""
if isinstance(scale, (float, int)):
scale = (scale, scale)
w, h = size
return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5)
def rescale_size(old_size: tuple,
scale: Union[float, int, tuple],
return_scale: bool=False) -> tuple:
"""Calculate the new size to be rescaled to.
Args:
old_size (tuple[int]): The old size (w, h) of image.
        scale (float | list[int] | tuple[int]): The scaling factor or maximum
            size. If it is a float number, the image will be rescaled by this
            factor; if it is a list/tuple of 2 integers, the image will be
            rescaled as large as possible within the scale.
return_scale (bool): Whether to return the scaling factor besides the
rescaled image size.
Returns:
tuple[int]: The new rescaled image size.
"""
w, h = old_size
if isinstance(scale, (float, int)):
if scale <= 0:
raise ValueError(f'Invalid scale {scale}, must be positive.')
scale_factor = scale
    elif isinstance(scale, (list, tuple)):
max_long_edge = max(scale)
max_short_edge = min(scale)
scale_factor = min(max_long_edge / max(h, w),
max_short_edge / min(h, w))
    else:
        raise TypeError(
            f'Scale must be a number or a list/tuple of int, but got {type(scale)}')
new_size = _scale_size((w, h), scale_factor)
if return_scale:
return new_size, scale_factor
else:
return new_size
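# A usage sketch for rescale_size (illustrative values; the helper name is
# hypothetical): a float scales both edges, while a [long, short] list or tuple
# picks the largest factor that keeps the result within both bounds.
def _demo_rescale_size():
    assert _scale_size((1280, 720), 0.5) == (640, 360)
    new_size, factor = rescale_size((1280, 720), [1333, 800], return_scale=True)
    return new_size, factor  # (1333, 750), ~1.0414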
def imrescale(img: np.ndarray,
scale: Union[float, Tuple[int, int]],
return_scale: bool=False,
interpolation: str='bilinear',
backend: Optional[str]=None) -> Union[np.ndarray, Tuple[
np.ndarray, float]]:
"""Resize image while keeping the aspect ratio.
Args:
img (ndarray): The input image.
scale (float | tuple[int]): The scaling factor or maximum size.
If it is a float number, then the image will be rescaled by this
factor, else if it is a tuple of 2 integers, then the image will
be rescaled as large as possible within the scale.
return_scale (bool): Whether to return the scaling factor besides the
rescaled image.
interpolation (str): Same as :func:`resize`.
backend (str | None): Same as :func:`resize`.
Returns:
ndarray: The rescaled image.
"""
h, w = img.shape[:2]
new_size, scale_factor = rescale_size((w, h), scale, return_scale=True)
rescaled_img = imresize(
img, new_size, interpolation=interpolation, backend=backend)
if return_scale:
return rescaled_img, scale_factor
else:
return rescaled_img
def imresize(
img: np.ndarray,
size: Tuple[int, int],
return_scale: bool=False,
interpolation: str='bilinear',
out: Optional[np.ndarray]=None,
backend: Optional[str]=None,
interp=cv2.INTER_LINEAR, ) -> Union[Tuple[np.ndarray, float, float],
np.ndarray]:
"""Resize image to a given size.
Args:
img (ndarray): The input image.
size (tuple[int]): Target size (w, h).
return_scale (bool): Whether to return `w_scale` and `h_scale`.
interpolation (str): Interpolation method, accepted values are
"nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
backend, "nearest", "bilinear" for 'pillow' backend.
out (ndarray): The output destination.
backend (str | None): The image resize backend type. Options are `cv2`,
`pillow`, `None`. If backend is None, the global imread_backend
specified by ``mmcv.use_backend()`` will be used. Default: None.
Returns:
tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or
`resized_img`.
"""
h, w = img.shape[:2]
if backend is None:
backend = imread_backend
if backend not in ['cv2', 'pillow']:
raise ValueError(f'backend: {backend} is not supported for resize.'
f"Supported backends are 'cv2', 'pillow'")
if backend == 'pillow':
        assert img.dtype == np.uint8, 'Pillow backend only supports uint8 images'
pil_image = Image.fromarray(img)
pil_image = pil_image.resize(size, pillow_interp_codes[interpolation])
resized_img = np.array(pil_image)
    else:
        # note: the cv2 branch uses the ``interp`` flag rather than the
        # ``interpolation`` string argument
        resized_img = cv2.resize(img, size, dst=out, interpolation=interp)
if not return_scale:
return resized_img
else:
w_scale = size[0] / w
h_scale = size[1] / h
return resized_img, w_scale, h_scale
class PETR_Resize:
"""Resize images & bbox & mask.
This transform resizes the input image to some scale. Bboxes and masks are
then resized with the same scale factor. If the input dict contains the key
"scale", then the scale in the input dict is used, otherwise the specified
scale in the init method is used. If the input dict contains the key
"scale_factor" (if MultiScaleFlipAug does not give img_scale but
scale_factor), the actual scale will be computed by image shape and
scale_factor.
`img_scale` can either be a tuple (single-scale) or a list of tuple
(multi-scale). There are 3 multiscale modes:
- ``ratio_range is not None``: randomly sample a ratio from the ratio \
range and multiply it with the image scale.
- ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \
sample a scale from the multiscale range.
- ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \
sample a scale from multiple scales.
Args:
img_scale (tuple or list[tuple]): Images scales for resizing.
multiscale_mode (str): Either "range" or "value".
ratio_range (tuple[float]): (min_ratio, max_ratio)
keep_ratio (bool): Whether to keep the aspect ratio when resizing the
image.
bbox_clip_border (bool, optional): Whether to clip the objects outside
the border of the image. In some dataset like MOT17, the gt bboxes
are allowed to cross the border of images. Therefore, we don't
need to clip the gt bboxes in these cases. Defaults to True.
        backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
            These two backends generate slightly different results. Defaults
            to 'cv2'.
interpolation (str): Interpolation method, accepted values are
"nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
backend, "nearest", "bilinear" for 'pillow' backend.
override (bool, optional): Whether to override `scale` and
`scale_factor` so as to call resize twice. Default False. If True,
after the first resizing, the existed `scale` and `scale_factor`
will be ignored so the second resizing can be allowed.
This option is a work-around for multiple times of resize in DETR.
Defaults to False.
"""
def __init__(self,
img_scale=None,
multiscale_mode='range',
ratio_range=None,
keep_ratio=True,
bbox_clip_border=True,
backend='cv2',
interpolation='bilinear',
override=False,
keypoint_clip_border=True):
if img_scale is None:
self.img_scale = None
else:
if isinstance(img_scale, list):
self.img_scale = img_scale
else:
self.img_scale = [img_scale]
assert isinstance(self.img_scale, list)
if ratio_range is not None:
# mode 1: given a scale and a range of image ratio
assert len(self.img_scale) == 1
else:
# mode 2: given multiple scales or a range of scales
assert multiscale_mode in ['value', 'range']
self.backend = backend
self.multiscale_mode = multiscale_mode
self.ratio_range = ratio_range
self.keep_ratio = keep_ratio
# TODO: refactor the override option in Resize
self.interpolation = interpolation
self.override = override
self.bbox_clip_border = bbox_clip_border
self.keypoint_clip_border = keypoint_clip_border
@staticmethod
def random_select(img_scales):
"""Randomly select an img_scale from given candidates.
Args:
img_scales (list[tuple]): Images scales for selection.
Returns:
            (tuple, int): Returns a tuple ``(img_scale, scale_idx)``, \
                where ``img_scale`` is the selected image scale and \
                ``scale_idx`` is the selected index in the given candidates.
"""
assert isinstance(img_scales, list)
scale_idx = np.random.randint(len(img_scales))
img_scale = img_scales[scale_idx]
return img_scale, scale_idx
@staticmethod
def random_sample(img_scales):
"""Randomly sample an img_scale when ``multiscale_mode=='range'``.
Args:
img_scales (list[tuple]): Images scale range for sampling.
There must be two tuples in img_scales, which specify the lower
and upper bound of image scales.
Returns:
(tuple, None): Returns a tuple ``(img_scale, None)``, where \
``img_scale`` is sampled scale and None is just a placeholder \
to be consistent with :func:`random_select`.
"""
assert isinstance(img_scales, list) and len(img_scales) == 2
img_scale_long = [max(s) for s in img_scales]
img_scale_short = [min(s) for s in img_scales]
long_edge = np.random.randint(
min(img_scale_long), max(img_scale_long) + 1)
short_edge = np.random.randint(
min(img_scale_short), max(img_scale_short) + 1)
img_scale = (long_edge, short_edge)
return img_scale, None
@staticmethod
def random_sample_ratio(img_scale, ratio_range):
"""Randomly sample an img_scale when ``ratio_range`` is specified.
A ratio will be randomly sampled from the range specified by
``ratio_range``. Then it would be multiplied with ``img_scale`` to
generate sampled scale.
Args:
img_scale (list): Images scale base to multiply with ratio.
ratio_range (tuple[float]): The minimum and maximum ratio to scale
the ``img_scale``.
Returns:
(tuple, None): Returns a tuple ``(scale, None)``, where \
``scale`` is sampled ratio multiplied with ``img_scale`` and \
None is just a placeholder to be consistent with \
:func:`random_select`.
"""
assert isinstance(img_scale, list) and len(img_scale) == 2
min_ratio, max_ratio = ratio_range
assert min_ratio <= max_ratio
ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio)
return scale, None
def _random_scale(self, results):
"""Randomly sample an img_scale according to ``ratio_range`` and
``multiscale_mode``.
If ``ratio_range`` is specified, a ratio will be sampled and be
multiplied with ``img_scale``.
If multiple scales are specified by ``img_scale``, a scale will be
sampled according to ``multiscale_mode``.
Otherwise, single scale will be used.
Args:
results (dict): Result dict from :obj:`dataset`.
Returns:
            dict: Two new keys 'scale' and 'scale_idx' are added into \
                ``results``, which would be used by subsequent pipelines.
"""
if self.ratio_range is not None:
scale, scale_idx = self.random_sample_ratio(self.img_scale[0],
self.ratio_range)
elif len(self.img_scale) == 1:
scale, scale_idx = self.img_scale[0], 0
elif self.multiscale_mode == 'range':
scale, scale_idx = self.random_sample(self.img_scale)
elif self.multiscale_mode == 'value':
scale, scale_idx = self.random_select(self.img_scale)
else:
raise NotImplementedError
results['scale'] = scale
results['scale_idx'] = scale_idx
def _resize_img(self, results):
"""Resize images with ``results['scale']``."""
for key in ['image'] if 'image' in results else []:
if self.keep_ratio:
img, scale_factor = imrescale(
results[key],
results['scale'],
return_scale=True,
interpolation=self.interpolation,
backend=self.backend)
                # w_scale and h_scale may have a minor difference;
                # a proper fix should be done in imrescale in the future
new_h, new_w = img.shape[:2]
h, w = results[key].shape[:2]
w_scale = new_w / w
h_scale = new_h / h
else:
img, w_scale, h_scale = imresize(
results[key],
results['scale'],
return_scale=True,
interpolation=self.interpolation,
backend=self.backend)
scale_factor = np.array(
[w_scale, h_scale, w_scale, h_scale], dtype=np.float32)
results['im_shape'] = np.array(img.shape)
# in case that there is no padding
results['pad_shape'] = img.shape
results['scale_factor'] = scale_factor
results['keep_ratio'] = self.keep_ratio
# img_pad = self.impad(img, shape=results['scale'])
results[key] = img
def _resize_bboxes(self, results):
"""Resize bounding boxes with ``results['scale_factor']``."""
for key in ['gt_bbox'] if 'gt_bbox' in results else []:
bboxes = results[key] * results['scale_factor']
if self.bbox_clip_border:
img_shape = results['im_shape']
bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
results[key] = bboxes
def _resize_masks(self, results):
"""Resize masks with ``results['scale']``"""
for key in ['mask'] if 'mask' in results else []:
if results[key] is None:
continue
if self.keep_ratio:
results[key] = results[key].rescale(results['scale'])
else:
results[key] = results[key].resize(results['im_shape'][:2])
def _resize_seg(self, results):
"""Resize semantic segmentation map with ``results['scale']``."""
for key in ['seg'] if 'seg' in results else []:
if self.keep_ratio:
gt_seg = imrescale(
results[key],
results['scale'],
interpolation='nearest',
backend=self.backend)
else:
gt_seg = imresize(
results[key],
results['scale'],
interpolation='nearest',
backend=self.backend)
results[key] = gt_seg
def _resize_keypoints(self, results):
"""Resize keypoints with ``results['scale_factor']``."""
for key in ['gt_joints'] if 'gt_joints' in results else []:
keypoints = results[key].copy()
keypoints[..., 0] = keypoints[..., 0] * results['scale_factor'][0]
keypoints[..., 1] = keypoints[..., 1] * results['scale_factor'][1]
if self.keypoint_clip_border:
img_shape = results['im_shape']
keypoints[..., 0] = np.clip(keypoints[..., 0], 0, img_shape[1])
keypoints[..., 1] = np.clip(keypoints[..., 1], 0, img_shape[0])
results[key] = keypoints
def _resize_areas(self, results):
"""Resize mask areas with ``results['scale_factor']``."""
for key in ['gt_areas'] if 'gt_areas' in results else []:
areas = results[key].copy()
areas = areas * results['scale_factor'][0] * results[
'scale_factor'][1]
results[key] = areas
def __call__(self, results):
"""Call function to resize images, bounding boxes, masks, semantic
segmentation map.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Resized results, 'im_shape', 'pad_shape', 'scale_factor', \
'keep_ratio' keys are added into result dict.
"""
if 'scale' not in results:
if 'scale_factor' in results:
img_shape = results['image'].shape[:2]
scale_factor = results['scale_factor'][0]
# assert isinstance(scale_factor, float)
results['scale'] = [int(x * scale_factor)
for x in img_shape][::-1]
else:
self._random_scale(results)
else:
if not self.override:
assert 'scale_factor' not in results, (
'scale and scale_factor cannot be both set.')
else:
results.pop('scale')
if 'scale_factor' in results:
results.pop('scale_factor')
self._random_scale(results)
self._resize_img(results)
self._resize_bboxes(results)
self._resize_masks(results)
self._resize_seg(results)
self._resize_keypoints(results)
self._resize_areas(results)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(img_scale={self.img_scale}, '
repr_str += f'multiscale_mode={self.multiscale_mode}, '
repr_str += f'ratio_range={self.ratio_range}, '
repr_str += f'keep_ratio={self.keep_ratio}, '
        repr_str += f'bbox_clip_border={self.bbox_clip_border}, '
        repr_str += f'keypoint_clip_border={self.keypoint_clip_border})'
return repr_str
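# Sketches of the three multiscale sampling modes (illustrative inputs; the
# helper name is hypothetical). random_select picks one candidate scale,
# random_sample draws long/short edges from the ranges spanned by two scales,
# and random_sample_ratio multiplies one base scale by a ratio in ratio_range.
def _demo_petr_resize_sampling():
    scale_a, idx = PETR_Resize.random_select([(1333, 640), (1333, 800)])
    scale_b, _ = PETR_Resize.random_sample([(1333, 640), (1333, 800)])
    scale_c, _ = PETR_Resize.random_sample_ratio([1333, 800], (0.8, 1.2))
    return scale_a, scale_b, scale_c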
================================================
FILE: ppdet/data/transform/keypoints_3d_operators.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
import cv2
import numpy as np
import math
import copy
import random
import uuid
from numbers import Number, Integral
from ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints, get_affine_transform, affine_transform, get_warp_matrix
from ppdet.core.workspace import serializable
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
registered_ops = []
__all__ = [
'CropAndFlipImages', 'PermuteImages', 'RandomFlipHalfBody3DTransformImages'
]
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
from mpl_toolkits.mplot3d import Axes3D
def register_keypointop(cls):
return serializable(cls)
def register_op(cls):
registered_ops.append(cls.__name__)
if not hasattr(BaseOperator, cls.__name__):
setattr(BaseOperator, cls.__name__, cls)
else:
raise KeyError("The {} class has been registered.".format(cls.__name__))
return serializable(cls)
class BaseOperator(object):
def __init__(self, name=None):
if name is None:
name = self.__class__.__name__
self._id = name + '_' + str(uuid.uuid4())[-6:]
def apply(self, sample, context=None):
""" Process a sample.
Args:
sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}
context (dict): info about this sample processing
Returns:
result (dict): a processed sample
"""
return sample
def __call__(self, sample, context=None):
""" Process a sample.
Args:
sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}
context (dict): info about this sample processing
Returns:
result (dict): a processed sample
"""
if isinstance(sample, Sequence): # for batch_size
for i in range(len(sample)):
sample[i] = self.apply(sample[i], context)
else:
# image.shape changed
sample = self.apply(sample, context)
return sample
def __str__(self):
return str(self._id)
@register_keypointop
class CropAndFlipImages(object):
"""Crop all images"""
def __init__(self, crop_range, flip_pairs=None):
super(CropAndFlipImages, self).__init__()
self.crop_range = crop_range
self.flip_pairs = flip_pairs
def __call__(self, records): # tuple
images = records["image"]
images = images[:, :, ::-1, :]
images = images[:, :, self.crop_range[0]:self.crop_range[1]]
records["image"] = images
if "kps2d" in records.keys():
kps2d = records["kps2d"]
width, height = images.shape[2], images.shape[1]
kps2d = np.array(kps2d)
kps2d[:, :, 0] = kps2d[:, :, 0] - self.crop_range[0]
for pair in self.flip_pairs:
kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \
kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy()
records["kps2d"] = kps2d
return records
@register_op
class PermuteImages(BaseOperator):
def __init__(self):
"""
Change the channel to be (batch_size, C, H, W) #(6, 3, 1080, 1920)
"""
super(PermuteImages, self).__init__()
def apply(self, sample, context=None):
images = sample["image"]
images = images.transpose((0, 3, 1, 2))
sample["image"] = images
return sample
@register_keypointop
class RandomFlipHalfBody3DTransformImages(object):
"""apply data augment to images and coords
to achieve the flip, scale, rotate and half body transform effect for training image
Args:
trainsize (list):[w, h], Image target size
upper_body_ids (list): The upper body joint ids
flip_pairs (list): The left-right joints exchange order list
pixel_std (int): The pixel std of the scale
scale (float): The scale factor to transform the image
rot (int): The rotate factor to transform the image
num_joints_half_body (int): The joints threshold of the half body transform
prob_half_body (float): The threshold of the half body transform
flip (bool): Whether to flip the image
Returns:
records(dict): contain the image and coords after tranformed
"""
def __init__(self,
trainsize,
upper_body_ids,
flip_pairs,
pixel_std,
scale=0.35,
rot=40,
num_joints_half_body=8,
prob_half_body=0.3,
flip=True,
rot_prob=0.6,
do_occlusion=False):
super(RandomFlipHalfBody3DTransformImages, self).__init__()
self.trainsize = trainsize
self.upper_body_ids = upper_body_ids
self.flip_pairs = flip_pairs
self.pixel_std = pixel_std
self.scale = scale
self.rot = rot
self.num_joints_half_body = num_joints_half_body
self.prob_half_body = prob_half_body
self.flip = flip
self.aspect_ratio = trainsize[0] * 1.0 / trainsize[1]
self.rot_prob = rot_prob
self.do_occlusion = do_occlusion
def halfbody_transform(self, joints, joints_vis):
upper_joints = []
lower_joints = []
for joint_id in range(joints.shape[0]):
if joints_vis[joint_id][0] > 0:
if joint_id in self.upper_body_ids:
upper_joints.append(joints[joint_id])
else:
lower_joints.append(joints[joint_id])
        # use a uniform sample for the coin flip (randn would skew the probability)
        if np.random.rand() < 0.5 and len(upper_joints) > 2:
selected_joints = upper_joints
else:
selected_joints = lower_joints if len(
lower_joints) > 2 else upper_joints
if len(selected_joints) < 2:
return None, None
selected_joints = np.array(selected_joints, dtype=np.float32)
center = selected_joints.mean(axis=0)[:2]
left_top = np.amin(selected_joints, axis=0)
right_bottom = np.amax(selected_joints, axis=0)
w = right_bottom[0] - left_top[0]
h = right_bottom[1] - left_top[1]
if w > self.aspect_ratio * h:
h = w * 1.0 / self.aspect_ratio
elif w < self.aspect_ratio * h:
w = h * self.aspect_ratio
scale = np.array(
[w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
dtype=np.float32)
scale = scale * 1.5
return center, scale
def flip_joints(self, joints, joints_vis, width, matched_parts, kps2d=None):
# joints: (6, 24, 3),(num_frames, num_joints, 3)
joints[:, :, 0] = width - joints[:, :, 0] - 1 # x
if kps2d is not None:
kps2d[:, :, 0] = width - kps2d[:, :, 0] - 1
for pair in matched_parts:
joints[:, pair[0], :], joints[:,pair[1], :] = \
joints[:,pair[1], :], joints[:,pair[0], :].copy()
joints_vis[:,pair[0], :], joints_vis[:,pair[1], :] = \
joints_vis[:,pair[1], :], joints_vis[:,pair[0], :].copy()
if kps2d is not None:
kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \
kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy()
# move to zero
joints -= joints[:, [0], :] # (batch_size, 24, 3),numpy.ndarray
return joints, joints_vis, kps2d
def __call__(self, records):
        images = records['image']  # shape: (num_frames, height, width, 3)
joints = records['kps3d']
joints_vis = records['kps3d_vis']
kps2d = None
if 'kps2d' in records.keys():
kps2d = records['kps2d']
if self.flip and np.random.random() <= 0.5:
            images = images[:, :, ::-1, :]  # horizontal flip, e.g. (6, 1080, 810, 3)
            joints, joints_vis, kps2d = self.flip_joints(
                joints, joints_vis, images.shape[2], self.flip_pairs,
                kps2d)  # mirror the keypoints left-right
occlusion = False
        if self.do_occlusion and random.random() <= 0.5:  # random occlusion
height = images[0].shape[0]
width = images[0].shape[1]
occlusion = True
while True:
area_min = 0.0
area_max = 0.2
synth_area = (random.random() *
(area_max - area_min) + area_min) * width * height
ratio_min = 0.3
ratio_max = 1 / 0.3
synth_ratio = (random.random() *
(ratio_max - ratio_min) + ratio_min)
synth_h = math.sqrt(synth_area * synth_ratio)
synth_w = math.sqrt(synth_area / synth_ratio)
synth_xmin = random.random() * (width - synth_w - 1)
synth_ymin = random.random() * (height - synth_h - 1)
if synth_xmin >= 0 and synth_ymin >= 0 and synth_xmin + synth_w < width and synth_ymin + synth_h < height:
xmin = int(synth_xmin)
ymin = int(synth_ymin)
w = int(synth_w)
h = int(synth_h)
mask = np.random.rand(h, w, 3) * 255
images[:, ymin:ymin + h, xmin:xmin + w, :] = mask[
None, :, :, :]
break
records['image'] = images
records['kps3d'] = joints
records['kps3d_vis'] = joints_vis
if kps2d is not None:
records['kps2d'] = kps2d
return records
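# A sketch (hypothetical helper) of the occlusion-patch sampling used above: a
# patch area is drawn as up to 20% of the image area together with an aspect
# ratio in [0.3, 1/0.3], giving sides h = sqrt(area * ratio) and
# w = sqrt(area / ratio).
def _demo_occlusion_patch(height=1080, width=810):
    synth_area = random.random() * 0.2 * width * height
    synth_ratio = random.random() * (1 / 0.3 - 0.3) + 0.3
    synth_h = math.sqrt(synth_area * synth_ratio)
    synth_w = math.sqrt(synth_area / synth_ratio)
    return synth_h, synth_w  # h * w == synth_area, h / w == synth_ratio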
================================================
FILE: ppdet/data/transform/mot_operators.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
from numbers import Integral
import cv2
import copy
import numpy as np
import random
import math
from .operators import BaseOperator, register_op
from .batch_operators import Gt2TTFTarget
from ppdet.modeling.bbox_utils import bbox_iou_np_expand
from ppdet.utils.logger import setup_logger
from .op_helper import gaussian_radius
logger = setup_logger(__name__)
__all__ = [
'RGBReverse', 'LetterBoxResize', 'MOTRandomAffine', 'Gt2JDETargetThres',
'Gt2JDETargetMax', 'Gt2FairMOTTarget'
]
@register_op
class RGBReverse(BaseOperator):
"""RGB to BGR, or BGR to RGB, sensitive to MOTRandomAffine
"""
def __init__(self):
super(RGBReverse, self).__init__()
def apply(self, sample, context=None):
im = sample['image']
sample['image'] = np.ascontiguousarray(im[:, :, ::-1])
return sample
@register_op
class LetterBoxResize(BaseOperator):
def __init__(self, target_size):
"""
Resize image to target size, convert normalized xywh to pixel xyxy
format ([x_center, y_center, width, height] -> [x0, y0, x1, y1]).
Args:
target_size (int|list): image target size.
"""
super(LetterBoxResize, self).__init__()
if not isinstance(target_size, (Integral, Sequence)):
            raise TypeError(
                "Type of target_size is invalid. Must be Integer, List or Tuple, but got {}".
                format(type(target_size)))
if isinstance(target_size, Integral):
target_size = [target_size, target_size]
self.target_size = target_size
def apply_image(self, img, height, width, color=(127.5, 127.5, 127.5)):
# letterbox: resize a rectangular image to a padded rectangular
shape = img.shape[:2] # [height, width]
ratio_h = float(height) / shape[0]
ratio_w = float(width) / shape[1]
ratio = min(ratio_h, ratio_w)
new_shape = (round(shape[1] * ratio),
round(shape[0] * ratio)) # [width, height]
padw = (width - new_shape[0]) / 2
padh = (height - new_shape[1]) / 2
top, bottom = round(padh - 0.1), round(padh + 0.1)
left, right = round(padw - 0.1), round(padw + 0.1)
img = cv2.resize(
img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border
img = cv2.copyMakeBorder(
img, top, bottom, left, right, cv2.BORDER_CONSTANT,
value=color) # padded rectangular
return img, ratio, padw, padh
def apply_bbox(self, bbox0, h, w, ratio, padw, padh):
bboxes = bbox0.copy()
bboxes[:, 0] = ratio * w * (bbox0[:, 0] - bbox0[:, 2] / 2) + padw
bboxes[:, 1] = ratio * h * (bbox0[:, 1] - bbox0[:, 3] / 2) + padh
bboxes[:, 2] = ratio * w * (bbox0[:, 0] + bbox0[:, 2] / 2) + padw
bboxes[:, 3] = ratio * h * (bbox0[:, 1] + bbox0[:, 3] / 2) + padh
return bboxes
def apply(self, sample, context=None):
""" Resize the image numpy.
"""
im = sample['image']
h, w = sample['im_shape']
if not isinstance(im, np.ndarray):
raise TypeError("{}: image type is not numpy.".format(self))
if len(im.shape) != 3:
from PIL import UnidentifiedImageError
raise UnidentifiedImageError(
'{}: image is not 3-dimensional.'.format(self))
# apply image
height, width = self.target_size
img, ratio, padw, padh = self.apply_image(
im, height=height, width=width)
sample['image'] = img
new_shape = (round(h * ratio), round(w * ratio))
sample['im_shape'] = np.asarray(new_shape, dtype=np.float32)
sample['scale_factor'] = np.asarray([ratio, ratio], dtype=np.float32)
# apply bbox
if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], h, w, ratio,
padw, padh)
return sample
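# A minimal usage sketch of LetterBoxResize (illustrative sizes; the helper
# name is hypothetical): a 720x1280 frame letterboxed onto a 608x608 canvas
# keeps the aspect ratio (ratio = min(608/720, 608/1280) = 0.475) and pads the
# shorter side with gray borders.
def _demo_letterbox():
    sample = {
        'image': np.zeros((720, 1280, 3), dtype=np.uint8),
        'im_shape': np.array([720., 1280.], dtype=np.float32),
    }
    out = LetterBoxResize(target_size=608)(sample)
    return out['image'].shape, out['scale_factor']  # (608, 608, 3), [0.475, 0.475]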
@register_op
class MOTRandomAffine(BaseOperator):
"""
Affine transform to image and coords to achieve the rotate, scale and
shift effect for training image.
Args:
degrees (list[2]): the rotate range to apply, transform range is [min, max]
translate (list[2]): the translate range to apply, transform range is [min, max]
scale (list[2]): the scale range to apply, transform range is [min, max]
shear (list[2]): the shear range to apply, transform range is [min, max]
        borderValue (list[3]): value used in case of a constant border when applying
            the perspective transformation
        reject_outside (bool): reject warped bounding boxes outside of the image
    Returns:
        records (dict): the dict containing the image and coords after transformation
"""
def __init__(self,
degrees=(-5, 5),
translate=(0.10, 0.10),
scale=(0.50, 1.20),
shear=(-2, 2),
borderValue=(127.5, 127.5, 127.5),
reject_outside=True):
super(MOTRandomAffine, self).__init__()
self.degrees = degrees
self.translate = translate
self.scale = scale
self.shear = shear
self.borderValue = borderValue
self.reject_outside = reject_outside
def apply(self, sample, context=None):
# https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4
border = 0 # width of added border (optional)
img = sample['image']
height, width = img.shape[0], img.shape[1]
# Rotation and Scale
R = np.eye(3)
a = random.random() * (self.degrees[1] - self.degrees[0]
) + self.degrees[0]
s = random.random() * (self.scale[1] - self.scale[0]) + self.scale[0]
R[:2] = cv2.getRotationMatrix2D(
angle=a, center=(width / 2, height / 2), scale=s)
# Translation
T = np.eye(3)
T[0, 2] = (
random.random() * 2 - 1
) * self.translate[0] * height + border # x translation (pixels)
T[1, 2] = (
random.random() * 2 - 1
) * self.translate[1] * width + border # y translation (pixels)
# Shear
S = np.eye(3)
S[0, 1] = math.tan((random.random() *
(self.shear[1] - self.shear[0]) + self.shear[0]) *
math.pi / 180) # x shear (deg)
S[1, 0] = math.tan((random.random() *
(self.shear[1] - self.shear[0]) + self.shear[0]) *
math.pi / 180) # y shear (deg)
        M = S @ T @ R  # combined transform matrix; the order matters here
imw = cv2.warpPerspective(
img,
M,
dsize=(width, height),
flags=cv2.INTER_LINEAR,
borderValue=self.borderValue) # BGR order borderValue
if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
targets = sample['gt_bbox']
n = targets.shape[0]
points = targets.copy()
area0 = (points[:, 2] - points[:, 0]) * (
points[:, 3] - points[:, 1])
# warp points
xy = np.ones((n * 4, 3))
xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(
n * 4, 2) # x1y1, x2y2, x1y2, x2y1
            xy = (xy @ M.T)[:, :2].reshape(n, 8)
# create new boxes
x = xy[:, [0, 2, 4, 6]]
y = xy[:, [1, 3, 5, 7]]
xy = np.concatenate(
(x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
# apply angle-based reduction
radians = a * math.pi / 180
reduction = max(abs(math.sin(radians)), abs(math.cos(radians)))**0.5
x = (xy[:, 2] + xy[:, 0]) / 2
y = (xy[:, 3] + xy[:, 1]) / 2
w = (xy[:, 2] - xy[:, 0]) * reduction
h = (xy[:, 3] - xy[:, 1]) * reduction
xy = np.concatenate(
(x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T
# reject warped points outside of image
if self.reject_outside:
np.clip(xy[:, 0], 0, width, out=xy[:, 0])
np.clip(xy[:, 2], 0, width, out=xy[:, 2])
np.clip(xy[:, 1], 0, height, out=xy[:, 1])
np.clip(xy[:, 3], 0, height, out=xy[:, 3])
w = xy[:, 2] - xy[:, 0]
h = xy[:, 3] - xy[:, 1]
area = w * h
ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16))
i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10)
if sum(i) > 0:
sample['gt_bbox'] = xy[i].astype(sample['gt_bbox'].dtype)
sample['gt_class'] = sample['gt_class'][i]
if 'difficult' in sample:
sample['difficult'] = sample['difficult'][i]
if 'gt_ide' in sample:
sample['gt_ide'] = sample['gt_ide'][i]
if 'is_crowd' in sample:
sample['is_crowd'] = sample['is_crowd'][i]
sample['image'] = imw
return sample
else:
return sample
@register_op
class Gt2JDETargetThres(BaseOperator):
__shared__ = ['num_classes']
"""
    Generate JDE targets from ground-truth data when training
Args:
anchors (list): anchors of JDE model
anchor_masks (list): anchor_masks of JDE model
downsample_ratios (list): downsample ratios of JDE model
        ide_thresh (float): IoU threshold for identity assignment; anchors above it take the ground-truth id
        fg_thresh (float): IoU threshold for foreground; anchors above it are foreground
        bg_thresh (float): IoU threshold for background; anchors below it are background
num_classes (int): number of classes
"""
def __init__(self,
anchors,
anchor_masks,
downsample_ratios,
ide_thresh=0.5,
fg_thresh=0.5,
bg_thresh=0.4,
num_classes=1):
super(Gt2JDETargetThres, self).__init__()
self.anchors = anchors
self.anchor_masks = anchor_masks
self.downsample_ratios = downsample_ratios
self.ide_thresh = ide_thresh
self.fg_thresh = fg_thresh
self.bg_thresh = bg_thresh
self.num_classes = num_classes
def generate_anchor(self, nGh, nGw, anchor_hw):
nA = len(anchor_hw)
yy, xx = np.meshgrid(np.arange(nGh), np.arange(nGw))
mesh = np.stack([xx.T, yy.T], axis=0) # [2, nGh, nGw]
mesh = np.repeat(mesh[None, :], nA, axis=0) # [nA, 2, nGh, nGw]
anchor_offset_mesh = anchor_hw[:, :, None][:, :, :, None]
anchor_offset_mesh = np.repeat(anchor_offset_mesh, nGh, axis=-2)
anchor_offset_mesh = np.repeat(anchor_offset_mesh, nGw, axis=-1)
anchor_mesh = np.concatenate(
[mesh, anchor_offset_mesh], axis=1) # [nA, 4, nGh, nGw]
return anchor_mesh
def encode_delta(self, gt_box_list, fg_anchor_list):
px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \
fg_anchor_list[:, 2], fg_anchor_list[:,3]
gx, gy, gw, gh = gt_box_list[:, 0], gt_box_list[:, 1], \
gt_box_list[:, 2], gt_box_list[:, 3]
dx = (gx - px) / pw
dy = (gy - py) / ph
dw = np.log(gw / pw)
dh = np.log(gh / ph)
return np.stack([dx, dy, dw, dh], axis=1)
def pad_box(self, sample, num_max):
assert 'gt_bbox' in sample
bbox = sample['gt_bbox']
gt_num = len(bbox)
pad_bbox = np.zeros((num_max, 4), dtype=np.float32)
if gt_num > 0:
pad_bbox[:gt_num, :] = bbox[:gt_num, :]
sample['gt_bbox'] = pad_bbox
if 'gt_score' in sample:
pad_score = np.zeros((num_max, ), dtype=np.float32)
if gt_num > 0:
pad_score[:gt_num] = sample['gt_score'][:gt_num, 0]
sample['gt_score'] = pad_score
if 'difficult' in sample:
pad_diff = np.zeros((num_max, ), dtype=np.int32)
if gt_num > 0:
pad_diff[:gt_num] = sample['difficult'][:gt_num, 0]
sample['difficult'] = pad_diff
if 'is_crowd' in sample:
pad_crowd = np.zeros((num_max, ), dtype=np.int32)
if gt_num > 0:
pad_crowd[:gt_num] = sample['is_crowd'][:gt_num, 0]
sample['is_crowd'] = pad_crowd
if 'gt_ide' in sample:
pad_ide = np.zeros((num_max, ), dtype=np.int32)
if gt_num > 0:
pad_ide[:gt_num] = sample['gt_ide'][:gt_num, 0]
sample['gt_ide'] = pad_ide
return sample
def __call__(self, samples, context=None):
        assert len(self.anchor_masks) == len(self.downsample_ratios), \
            "'anchor_masks' and 'downsample_ratios' should have the same length."
h, w = samples[0]['image'].shape[1:3]
num_max = 0
for sample in samples:
num_max = max(num_max, len(sample['gt_bbox']))
for sample in samples:
gt_bbox = sample['gt_bbox']
gt_ide = sample['gt_ide']
for i, (anchor_hw, downsample_ratio
) in enumerate(zip(self.anchors, self.downsample_ratios)):
anchor_hw = np.array(
anchor_hw, dtype=np.float32) / downsample_ratio
nA = len(anchor_hw)
nGh, nGw = int(h / downsample_ratio), int(w / downsample_ratio)
tbox = np.zeros((nA, nGh, nGw, 4), dtype=np.float32)
tconf = np.zeros((nA, nGh, nGw), dtype=np.float32)
tid = -np.ones((nA, nGh, nGw, 1), dtype=np.float32)
gxy, gwh = gt_bbox[:, 0:2].copy(), gt_bbox[:, 2:4].copy()
gxy[:, 0] = gxy[:, 0] * nGw
gxy[:, 1] = gxy[:, 1] * nGh
gwh[:, 0] = gwh[:, 0] * nGw
gwh[:, 1] = gwh[:, 1] * nGh
gxy[:, 0] = np.clip(gxy[:, 0], 0, nGw - 1)
gxy[:, 1] = np.clip(gxy[:, 1], 0, nGh - 1)
tboxes = np.concatenate([gxy, gwh], axis=1)
anchor_mesh = self.generate_anchor(nGh, nGw, anchor_hw)
anchor_list = np.transpose(anchor_mesh,
(0, 2, 3, 1)).reshape(-1, 4)
iou_pdist = bbox_iou_np_expand(
anchor_list, tboxes, x1y1x2y2=False)
iou_max = np.max(iou_pdist, axis=1)
max_gt_index = np.argmax(iou_pdist, axis=1)
iou_map = iou_max.reshape(nA, nGh, nGw)
gt_index_map = max_gt_index.reshape(nA, nGh, nGw)
id_index = iou_map > self.ide_thresh
fg_index = iou_map > self.fg_thresh
bg_index = iou_map < self.bg_thresh
ign_index = (iou_map < self.fg_thresh) * (
iou_map > self.bg_thresh)
tconf[fg_index] = 1
tconf[bg_index] = 0
tconf[ign_index] = -1
gt_index = gt_index_map[fg_index]
gt_box_list = tboxes[gt_index]
gt_id_list = gt_ide[gt_index_map[id_index]]
if np.sum(fg_index) > 0:
tid[id_index] = gt_id_list
fg_anchor_list = anchor_list.reshape(nA, nGh, nGw,
4)[fg_index]
delta_target = self.encode_delta(gt_box_list,
fg_anchor_list)
tbox[fg_index] = delta_target
sample['tbox{}'.format(i)] = tbox
sample['tconf{}'.format(i)] = tconf
sample['tide{}'.format(i)] = tid
sample.pop('gt_class')
sample = self.pad_box(sample, num_max)
return samples
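# A minimal, self-contained sketch of the threshold rule used by
# Gt2JDETargetThres above: cells whose best IoU exceeds fg_thresh become
# foreground (1), cells below bg_thresh become background (0), and cells in
# between are ignored (-1). Toy values, illustrative only:
#
#   import numpy as np
#   iou_map = np.array([[0.7, 0.45], [0.2, 0.55]])
#   fg_thresh, bg_thresh = 0.5, 0.4
#   tconf = np.zeros_like(iou_map)
#   tconf[iou_map > fg_thresh] = 1
#   tconf[iou_map < bg_thresh] = 0
#   tconf[(iou_map < fg_thresh) & (iou_map > bg_thresh)] = -1
#   # tconf -> [[ 1., -1.], [ 0.,  1.]]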
@register_op
class Gt2JDETargetMax(BaseOperator):
__shared__ = ['num_classes']
"""
Generate JDE targets by groud truth data when evaluating
Args:
anchors (list): anchors of JDE model
anchor_masks (list): anchor_masks of JDE model
downsample_ratios (list): downsample ratios of JDE model
max_iou_thresh (float): iou thresh for high quality anchor
num_classes (int): number of classes
"""
def __init__(self,
anchors,
anchor_masks,
downsample_ratios,
max_iou_thresh=0.60,
num_classes=1):
super(Gt2JDETargetMax, self).__init__()
self.anchors = anchors
self.anchor_masks = anchor_masks
self.downsample_ratios = downsample_ratios
self.max_iou_thresh = max_iou_thresh
self.num_classes = num_classes
def __call__(self, samples, context=None):
assert len(self.anchor_masks) == len(self.downsample_ratios), \
"'anchor_masks' and 'downsample_ratios' should have the same length."
h, w = samples[0]['image'].shape[1:3]
for sample in samples:
gt_bbox = sample['gt_bbox']
gt_ide = sample['gt_ide']
for i, (anchor_hw, downsample_ratio
) in enumerate(zip(self.anchors, self.downsample_ratios)):
anchor_hw = np.array(
anchor_hw, dtype=np.float32) / downsample_ratio
nA = len(anchor_hw)
nGh, nGw = int(h / downsample_ratio), int(w / downsample_ratio)
tbox = np.zeros((nA, nGh, nGw, 4), dtype=np.float32)
tconf = np.zeros((nA, nGh, nGw), dtype=np.float32)
tid = -np.ones((nA, nGh, nGw, 1), dtype=np.float32)
gxy, gwh = gt_bbox[:, 0:2].copy(), gt_bbox[:, 2:4].copy()
gxy[:, 0] = gxy[:, 0] * nGw
gxy[:, 1] = gxy[:, 1] * nGh
gwh[:, 0] = gwh[:, 0] * nGw
gwh[:, 1] = gwh[:, 1] * nGh
gi = np.clip(gxy[:, 0], 0, nGw - 1).astype(int)
gj = np.clip(gxy[:, 1], 0, nGh - 1).astype(int)
# iou of targets-anchors (using wh only)
box1 = gwh
box2 = anchor_hw[:, None, :]
inter_area = np.minimum(box1, box2).prod(2)
iou = inter_area / (
box1.prod(1) + box2.prod(2) - inter_area + 1e-16)
# Select best iou_pred and anchor
iou_best = iou.max(0) # best anchor [0-2] for each target
a = np.argmax(iou, axis=0)
# Select best unique target-anchor combinations
iou_order = np.argsort(-iou_best) # best to worst
# Unique anchor selection
u = np.stack((gi, gj, a), 0)[:, iou_order]
_, first_unique = np.unique(u, axis=1, return_index=True)
mask = iou_order[first_unique]
# best anchor must share significant commonality (iou) with target
# TODO: examine arbitrary threshold
idx = mask[iou_best[mask] > self.max_iou_thresh]
if len(idx) > 0:
a_i, gj_i, gi_i = a[idx], gj[idx], gi[idx]
t_box = gt_bbox[idx]
t_id = gt_ide[idx]
if len(t_box.shape) == 1:
t_box = t_box.reshape(1, 4)
gxy, gwh = t_box[:, 0:2].copy(), t_box[:, 2:4].copy()
gxy[:, 0] = gxy[:, 0] * nGw
gxy[:, 1] = gxy[:, 1] * nGh
gwh[:, 0] = gwh[:, 0] * nGw
gwh[:, 1] = gwh[:, 1] * nGh
# XY coordinates
tbox[:, :, :, 0:2][a_i, gj_i, gi_i] = gxy - gxy.astype(int)
# Width and height in yolo method
tbox[:, :, :, 2:4][a_i, gj_i, gi_i] = np.log(gwh /
anchor_hw[a_i])
tconf[a_i, gj_i, gi_i] = 1
tid[a_i, gj_i, gi_i] = t_id
sample['tbox{}'.format(i)] = tbox
sample['tconf{}'.format(i)] = tconf
sample['tide{}'.format(i)] = tid
return samples
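# A toy example of the width/height-only IoU computed in Gt2JDETargetMax
# above: boxes are treated as if centered at the same point, so the
# intersection is simply min(w1, w2) * min(h1, h2). Values are made up:
#
#   import numpy as np
#   gwh = np.array([[4., 6.]])                  # one target (w, h)
#   anchor_hw = np.array([[3., 8.], [5., 5.]])  # two anchors (w, h)
#   inter = np.minimum(gwh, anchor_hw[:, None, :]).prod(2)
#   iou = inter / (gwh.prod(1) + anchor_hw[:, None, :].prod(2) - inter + 1e-16)
#   # iou ~ [[0.600], [0.690]]; np.argmax(iou, axis=0) picks anchor 1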
class Gt2FairMOTTarget(Gt2TTFTarget):
__shared__ = ['num_classes']
"""
Generate FairMOT targets by ground truth data.
Difference between Gt2FairMOTTarget and Gt2TTFTarget are:
1. the gaussian kernal radius to generate a heatmap.
2. the targets needed during training.
Args:
num_classes(int): the number of classes.
down_ratio(int): the down ratio from images to heatmap, 4 by default.
max_objs(int): the maximum number of ground truth objects in a image, 500 by default.
"""
def __init__(self, num_classes=1, down_ratio=4, max_objs=500):
super(Gt2TTFTarget, self).__init__()
self.down_ratio = down_ratio
self.num_classes = num_classes
self.max_objs = max_objs
def __call__(self, samples, context=None):
for b_id, sample in enumerate(samples):
output_h = sample['image'].shape[1] // self.down_ratio
output_w = sample['image'].shape[2] // self.down_ratio
heatmap = np.zeros(
(self.num_classes, output_h, output_w), dtype='float32')
bbox_size = np.zeros((self.max_objs, 4), dtype=np.float32)
center_offset = np.zeros((self.max_objs, 2), dtype=np.float32)
index = np.zeros((self.max_objs, ), dtype=np.int64)
index_mask = np.zeros((self.max_objs, ), dtype=np.int32)
reid = np.zeros((self.max_objs, ), dtype=np.int64)
bbox_xys = np.zeros((self.max_objs, 4), dtype=np.float32)
if self.num_classes > 1:
# each category corresponds to a set of track ids
cls_tr_ids = np.zeros(
(self.num_classes, output_h, output_w), dtype=np.int64)
cls_id_map = np.full((output_h, output_w), -1, dtype=np.int64)
gt_bbox = sample['gt_bbox']
gt_class = sample['gt_class']
gt_ide = sample['gt_ide']
for k in range(len(gt_bbox)):
cls_id = gt_class[k][0]
bbox = gt_bbox[k]
ide = gt_ide[k][0]
bbox[[0, 2]] = bbox[[0, 2]] * output_w
bbox[[1, 3]] = bbox[[1, 3]] * output_h
bbox_amodal = copy.deepcopy(bbox)
bbox_amodal[0] = bbox_amodal[0] - bbox_amodal[2] / 2.
bbox_amodal[1] = bbox_amodal[1] - bbox_amodal[3] / 2.
bbox_amodal[2] = bbox_amodal[0] + bbox_amodal[2]
bbox_amodal[3] = bbox_amodal[1] + bbox_amodal[3]
bbox[0] = np.clip(bbox[0], 0, output_w - 1)
bbox[1] = np.clip(bbox[1], 0, output_h - 1)
h = bbox[3]
w = bbox[2]
bbox_xy = copy.deepcopy(bbox)
bbox_xy[0] = bbox_xy[0] - bbox_xy[2] / 2
bbox_xy[1] = bbox_xy[1] - bbox_xy[3] / 2
bbox_xy[2] = bbox_xy[0] + bbox_xy[2]
bbox_xy[3] = bbox_xy[1] + bbox_xy[3]
if h > 0 and w > 0:
radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)
radius = max(0, int(radius))
ct = np.array([bbox[0], bbox[1]], dtype=np.float32)
ct_int = ct.astype(np.int32)
self.draw_truncate_gaussian(heatmap[cls_id], ct_int, radius,
radius)
bbox_size[k] = ct[0] - bbox_amodal[0], ct[1] - bbox_amodal[1], \
bbox_amodal[2] - ct[0], bbox_amodal[3] - ct[1]
index[k] = ct_int[1] * output_w + ct_int[0]
center_offset[k] = ct - ct_int
index_mask[k] = 1
reid[k] = ide
bbox_xys[k] = bbox_xy
if self.num_classes > 1:
cls_id_map[ct_int[1], ct_int[0]] = cls_id
cls_tr_ids[cls_id][ct_int[1]][ct_int[0]] = ide - 1
# track id start from 0
sample['heatmap'] = heatmap
sample['index'] = index
sample['offset'] = center_offset
sample['size'] = bbox_size
sample['index_mask'] = index_mask
sample['reid'] = reid
if self.num_classes > 1:
sample['cls_id_map'] = cls_id_map
sample['cls_tr_ids'] = cls_tr_ids
sample['bbox_xys'] = bbox_xys
sample.pop('is_crowd', None)
sample.pop('difficult', None)
sample.pop('gt_class', None)
sample.pop('gt_bbox', None)
sample.pop('gt_score', None)
sample.pop('gt_ide', None)
return samples
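# How Gt2FairMOTTarget flattens an object center into a heatmap index: the
# integer center (x, y) on an output grid of width W maps to y * W + x, and
# the sub-pixel remainder is kept as the regression offset. Hypothetical
# numbers for illustration:
#
#   import numpy as np
#   output_w = 152
#   ct = np.array([37.6, 21.2], dtype=np.float32)
#   ct_int = ct.astype(np.int32)              # (37, 21)
#   index = ct_int[1] * output_w + ct_int[0]  # 21 * 152 + 37 = 3229
#   offset = ct - ct_int                      # (0.6, 0.2)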
================================================
FILE: ppdet/data/transform/op_helper.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this file contains helper methods for BBOX processing
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import random
import math
import cv2
def meet_emit_constraint(src_bbox, sample_bbox):
center_x = (src_bbox[2] + src_bbox[0]) / 2
center_y = (src_bbox[3] + src_bbox[1]) / 2
if center_x >= sample_bbox[0] and \
center_x <= sample_bbox[2] and \
center_y >= sample_bbox[1] and \
center_y <= sample_bbox[3]:
return True
return False
def clip_bbox(src_bbox):
src_bbox[0] = max(min(src_bbox[0], 1.0), 0.0)
src_bbox[1] = max(min(src_bbox[1], 1.0), 0.0)
src_bbox[2] = max(min(src_bbox[2], 1.0), 0.0)
src_bbox[3] = max(min(src_bbox[3], 1.0), 0.0)
return src_bbox
def bbox_area(src_bbox):
if src_bbox[2] < src_bbox[0] or src_bbox[3] < src_bbox[1]:
return 0.
else:
width = src_bbox[2] - src_bbox[0]
height = src_bbox[3] - src_bbox[1]
return width * height
def is_overlap(object_bbox, sample_bbox):
if object_bbox[0] >= sample_bbox[2] or \
object_bbox[2] <= sample_bbox[0] or \
object_bbox[1] >= sample_bbox[3] or \
object_bbox[3] <= sample_bbox[1]:
return False
else:
return True
def filter_and_process(sample_bbox, bboxes, labels, scores=None,
keypoints=None):
new_bboxes = []
new_labels = []
new_scores = []
new_keypoints = []
new_kp_ignore = []
for i in range(len(bboxes)):
new_bbox = [0, 0, 0, 0]
obj_bbox = [bboxes[i][0], bboxes[i][1], bboxes[i][2], bboxes[i][3]]
if not meet_emit_constraint(obj_bbox, sample_bbox):
continue
if not is_overlap(obj_bbox, sample_bbox):
continue
sample_width = sample_bbox[2] - sample_bbox[0]
sample_height = sample_bbox[3] - sample_bbox[1]
new_bbox[0] = (obj_bbox[0] - sample_bbox[0]) / sample_width
new_bbox[1] = (obj_bbox[1] - sample_bbox[1]) / sample_height
new_bbox[2] = (obj_bbox[2] - sample_bbox[0]) / sample_width
new_bbox[3] = (obj_bbox[3] - sample_bbox[1]) / sample_height
new_bbox = clip_bbox(new_bbox)
if bbox_area(new_bbox) > 0:
new_bboxes.append(new_bbox)
new_labels.append([labels[i][0]])
if scores is not None:
new_scores.append([scores[i][0]])
if keypoints is not None:
sample_keypoint = keypoints[0][i]
for j in range(len(sample_keypoint)):
kp_len = sample_height if j % 2 else sample_width
sample_coord = sample_bbox[1] if j % 2 else sample_bbox[0]
sample_keypoint[j] = (
sample_keypoint[j] - sample_coord) / kp_len
sample_keypoint[j] = max(min(sample_keypoint[j], 1.0), 0.0)
new_keypoints.append(sample_keypoint)
new_kp_ignore.append(keypoints[1][i])
bboxes = np.array(new_bboxes)
labels = np.array(new_labels)
scores = np.array(new_scores)
if keypoints is not None:
keypoints = np.array(new_keypoints)
new_kp_ignore = np.array(new_kp_ignore)
return bboxes, labels, scores, (keypoints, new_kp_ignore)
return bboxes, labels, scores
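# A worked example of the coordinate remapping in filter_and_process above:
# a gt box in [0, 1] image coordinates is re-expressed relative to the
# sampled crop. Toy numbers only:
#
#   sample_bbox = [0.2, 0.2, 0.8, 0.8]   # crop, normalized to the image
#   obj = [0.3, 0.3, 0.5, 0.5]           # gt box, normalized to the image
#   sw, sh = sample_bbox[2] - sample_bbox[0], sample_bbox[3] - sample_bbox[1]
#   new_box = [(obj[0] - sample_bbox[0]) / sw, (obj[1] - sample_bbox[1]) / sh,
#              (obj[2] - sample_bbox[0]) / sw, (obj[3] - sample_bbox[1]) / sh]
#   # new_box == [1/6, 1/6, 0.5, 0.5] in crop coordinates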
def bbox_area_sampling(bboxes, labels, scores, target_size, min_size):
new_bboxes = []
new_labels = []
new_scores = []
for i, bbox in enumerate(bboxes):
w = float((bbox[2] - bbox[0]) * target_size)
h = float((bbox[3] - bbox[1]) * target_size)
if w * h < float(min_size * min_size):
continue
else:
new_bboxes.append(bbox)
new_labels.append(labels[i])
if scores is not None and scores.size != 0:
new_scores.append(scores[i])
bboxes = np.array(new_bboxes)
labels = np.array(new_labels)
scores = np.array(new_scores)
return bboxes, labels, scores
def generate_sample_bbox(sampler):
scale = np.random.uniform(sampler[2], sampler[3])
aspect_ratio = np.random.uniform(sampler[4], sampler[5])
aspect_ratio = max(aspect_ratio, (scale**2.0))
aspect_ratio = min(aspect_ratio, 1 / (scale**2.0))
bbox_width = scale * (aspect_ratio**0.5)
bbox_height = scale / (aspect_ratio**0.5)
xmin_bound = 1 - bbox_width
ymin_bound = 1 - bbox_height
xmin = np.random.uniform(0, xmin_bound)
ymin = np.random.uniform(0, ymin_bound)
xmax = xmin + bbox_width
ymax = ymin + bbox_height
sampled_bbox = [xmin, ymin, xmax, ymax]
return sampled_bbox
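# The positional "sampler" layout assumed by generate_sample_bbox and the
# constraint helpers below (see the CropWithSampling docstring in
# operators.py): [max sample, max trial, min scale, max scale, min aspect
# ratio, max aspect ratio, min overlap, max overlap]; the coverage variant
# appends [min coverage, max coverage] at indices 8 and 9. For example:
#
#   sampler = [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0]
#   box = generate_sample_bbox(sampler)  # normalized [xmin, ymin, xmax, ymax]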
def generate_sample_bbox_square(sampler, image_width, image_height):
scale = np.random.uniform(sampler[2], sampler[3])
aspect_ratio = np.random.uniform(sampler[4], sampler[5])
aspect_ratio = max(aspect_ratio, (scale**2.0))
aspect_ratio = min(aspect_ratio, 1 / (scale**2.0))
bbox_width = scale * (aspect_ratio**0.5)
bbox_height = scale / (aspect_ratio**0.5)
if image_height < image_width:
bbox_width = bbox_height * image_height / image_width
else:
bbox_height = bbox_width * image_width / image_height
xmin_bound = 1 - bbox_width
ymin_bound = 1 - bbox_height
xmin = np.random.uniform(0, xmin_bound)
ymin = np.random.uniform(0, ymin_bound)
xmax = xmin + bbox_width
ymax = ymin + bbox_height
sampled_bbox = [xmin, ymin, xmax, ymax]
return sampled_bbox
def data_anchor_sampling(bbox_labels, image_width, image_height, scale_array,
resize_width):
num_gt = len(bbox_labels)
# np.random.randint range: [low, high)
rand_idx = np.random.randint(0, num_gt) if num_gt != 0 else 0
if num_gt != 0:
norm_xmin = bbox_labels[rand_idx][0]
norm_ymin = bbox_labels[rand_idx][1]
norm_xmax = bbox_labels[rand_idx][2]
norm_ymax = bbox_labels[rand_idx][3]
xmin = norm_xmin * image_width
ymin = norm_ymin * image_height
wid = image_width * (norm_xmax - norm_xmin)
hei = image_height * (norm_ymax - norm_ymin)
range_size = 0
area = wid * hei
for scale_ind in range(0, len(scale_array) - 1):
if area > scale_array[scale_ind] ** 2 and area < \
scale_array[scale_ind + 1] ** 2:
range_size = scale_ind + 1
break
if area > scale_array[len(scale_array) - 2]**2:
range_size = len(scale_array) - 2
scale_choose = 0.0
if range_size == 0:
rand_idx_size = 0
else:
# np.random.randint range: [low, high)
rng_rand_size = np.random.randint(0, range_size + 1)
rand_idx_size = rng_rand_size % (range_size + 1)
if rand_idx_size == range_size:
min_resize_val = scale_array[rand_idx_size] / 2.0
max_resize_val = min(2.0 * scale_array[rand_idx_size],
2 * math.sqrt(wid * hei))
scale_choose = random.uniform(min_resize_val, max_resize_val)
else:
min_resize_val = scale_array[rand_idx_size] / 2.0
max_resize_val = 2.0 * scale_array[rand_idx_size]
scale_choose = random.uniform(min_resize_val, max_resize_val)
sample_bbox_size = wid * resize_width / scale_choose
w_off_orig = 0.0
h_off_orig = 0.0
if sample_bbox_size < max(image_height, image_width):
if wid <= sample_bbox_size:
w_off_orig = np.random.uniform(xmin + wid - sample_bbox_size,
xmin)
else:
w_off_orig = np.random.uniform(xmin,
xmin + wid - sample_bbox_size)
if hei <= sample_bbox_size:
h_off_orig = np.random.uniform(ymin + hei - sample_bbox_size,
ymin)
else:
h_off_orig = np.random.uniform(ymin,
ymin + hei - sample_bbox_size)
else:
w_off_orig = np.random.uniform(image_width - sample_bbox_size, 0.0)
h_off_orig = np.random.uniform(image_height - sample_bbox_size, 0.0)
w_off_orig = math.floor(w_off_orig)
h_off_orig = math.floor(h_off_orig)
# Figure out top left coordinates.
w_off = float(w_off_orig / image_width)
h_off = float(h_off_orig / image_height)
sampled_bbox = [
w_off, h_off, w_off + float(sample_bbox_size / image_width),
h_off + float(sample_bbox_size / image_height)
]
return sampled_bbox
else:
return 0
def jaccard_overlap(sample_bbox, object_bbox):
if sample_bbox[0] >= object_bbox[2] or \
sample_bbox[2] <= object_bbox[0] or \
sample_bbox[1] >= object_bbox[3] or \
sample_bbox[3] <= object_bbox[1]:
return 0
intersect_xmin = max(sample_bbox[0], object_bbox[0])
intersect_ymin = max(sample_bbox[1], object_bbox[1])
intersect_xmax = min(sample_bbox[2], object_bbox[2])
intersect_ymax = min(sample_bbox[3], object_bbox[3])
intersect_size = (intersect_xmax - intersect_xmin) * (
intersect_ymax - intersect_ymin)
sample_bbox_size = bbox_area(sample_bbox)
object_bbox_size = bbox_area(object_bbox)
overlap = intersect_size / (
sample_bbox_size + object_bbox_size - intersect_size)
return overlap
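# jaccard_overlap above is plain IoU. A worked example with toy boxes:
#
#   a = [0.0, 0.0, 0.5, 0.5]       # area 0.25
#   b = [0.25, 0.25, 0.75, 0.75]   # area 0.25, intersection 0.25 * 0.25
#   # IoU = 0.0625 / (0.25 + 0.25 - 0.0625) = 0.0625 / 0.4375 ~ 0.143
#   assert abs(jaccard_overlap(a, b) - 0.0625 / 0.4375) < 1e-9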
def intersect_bbox(bbox1, bbox2):
if bbox2[0] > bbox1[2] or bbox2[2] < bbox1[0] or \
bbox2[1] > bbox1[3] or bbox2[3] < bbox1[1]:
intersection_box = [0.0, 0.0, 0.0, 0.0]
else:
intersection_box = [
max(bbox1[0], bbox2[0]), max(bbox1[1], bbox2[1]),
min(bbox1[2], bbox2[2]), min(bbox1[3], bbox2[3])
]
return intersection_box
def bbox_coverage(bbox1, bbox2):
inter_box = intersect_bbox(bbox1, bbox2)
intersect_size = bbox_area(inter_box)
if intersect_size > 0:
bbox1_size = bbox_area(bbox1)
return intersect_size / bbox1_size
else:
return 0.
def satisfy_sample_constraint(sampler,
sample_bbox,
gt_bboxes,
satisfy_all=False):
if sampler[6] == 0 and sampler[7] == 0:
return True
satisfied = []
for i in range(len(gt_bboxes)):
object_bbox = [
gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3]
]
overlap = jaccard_overlap(sample_bbox, object_bbox)
if sampler[6] != 0 and \
overlap < sampler[6]:
satisfied.append(False)
continue
if sampler[7] != 0 and \
overlap > sampler[7]:
satisfied.append(False)
continue
satisfied.append(True)
if not satisfy_all:
return True
if satisfy_all:
return np.all(satisfied)
else:
return False
def satisfy_sample_constraint_coverage(sampler, sample_bbox, gt_bboxes):
if sampler[6] == 0 and sampler[7] == 0:
has_jaccard_overlap = False
else:
has_jaccard_overlap = True
if sampler[8] == 0 and sampler[9] == 0:
has_object_coverage = False
else:
has_object_coverage = True
if not has_jaccard_overlap and not has_object_coverage:
return True
found = False
for i in range(len(gt_bboxes)):
object_bbox = [
gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3]
]
if has_jaccard_overlap:
overlap = jaccard_overlap(sample_bbox, object_bbox)
if sampler[6] != 0 and \
overlap < sampler[6]:
continue
if sampler[7] != 0 and \
overlap > sampler[7]:
continue
found = True
if has_object_coverage:
object_coverage = bbox_coverage(object_bbox, sample_bbox)
if sampler[8] != 0 and \
object_coverage < sampler[8]:
continue
if sampler[9] != 0 and \
object_coverage > sampler[9]:
continue
found = True
if found:
return True
return found
def crop_image_sampling(img, sample_bbox, image_width, image_height,
target_size):
# no clipping here
xmin = int(sample_bbox[0] * image_width)
xmax = int(sample_bbox[2] * image_width)
ymin = int(sample_bbox[1] * image_height)
ymax = int(sample_bbox[3] * image_height)
w_off = xmin
h_off = ymin
width = xmax - xmin
height = ymax - ymin
cross_xmin = max(0.0, float(w_off))
cross_ymin = max(0.0, float(h_off))
cross_xmax = min(float(w_off + width - 1.0), float(image_width))
cross_ymax = min(float(h_off + height - 1.0), float(image_height))
cross_width = cross_xmax - cross_xmin
cross_height = cross_ymax - cross_ymin
roi_xmin = 0 if w_off >= 0 else abs(w_off)
roi_ymin = 0 if h_off >= 0 else abs(h_off)
roi_width = cross_width
roi_height = cross_height
roi_y1 = int(roi_ymin)
roi_y2 = int(roi_ymin + roi_height)
roi_x1 = int(roi_xmin)
roi_x2 = int(roi_xmin + roi_width)
cross_y1 = int(cross_ymin)
cross_y2 = int(cross_ymin + cross_height)
cross_x1 = int(cross_xmin)
cross_x2 = int(cross_xmin + cross_width)
sample_img = np.zeros((height, width, 3))
sample_img[roi_y1: roi_y2, roi_x1: roi_x2] = \
img[cross_y1: cross_y2, cross_x1: cross_x2]
sample_img = cv2.resize(
sample_img, (target_size, target_size), interpolation=cv2.INTER_AREA)
return sample_img
def is_poly(segm):
assert isinstance(segm, (list, dict)), \
"Invalid segm type: {}".format(type(segm))
return isinstance(segm, list)
def gaussian_radius(bbox_size, min_overlap):
height, width = bbox_size
a1 = 1
b1 = (height + width)
c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
sq1 = np.sqrt(b1**2 - 4 * a1 * c1)
radius1 = (b1 + sq1) / (2 * a1)
a2 = 4
b2 = 2 * (height + width)
c2 = (1 - min_overlap) * width * height
sq2 = np.sqrt(b2**2 - 4 * a2 * c2)
radius2 = (b2 + sq2) / 2
a3 = 4 * min_overlap
b3 = -2 * min_overlap * (height + width)
c3 = (min_overlap - 1) * width * height
sq3 = np.sqrt(b3**2 - 4 * a3 * c3)
radius3 = (b3 + sq3) / 2
return min(radius1, radius2, radius3)
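# gaussian_radius solves three quadratics (the corner cases from CornerNet)
# and keeps the smallest root, so that any predicted center within the
# radius still reaches at least min_overlap IoU with the ground-truth box.
# Example usage with a hypothetical 32 x 64 box:
#
#   r = gaussian_radius((32, 64), min_overlap=0.7)
#   r = max(0, int(r))  # callers above clamp and truncate the same way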
def draw_gaussian(heatmap, center, radius, k=1, delte=6):
diameter = 2 * radius + 1
sigma = diameter / delte
gaussian = gaussian2D((diameter, diameter), sigma_x=sigma, sigma_y=sigma)
x, y = center
height, width = heatmap.shape[0:2]
left, right = min(x, radius), min(width - x, radius + 1)
top, bottom = min(y, radius), min(height - y, radius + 1)
masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:
radius + right]
np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap)
def gaussian2D(shape, sigma_x=1, sigma_y=1):
m, n = [(ss - 1.) / 2. for ss in shape]
y, x = np.ogrid[-m:m + 1, -n:n + 1]
h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y *
sigma_y)))
h[h < np.finfo(h.dtype).eps * h.max()] = 0
return h
def draw_umich_gaussian(heatmap, center, radius, k=1):
"""
draw_umich_gaussian, refer to https://github.com/xingyizhou/CenterNet/blob/master/src/lib/utils/image.py#L126
"""
diameter = 2 * radius + 1
gaussian = gaussian2D(
(diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6)
x, y = int(center[0]), int(center[1])
height, width = heatmap.shape[0:2]
left, right = min(x, radius), min(width - x, radius + 1)
top, bottom = min(y, radius), min(height - y, radius + 1)
masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:
radius + right]
if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0:
np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap)
return heatmap
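# Rendering a single center onto an empty heatmap with the helper above;
# shapes and values are illustrative only:
#
#   import numpy as np
#   heatmap = np.zeros((96, 96), dtype=np.float32)
#   draw_umich_gaussian(heatmap, center=(48, 48), radius=5)
#   assert heatmap[48, 48] == 1.0  # the peak sits at the integer center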
def get_border(border, size):
i = 1
while size - border // i <= border // i:
i *= 2
return border // i
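# get_border halves the requested border by powers of two until the usable
# interior (size minus two borders) is non-empty. For example:
#
#   assert get_border(128, 512) == 128  # 512 - 128 > 128, keep as-is
#   assert get_border(128, 200) == 64   # 200 - 128 <= 128, halve once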
================================================
FILE: ppdet/data/transform/operators.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# function:
# operators to process sample,
# eg: decode/resize/crop image
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
from numbers import Number, Integral
import uuid
import random
import math
import numpy as np
import os
import copy
import logging
import cv2
from PIL import Image, ImageDraw, ImageEnhance
from pycocotools import mask
import pickle
import threading
MUTEX = threading.Lock()
import paddle
from ppdet.core.workspace import serializable
from ..reader import Compose
from .op_helper import (satisfy_sample_constraint, filter_and_process,
generate_sample_bbox, clip_bbox, data_anchor_sampling,
satisfy_sample_constraint_coverage, crop_image_sampling,
generate_sample_bbox_square, bbox_area_sampling,
is_poly, get_border)
from ppdet.utils.logger import setup_logger
from ppdet.utils.compact import imagedraw_textsize_c
from ppdet.modeling.keypoint_utils import get_affine_transform, affine_transform
logger = setup_logger(__name__)
registered_ops = []
def register_op(cls):
registered_ops.append(cls.__name__)
if not hasattr(BaseOperator, cls.__name__):
setattr(BaseOperator, cls.__name__, cls)
else:
raise KeyError("The {} class has been registered.".format(cls.__name__))
return serializable(cls)
class BboxError(ValueError):
pass
class ImageError(ValueError):
pass
class BaseOperator(object):
def __init__(self, name=None):
if name is None:
name = self.__class__.__name__
self._id = name + '_' + str(uuid.uuid4())[-6:]
def apply(self, sample, context=None):
""" Process a sample.
Args:
sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}
context (dict): info about this sample processing
Returns:
result (dict): a processed sample
"""
return sample
def __call__(self, sample, context=None):
""" Process a sample.
Args:
sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}
context (dict): info about this sample processing
Returns:
result (dict): a processed sample
"""
if isinstance(sample, Sequence):
for i in range(len(sample)):
sample[i] = self.apply(sample[i], context)
else:
sample = self.apply(sample, context)
return sample
def __str__(self):
return str(self._id)
@register_op
class Decode(BaseOperator):
def __init__(self, rtn_im_file=False):
""" Transform the image data to numpy format following the rgb format
"""
super(Decode, self).__init__()
self.rtn_im_file = rtn_im_file
def apply(self, sample, context=None):
""" load image if 'im_file' field is not empty but 'image' is"""
if 'image' not in sample:
with open(sample['im_file'], 'rb') as f:
sample['image'] = f.read()
if not self.rtn_im_file:
sample.pop('im_file')
try:
im = sample['image']
data = np.frombuffer(im, dtype='uint8')
im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode
if 'keep_ori_im' in sample and sample['keep_ori_im']:
sample['ori_image'] = im
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
except Exception:
im = sample['image']
sample['image'] = im
if 'h' not in sample:
sample['h'] = im.shape[0]
elif sample['h'] != im.shape[0]:
logger.warning(
"The actual image height: {} is not equal to the "
"height: {} in annotation, and update sample['h'] by actual "
"image height.".format(im.shape[0], sample['h']))
sample['h'] = im.shape[0]
if 'w' not in sample:
sample['w'] = im.shape[1]
elif sample['w'] != im.shape[1]:
logger.warning(
"The actual image width: {} is not equal to the "
"width: {} in annotation, and update sample['w'] by actual "
"image width.".format(im.shape[1], sample['w']))
sample['w'] = im.shape[1]
sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32)
sample['scale_factor'] = np.array([1., 1.], dtype=np.float32)
return sample
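# A minimal sketch of how these operators compose into a pipeline; the image
# path is hypothetical, and each op mutates the sample dict in place and
# returns it, so they chain naturally:
#
#   sample = {'im_file': 'demo.jpg'}  # hypothetical local image
#   for op in [Decode(), Resize([640, 640], keep_ratio=True), Permute()]:
#       sample = op(sample)
#   # sample['image'] is now a CHW array; 'im_shape' and 'scale_factor'
#   # record the resize for later post-processing.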
def _make_dirs(dirname):
try:
from pathlib import Path
except ImportError:
from pathlib2 import Path
Path(dirname).mkdir(exist_ok=True)
@register_op
class DecodeCache(BaseOperator):
def __init__(self, cache_root=None):
'''Decode the image and cache the decoded array on disk.
'''
super(DecodeCache, self).__init__()
self.use_cache = False if cache_root is None else True
self.cache_root = cache_root
if cache_root is not None:
_make_dirs(cache_root)
def apply(self, sample, context=None):
if self.use_cache and os.path.exists(
self.cache_path(self.cache_root, sample['im_file'])):
path = self.cache_path(self.cache_root, sample['im_file'])
im = self.load(path)
else:
if 'image' not in sample:
with open(sample['im_file'], 'rb') as f:
sample['image'] = f.read()
im = sample['image']
data = np.frombuffer(im, dtype='uint8')
im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode
if 'keep_ori_im' in sample and sample['keep_ori_im']:
sample['ori_image'] = im
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
if self.use_cache and not os.path.exists(
self.cache_path(self.cache_root, sample['im_file'])):
path = self.cache_path(self.cache_root, sample['im_file'])
self.dump(im, path)
sample['image'] = im
sample['h'] = im.shape[0]
sample['w'] = im.shape[1]
sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32)
sample['scale_factor'] = np.array([1., 1.], dtype=np.float32)
sample.pop('im_file')
return sample
@staticmethod
def cache_path(dir_root, im_file):
return os.path.join(dir_root, os.path.basename(im_file) + '.pkl')
@staticmethod
def load(path):
with open(path, 'rb') as f:
im = pickle.load(f)
return im
@staticmethod
def dump(obj, path):
MUTEX.acquire()
try:
with open(path, 'wb') as f:
pickle.dump(obj, f)
except Exception as e:
logger.warning('dump {} occurs exception {}'.format(path, str(e)))
finally:
MUTEX.release()
@register_op
class SniperDecodeCrop(BaseOperator):
def __init__(self):
super(SniperDecodeCrop, self).__init__()
def __call__(self, sample, context=None):
if 'image' not in sample:
with open(sample['im_file'], 'rb') as f:
sample['image'] = f.read()
sample.pop('im_file')
im = sample['image']
data = np.frombuffer(im, dtype='uint8')
im = cv2.imdecode(data, cv2.IMREAD_COLOR) # BGR mode, but need RGB mode
if 'keep_ori_im' in sample and sample['keep_ori_im']:
sample['ori_image'] = im
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
chip = sample['chip']
x1, y1, x2, y2 = [int(xi) for xi in chip]
im = im[max(y1, 0):min(y2, im.shape[0]), max(x1, 0):min(x2, im.shape[
1]), :]
sample['image'] = im
h = im.shape[0]
w = im.shape[1]
# sample['im_info'] = [h, w, 1.0]
sample['h'] = h
sample['w'] = w
sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32)
sample['scale_factor'] = np.array([1., 1.], dtype=np.float32)
return sample
@register_op
class Permute(BaseOperator):
def __init__(self):
"""
Change the data layout from (H, W, C) to (C, H, W)
"""
super(Permute, self).__init__()
def apply(self, sample, context=None):
im = sample['image']
im = im.transpose((2, 0, 1))
sample['image'] = im
if 'pre_image' in sample:
pre_im = sample['pre_image']
pre_im = pre_im.transpose((2, 0, 1))
sample['pre_image'] = pre_im
return sample
@register_op
class Lighting(BaseOperator):
"""
Lighting the image by eigenvalues and eigenvectors
Args:
eigval (list): eigenvalues
eigvec (list): eigenvectors
alphastd (float): random weight of lighting, 0.1 by default
"""
def __init__(self, eigval, eigvec, alphastd=0.1):
super(Lighting, self).__init__()
self.alphastd = alphastd
self.eigval = np.array(eigval).astype('float32')
self.eigvec = np.array(eigvec).astype('float32')
def apply(self, sample, context=None):
alpha = np.random.normal(scale=self.alphastd, size=(3, ))
sample['image'] += np.dot(self.eigvec, self.eigval * alpha)
if 'pre_image' in sample:
sample['pre_image'] += np.dot(self.eigvec, self.eigval * alpha)
return sample
@register_op
class RandomErasingImage(BaseOperator):
def __init__(self, prob=0.5, lower=0.02, higher=0.4, aspect_ratio=0.3):
"""
Random Erasing Data Augmentation, see https://arxiv.org/abs/1708.04896
Args:
prob (float): probability to carry out random erasing
lower (float): lower limit of the erasing area ratio
higher (float): upper limit of the erasing area ratio
aspect_ratio (float): aspect ratio of the erasing region
"""
super(RandomErasingImage, self).__init__()
self.prob = prob
self.lower = lower
self.higher = higher
self.aspect_ratio = aspect_ratio
def apply(self, sample, context=None):
gt_bbox = sample['gt_bbox']
im = sample['image']
if not isinstance(im, np.ndarray):
raise TypeError("{}: image is not a numpy array.".format(self))
if len(im.shape) != 3:
raise ImageError("{}: image is not 3-dimensional.".format(self))
for idx in range(gt_bbox.shape[0]):
if self.prob <= np.random.rand():
continue
x1, y1, x2, y2 = gt_bbox[idx, :]
w_bbox = x2 - x1
h_bbox = y2 - y1
area = w_bbox * h_bbox
target_area = random.uniform(self.lower, self.higher) * area
aspect_ratio = random.uniform(self.aspect_ratio,
1 / self.aspect_ratio)
h = int(round(math.sqrt(target_area * aspect_ratio)))
w = int(round(math.sqrt(target_area / aspect_ratio)))
if w < w_bbox and h < h_bbox:
off_y1 = random.randint(0, int(h_bbox - h))
off_x1 = random.randint(0, int(w_bbox - w))
im[int(y1 + off_y1):int(y1 + off_y1 + h), int(x1 + off_x1):int(
x1 + off_x1 + w), :] = 0
sample['image'] = im
return sample
@register_op
class NormalizeImage(BaseOperator):
def __init__(self,
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
is_scale=True,
norm_type='mean_std'):
"""
Args:
mean (list): the pixel mean
std (list): the pixel standard deviation
is_scale (bool): scale the pixel to [0,1]
norm_type (str): type in ['mean_std', 'none']
"""
super(NormalizeImage, self).__init__()
self.mean = mean
self.std = std
self.is_scale = is_scale
self.norm_type = norm_type
if not (isinstance(self.mean, list) and isinstance(self.std, list) and
isinstance(self.is_scale, bool) and
self.norm_type in ['mean_std', 'none']):
raise TypeError("{}: input type is invalid.".format(self))
from functools import reduce
if reduce(lambda x, y: x * y, self.std) == 0:
raise ValueError('{}: std is invalid!'.format(self))
def apply(self, sample, context=None):
"""Normalize the image.
Operators:
1.(optional) Scale the pixel to [0,1]
2.(optional) Each pixel minus mean and is divided by std
"""
im = sample['image']
im = im.astype(np.float32, copy=False)
if self.is_scale:
scale = 1.0 / 255.0
im *= scale
if self.norm_type == 'mean_std':
mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
std = np.array(self.std)[np.newaxis, np.newaxis, :]
im -= mean
im /= std
sample['image'] = im
if 'pre_image' in sample:
pre_im = sample['pre_image']
pre_im = pre_im.astype(np.float32, copy=False)
if self.is_scale:
scale = 1.0 / 255.0
pre_im *= scale
if self.norm_type == 'mean_std':
mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
std = np.array(self.std)[np.newaxis, np.newaxis, :]
pre_im -= mean
pre_im /= std
sample['pre_image'] = pre_im
return sample
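# The normalization above, written out for a single RGB pixel with the
# default ImageNet statistics: scale to [0, 1], subtract the mean, divide
# by the std, channel-wise:
#
#   import numpy as np
#   pixel = np.array([128., 64., 192.], dtype=np.float32)
#   mean = np.array([0.485, 0.456, 0.406])
#   std = np.array([0.229, 0.224, 0.225])
#   out = (pixel / 255.0 - mean) / std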
@register_op
class GridMask(BaseOperator):
def __init__(self,
use_h=True,
use_w=True,
rotate=1,
offset=False,
ratio=0.5,
mode=1,
prob=0.7,
upper_iter=360000):
"""
GridMask Data Augmentation, see https://arxiv.org/abs/2001.04086
Args:
use_h (bool): whether to mask vertically
use_w (bool): whether to mask horizontally
rotate (float): angle for the mask to rotate
offset (float): mask offset
ratio (float): mask ratio
mode (int): gridmask mode
prob (float): max probability to carry out gridmask
upper_iter (int): suggested to be equal to global max_iter
"""
super(GridMask, self).__init__()
self.use_h = use_h
self.use_w = use_w
self.rotate = rotate
self.offset = offset
self.ratio = ratio
self.mode = mode
self.prob = prob
self.upper_iter = upper_iter
from .gridmask_utils import Gridmask
self.gridmask_op = Gridmask(
use_h,
use_w,
rotate=rotate,
offset=offset,
ratio=ratio,
mode=mode,
prob=prob,
upper_iter=upper_iter)
def apply(self, sample, context=None):
sample['image'] = self.gridmask_op(sample['image'], sample['curr_iter'])
return sample
@register_op
class RandomDistort(BaseOperator):
"""Random color distortion.
Args:
hue (list): hue settings. in [lower, upper, probability] format.
saturation (list): saturation settings. in [lower, upper, probability] format.
contrast (list): contrast settings. in [lower, upper, probability] format.
brightness (list): brightness settings. in [lower, upper, probability] format.
random_apply (bool): whether to apply in random (yolo) or fixed (SSD) order.
count (int): the number of distortion ops to apply.
random_channel (bool): whether to swap channels randomly.
prob (float): the probability of enhancing the sample.
"""
def __init__(self,
hue=[-18, 18, 0.5],
saturation=[0.5, 1.5, 0.5],
contrast=[0.5, 1.5, 0.5],
brightness=[0.5, 1.5, 0.5],
random_apply=True,
count=4,
random_channel=False,
prob=1.0):
super(RandomDistort, self).__init__()
self.hue = hue
self.saturation = saturation
self.contrast = contrast
self.brightness = brightness
self.random_apply = random_apply
self.count = count
self.random_channel = random_channel
self.prob = prob
def apply_hue(self, img):
low, high, prob = self.hue
if np.random.uniform(0., 1.) < prob:
return img
delta = np.random.uniform(low, high)
img = np.array(img.convert('HSV'))
img[:, :, 0] = img[:, :, 0] + delta
img = Image.fromarray(img, mode='HSV').convert('RGB')
return img
def apply_saturation(self, img):
low, high, prob = self.saturation
if np.random.uniform(0., 1.) < prob:
return img
delta = np.random.uniform(low, high)
img = ImageEnhance.Color(img).enhance(delta)
return img
def apply_contrast(self, img):
low, high, prob = self.contrast
if np.random.uniform(0., 1.) < prob:
return img
delta = np.random.uniform(low, high)
img = ImageEnhance.Contrast(img).enhance(delta)
return img
def apply_brightness(self, img):
low, high, prob = self.brightness
if np.random.uniform(0., 1.) < prob:
return img
delta = np.random.uniform(low, high)
img = ImageEnhance.Brightness(img).enhance(delta)
return img
def apply(self, sample, context=None):
if random.random() > self.prob:
return sample
img = sample['image']
img = Image.fromarray(img.astype(np.uint8))
if self.random_apply:
functions = [
self.apply_brightness, self.apply_contrast,
self.apply_saturation, self.apply_hue
]
distortions = np.random.permutation(functions)[:self.count]
for func in distortions:
img = func(img)
img = np.asarray(img).astype(np.float32)
sample['image'] = img
return sample
img = self.apply_brightness(img)
mode = np.random.randint(0, 2)
if mode:
img = self.apply_contrast(img)
img = self.apply_saturation(img)
img = self.apply_hue(img)
if not mode:
img = self.apply_contrast(img)
img = np.asarray(img).astype(np.float32)
if self.random_channel:
if np.random.randint(0, 2):
img = img[..., np.random.permutation(3)]
sample['image'] = img
return sample
@register_op
class PhotoMetricDistortion(BaseOperator):
"""Apply photometric distortion to image sequentially, every transformation
is applied with a probability of 0.5. The position of random contrast is in
second or second to last.
1. random brightness
2. random contrast (mode 0)
3. convert color from BGR to HSV
4. random saturation
5. random hue
6. convert color from HSV to BGR
7. random contrast (mode 1)
8. randomly swap channels
Args:
brightness_delta (int): delta of brightness.
contrast_range (tuple): range of contrast.
saturation_range (tuple): range of saturation.
hue_delta (int): delta of hue.
"""
def __init__(self,
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18):
super(PhotoMetricDistortion, self).__init__()
self.brightness_delta = brightness_delta
self.contrast_lower, self.contrast_upper = contrast_range
self.saturation_lower, self.saturation_upper = saturation_range
self.hue_delta = hue_delta
def apply(self, results, context=None):
"""Call function to perform photometric distortion on images.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Result dict with images distorted.
"""
img = results['image']
img = img.astype(np.float32)
# random brightness
if np.random.randint(2):
delta = np.random.uniform(-self.brightness_delta,
self.brightness_delta)
img += delta
# mode == 0 --> do random contrast first
# mode == 1 --> do random contrast last
mode = np.random.randint(2)
if mode == 1:
if np.random.randint(2):
alpha = np.random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# convert color from BGR to HSV
img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
# random saturation
if np.random.randint(2):
img[..., 1] *= np.random.uniform(self.saturation_lower,
self.saturation_upper)
# random hue
if np.random.randint(2):
img[..., 0] += np.random.uniform(-self.hue_delta, self.hue_delta)
img[..., 0][img[..., 0] > 360] -= 360
img[..., 0][img[..., 0] < 0] += 360
# convert color from HSV to BGR
img = cv2.cvtColor(img, cv2.COLOR_HSV2BGR)
# random contrast
if mode == 0:
if np.random.randint(2):
alpha = np.random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# randomly swap channels
if np.random.randint(2):
img = img[..., np.random.permutation(3)]
results['image'] = img
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(\nbrightness_delta={self.brightness_delta},\n'
repr_str += 'contrast_range='
repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n'
repr_str += 'saturation_range='
repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n'
repr_str += f'hue_delta={self.hue_delta})'
return repr_str
@register_op
class AutoAugment(BaseOperator):
def __init__(self, autoaug_type="v1"):
"""
Args:
autoaug_type (str): autoaug type, support v0, v1, v2, v3, test
"""
super(AutoAugment, self).__init__()
self.autoaug_type = autoaug_type
def apply(self, sample, context=None):
"""
Learning Data Augmentation Strategies for Object Detection, see https://arxiv.org/abs/1906.11172
"""
im = sample['image']
gt_bbox = sample['gt_bbox']
if not isinstance(im, np.ndarray):
raise TypeError("{}: image is not a numpy array.".format(self))
if len(im.shape) != 3:
raise ImageError("{}: image is not 3-dimensional.".format(self))
if len(gt_bbox) == 0:
return sample
height, width, _ = im.shape
norm_gt_bbox = np.ones_like(gt_bbox, dtype=np.float32)
norm_gt_bbox[:, 0] = gt_bbox[:, 1] / float(height)
norm_gt_bbox[:, 1] = gt_bbox[:, 0] / float(width)
norm_gt_bbox[:, 2] = gt_bbox[:, 3] / float(height)
norm_gt_bbox[:, 3] = gt_bbox[:, 2] / float(width)
from .autoaugment_utils import distort_image_with_autoaugment
im, norm_gt_bbox = distort_image_with_autoaugment(im, norm_gt_bbox,
self.autoaug_type)
gt_bbox[:, 0] = norm_gt_bbox[:, 1] * float(width)
gt_bbox[:, 1] = norm_gt_bbox[:, 0] * float(height)
gt_bbox[:, 2] = norm_gt_bbox[:, 3] * float(width)
gt_bbox[:, 3] = norm_gt_bbox[:, 2] * float(height)
sample['image'] = im
sample['gt_bbox'] = gt_bbox
return sample
@register_op
class RandomFlip(BaseOperator):
def __init__(self, prob=0.5):
"""
Args:
prob (float): the probability of flipping image
"""
super(RandomFlip, self).__init__()
self.prob = prob
if not (isinstance(self.prob, float)):
raise TypeError("{}: input type is invalid.".format(self))
def apply_segm(self, segms, height, width):
def _flip_poly(poly, width):
flipped_poly = np.array(poly)
flipped_poly[0::2] = width - np.array(poly[0::2])
return flipped_poly.tolist()
def _flip_rle(rle, height, width):
if 'counts' in rle and type(rle['counts']) == list:
rle = mask_util.frPyObjects(rle, height, width)
mask = mask_util.decode(rle)
mask = mask[:, ::-1]
rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
return rle
flipped_segms = []
for segm in segms:
if is_poly(segm):
# Polygon format
flipped_segms.append([_flip_poly(poly, width) for poly in segm])
else:
# RLE format
import pycocotools.mask as mask_util
flipped_segms.append(_flip_rle(segm, height, width))
return flipped_segms
def apply_keypoint(self, gt_keypoint, width):
for i in range(gt_keypoint.shape[1]):
if i % 2 == 0:
old_x = gt_keypoint[:, i].copy()
gt_keypoint[:, i] = width - old_x
return gt_keypoint
def apply_image(self, image):
return image[:, ::-1, :]
def apply_bbox(self, bbox, width):
oldx1 = bbox[:, 0].copy()
oldx2 = bbox[:, 2].copy()
bbox[:, 0] = width - oldx2
bbox[:, 2] = width - oldx1
return bbox
def apply(self, sample, context=None):
"""Filp the image and bounding box.
Operators:
1. Flip the image numpy.
2. Transform the bboxes' x coordinates.
(Must judge whether the coordinates are normalized!)
3. Transform the segmentations' x coordinates.
(Must judge whether the coordinates are normalized!)
Output:
sample: the image, bounding box and segmentation part
in sample are flipped.
"""
if np.random.uniform(0, 1) < self.prob:
im = sample['image']
height, width = im.shape[:2]
im = self.apply_image(im)
if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], width)
if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
sample['gt_poly'] = self.apply_segm(sample['gt_poly'], height,
width)
if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0:
sample['gt_keypoint'] = self.apply_keypoint(
sample['gt_keypoint'], width)
if 'semantic' in sample and sample['semantic']:
sample['semantic'] = sample['semantic'][:, ::-1]
if 'gt_segm' in sample and sample['gt_segm'].any():
sample['gt_segm'] = sample['gt_segm'][:, :, ::-1]
sample['flipped'] = True
sample['image'] = im
return sample
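# The horizontal-flip bbox transform used above, as a standalone example:
# x coordinates are mirrored around the image width, and x1/x2 swap roles
# so the box stays well-formed. Toy values:
#
#   import numpy as np
#   width = 100
#   bbox = np.array([[10., 20., 40., 60.]])  # x1, y1, x2, y2
#   x1, x2 = bbox[:, 0].copy(), bbox[:, 2].copy()
#   bbox[:, 0], bbox[:, 2] = width - x2, width - x1
#   # bbox -> [[60., 20., 90., 60.]]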
@register_op
class Resize(BaseOperator):
def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR):
"""
Resize image to target size. if keep_ratio is True,
resize the image's long side to the maximum of target_size
if keep_ratio is False, resize the image to target size(h, w)
Args:
target_size (int|list): image target size
keep_ratio (bool): whether keep_ratio or not, default true
interp (int): the interpolation method
"""
super(Resize, self).__init__()
self.keep_ratio = keep_ratio
self.interp = interp
if not isinstance(target_size, (Integral, Sequence)):
raise TypeError(
"Type of target_size is invalid. Must be Integer or List or Tuple, now is {}".
format(type(target_size)))
if isinstance(target_size, Integral):
target_size = [target_size, target_size]
self.target_size = target_size
def apply_image(self, image, scale):
im_scale_x, im_scale_y = scale
return cv2.resize(
image,
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=self.interp)
def apply_bbox(self, bbox, scale, size):
im_scale_x, im_scale_y = scale
resize_w, resize_h = size
bbox[:, 0::2] *= im_scale_x
bbox[:, 1::2] *= im_scale_y
bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w)
bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h)
return bbox
def apply_area(self, area, scale):
im_scale_x, im_scale_y = scale
return area * im_scale_x * im_scale_y
def apply_joints(self, joints, scale, size):
im_scale_x, im_scale_y = scale
resize_w, resize_h = size
joints[..., 0] *= im_scale_x
joints[..., 1] *= im_scale_y
joints[..., 0] = np.clip(joints[..., 0], 0, resize_w)
joints[..., 1] = np.clip(joints[..., 1], 0, resize_h)
return joints
def apply_segm(self, segms, im_size, scale):
def _resize_poly(poly, im_scale_x, im_scale_y):
resized_poly = np.array(poly).astype('float32')
resized_poly[0::2] *= im_scale_x
resized_poly[1::2] *= im_scale_y
return resized_poly.tolist()
def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y):
if 'counts' in rle and type(rle['counts']) == list:
rle = mask_util.frPyObjects(rle, im_h, im_w)
mask = mask_util.decode(rle)
mask = cv2.resize(
mask,
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=self.interp)
rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
return rle
im_h, im_w = im_size
im_scale_x, im_scale_y = scale
resized_segms = []
for segm in segms:
if is_poly(segm):
# Polygon format
resized_segms.append([
_resize_poly(poly, im_scale_x, im_scale_y) for poly in segm
])
else:
# RLE format
import pycocotools.mask as mask_util
resized_segms.append(
_resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y))
return resized_segms
def apply(self, sample, context=None):
""" Resize the image numpy.
"""
im = sample['image']
if not isinstance(im, np.ndarray):
raise TypeError("{}: image type is not numpy.".format(self))
# apply image
if len(im.shape) == 3:
im_shape = im.shape
else:
im_shape = im[0].shape
if self.keep_ratio:
im_size_min = np.min(im_shape[0:2])
im_size_max = np.max(im_shape[0:2])
target_size_min = np.min(self.target_size)
target_size_max = np.max(self.target_size)
im_scale = min(target_size_min / im_size_min,
target_size_max / im_size_max)
resize_h = int(im_scale * float(im_shape[0]) + 0.5)
resize_w = int(im_scale * float(im_shape[1]) + 0.5)
else:
resize_h, resize_w = self.target_size
im_scale_y = resize_h / im_shape[0]
im_scale_x = resize_w / im_shape[1]
if len(im.shape) == 3:
im = self.apply_image(sample['image'], [im_scale_x, im_scale_y])
sample['image'] = im.astype(np.float32)
else:
resized_images = []
for one_im in im:
applied_im = self.apply_image(one_im, [im_scale_x, im_scale_y])
resized_images.append(applied_im)
sample['image'] = np.array(resized_images)
# 2d keypoints resize
if 'kps2d' in sample.keys():
kps2d = sample['kps2d']
kps2d[:, :, 0] = kps2d[:, :, 0] * im_scale_x
kps2d[:, :, 1] = kps2d[:, :, 1] * im_scale_y
sample['kps2d'] = kps2d
sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)
if 'scale_factor' in sample:
scale_factor = sample['scale_factor']
sample['scale_factor'] = np.asarray(
[scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
dtype=np.float32)
else:
sample['scale_factor'] = np.asarray(
[im_scale_y, im_scale_x], dtype=np.float32)
# apply bbox
if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'],
[im_scale_x, im_scale_y],
[resize_w, resize_h])
# apply areas
if 'gt_areas' in sample:
sample['gt_areas'] = self.apply_area(sample['gt_areas'],
[im_scale_x, im_scale_y])
# apply polygon
if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2],
[im_scale_x, im_scale_y])
# apply semantic
if 'semantic' in sample and sample['semantic']:
semantic = sample['semantic']
semantic = cv2.resize(
semantic.astype('float32'),
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=self.interp)
semantic = np.asarray(semantic).astype('int32')
semantic = np.expand_dims(semantic, 0)
sample['semantic'] = semantic
# apply gt_segm
if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
masks = [
cv2.resize(
gt_segm,
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=cv2.INTER_NEAREST)
for gt_segm in sample['gt_segm']
]
sample['gt_segm'] = np.asarray(masks).astype(np.uint8)
if 'gt_joints' in sample:
sample['gt_joints'] = self.apply_joints(sample['gt_joints'],
[im_scale_x, im_scale_y],
[resize_w, resize_h])
return sample
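# How the keep_ratio branch above picks a single scale: the short side is
# pushed toward min(target_size) while the long side is capped at
# max(target_size), whichever constraint is tighter. Example numbers:
#
#   im_h, im_w = 480, 640
#   target = (800, 1333)
#   scale = min(min(target) / min(im_h, im_w), max(target) / max(im_h, im_w))
#   resize_h = int(scale * im_h + 0.5)  # 800
#   resize_w = int(scale * im_w + 0.5)  # 1067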
@register_op
class MultiscaleTestResize(BaseOperator):
def __init__(self,
origin_target_size=[800, 1333],
target_size=[],
interp=cv2.INTER_LINEAR,
use_flip=True):
"""
Rescale the image to each size in target_size, keeping the aspect ratio.
Args:
origin_target_size (list): origin target size of image
target_size (list): A list of target sizes of image.
interp (int): the interpolation method.
use_flip (bool): whether use flip augmentation.
"""
super(MultiscaleTestResize, self).__init__()
self.interp = interp
self.use_flip = use_flip
if not isinstance(target_size, Sequence):
raise TypeError(
"Type of target_size is invalid. Must be List or Tuple, now is {}".
format(type(target_size)))
self.target_size = target_size
if not isinstance(origin_target_size, Sequence):
raise TypeError(
"Type of origin_target_size is invalid. Must be List or Tuple, now is {}".
format(type(origin_target_size)))
self.origin_target_size = origin_target_size
def apply(self, sample, context=None):
""" Resize the image numpy for multi-scale test.
"""
samples = []
resizer = Resize(
self.origin_target_size, keep_ratio=True, interp=self.interp)
samples.append(resizer(sample.copy(), context))
if self.use_flip:
flipper = RandomFlip(1.1)  # prob > 1, so the flip is always applied
samples.append(flipper(sample.copy(), context=context))
for size in self.target_size:
resizer = Resize(size, keep_ratio=True, interp=self.interp)
samples.append(resizer(sample.copy(), context))
return samples
@register_op
class RandomResize(BaseOperator):
def __init__(self,
target_size,
keep_ratio=True,
interp=cv2.INTER_LINEAR,
random_range=False,
random_size=True,
random_interp=False):
"""
Resize image to target size randomly. random target_size and interpolation method
Args:
target_size (int, list, tuple): image target size, if random size is True, must be list or tuple
keep_ratio (bool): whether keep_raio or not, default true
interp (int): the interpolation method
random_range (bool): whether random select target size of image, the target_size must be
a [[min_short_edge, long_edge], [max_short_edge, long_edge]]
random_size (bool): whether random select target size of image
random_interp (bool): whether random select interpolation method
"""
super(RandomResize, self).__init__()
self.keep_ratio = keep_ratio
self.interp = interp
self.interps = [
cv2.INTER_NEAREST,
cv2.INTER_LINEAR,
cv2.INTER_AREA,
cv2.INTER_CUBIC,
cv2.INTER_LANCZOS4,
]
assert isinstance(target_size, (
Integral, Sequence)), "target_size must be Integer, List or Tuple"
if (random_range or random_size) and not isinstance(target_size,
Sequence):
raise TypeError(
"Type of target_size is invalid when random_size or random_range is True. Must be List or Tuple, now is {}".
format(type(target_size)))
if random_range and not len(target_size) == 2:
raise TypeError(
"target_size must be two list as [[min_short_edge, long_edge], [max_short_edge, long_edge]] when random_range is True."
)
self.target_size = target_size
self.random_range = random_range
self.random_size = random_size
self.random_interp = random_interp
def apply(self, sample, context=None):
""" Resize the image numpy.
"""
if self.random_range:
short_edge = np.random.randint(self.target_size[0][0],
self.target_size[1][0] + 1)
long_edge = max(self.target_size[0][1], self.target_size[1][1] + 1)
target_size = [short_edge, long_edge]
else:
if self.random_size:
target_size = random.choice(self.target_size)
else:
target_size = self.target_size
if self.random_interp:
interp = random.choice(self.interps)
else:
interp = self.interp
resizer = Resize(target_size, self.keep_ratio, interp)
return resizer(sample, context=context)
@register_op
class RandomExpand(BaseOperator):
"""Random expand the canvas.
Args:
ratio (float): maximum expansion ratio.
prob (float): probability to expand.
fill_value (list): color value used to fill the canvas, in RGB order.
"""
def __init__(self, ratio=4., prob=0.5, fill_value=(127.5, 127.5, 127.5)):
super(RandomExpand, self).__init__()
assert ratio > 1.01, "expand ratio must be larger than 1.01"
self.ratio = ratio
self.prob = prob
assert isinstance(fill_value, (Number, Sequence)), \
"fill value must be either float or sequence"
if isinstance(fill_value, Number):
fill_value = (fill_value, ) * 3
if not isinstance(fill_value, tuple):
fill_value = tuple(fill_value)
self.fill_value = fill_value
def apply(self, sample, context=None):
if np.random.uniform(0., 1.) < self.prob:
return sample
im = sample['image']
height, width = im.shape[:2]
ratio = np.random.uniform(1., self.ratio)
h = int(height * ratio)
w = int(width * ratio)
if not h > height or not w > width:
return sample
y = np.random.randint(0, h - height)
x = np.random.randint(0, w - width)
offsets, size = [x, y], [h, w]
pad = Pad(size,
pad_mode=-1,
offsets=offsets,
fill_value=self.fill_value)
return pad(sample, context=context)
@register_op
class CropWithSampling(BaseOperator):
def __init__(self, batch_sampler, satisfy_all=False, avoid_no_bbox=True):
"""
Args:
batch_sampler (list): Multiple sets of different
parameters for cropping.
satisfy_all (bool): whether all boxes must satisfy.
e.g.[[1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0],
[1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0],
[1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 1.0],
[1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 1.0],
[1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 1.0],
[1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 1.0],
[1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]]
[max sample, max trial, min scale, max scale,
min aspect ratio, max aspect ratio,
min overlap, max overlap]
avoid_no_bbox (bool): whether to avoid the
situation where the box does not appear.
"""
super(CropWithSampling, self).__init__()
self.batch_sampler = batch_sampler
self.satisfy_all = satisfy_all
self.avoid_no_bbox = avoid_no_bbox
def apply(self, sample, context):
"""
Crop the image and modify bounding box.
Operators:
1. Scale the image width and height.
2. Crop the image according to a radom sample.
3. Rescale the bounding box.
4. Determine if the new bbox is satisfied in the new image.
Returns:
sample: the image, bounding box are replaced.
"""
assert 'image' in sample, "image data not found"
im = sample['image']
gt_bbox = sample['gt_bbox']
gt_class = sample['gt_class']
im_height, im_width = im.shape[:2]
gt_score = None
if 'gt_score' in sample:
gt_score = sample['gt_score']
sampled_bbox = []
gt_bbox = gt_bbox.tolist()
for sampler in self.batch_sampler:
found = 0
for i in range(sampler[1]):
if found >= sampler[0]:
break
sample_bbox = generate_sample_bbox(sampler)
if satisfy_sample_constraint(sampler, sample_bbox, gt_bbox,
self.satisfy_all):
sampled_bbox.append(sample_bbox)
found = found + 1
im = np.array(im)
while sampled_bbox:
idx = int(np.random.uniform(0, len(sampled_bbox)))
sample_bbox = sampled_bbox.pop(idx)
sample_bbox = clip_bbox(sample_bbox)
crop_bbox, crop_class, crop_score = \
filter_and_process(sample_bbox, gt_bbox, gt_class, scores=gt_score)
if self.avoid_no_bbox:
if len(crop_bbox) < 1:
continue
xmin = int(sample_bbox[0] * im_width)
xmax = int(sample_bbox[2] * im_width)
ymin = int(sample_bbox[1] * im_height)
ymax = int(sample_bbox[3] * im_height)
im = im[ymin:ymax, xmin:xmax]
sample['image'] = im
sample['gt_bbox'] = crop_bbox
sample['gt_class'] = crop_class
sample['gt_score'] = crop_score
return sample
return sample
@register_op
class CropWithDataAchorSampling(BaseOperator):
def __init__(self,
batch_sampler,
anchor_sampler=None,
target_size=None,
das_anchor_scales=[16, 32, 64, 128],
sampling_prob=0.5,
min_size=8.,
avoid_no_bbox=True):
"""
Args:
anchor_sampler (list): sets of parameters for data-anchor-sampling
based cropping.
batch_sampler (list): Multiple sets of different
parameters for cropping.
e.g.[[1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0]]
[[1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
[1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
[1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
[1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
[1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]]
[max sample, max trial, min scale, max scale,
min aspect ratio, max aspect ratio,
min overlap, max overlap, min coverage, max coverage]
target_size (int): target image size.
das_anchor_scales (list[float]): a list of anchor scales used in data
anchor sampling.
min_size (float): minimum size of sampled bbox.
avoid_no_bbox (bool): whether to avoid the
situation where the box does not appear.
"""
super(CropWithDataAchorSampling, self).__init__()
self.anchor_sampler = anchor_sampler
self.batch_sampler = batch_sampler
self.target_size = target_size
self.sampling_prob = sampling_prob
self.min_size = min_size
self.avoid_no_bbox = avoid_no_bbox
self.das_anchor_scales = np.array(das_anchor_scales)
def apply(self, sample, context):
"""
Crop the image and modify bounding box.
Steps:
1. Scale the image width and height.
2. Crop the image according to a random sample.
3. Rescale the bounding box.
4. Determine whether the new bbox satisfies the constraints in the new image.
Returns:
sample: the sample with image and bounding boxes replaced.
"""
assert 'image' in sample, "image data not found"
im = sample['image']
gt_bbox = sample['gt_bbox']
gt_class = sample['gt_class']
image_height, image_width = im.shape[:2]
gt_bbox[:, 0] /= image_width
gt_bbox[:, 1] /= image_height
gt_bbox[:, 2] /= image_width
gt_bbox[:, 3] /= image_height
gt_score = None
if 'gt_score' in sample:
gt_score = sample['gt_score']
sampled_bbox = []
gt_bbox = gt_bbox.tolist()
prob = np.random.uniform(0., 1.)
if prob > self.sampling_prob: # anchor sampling
assert self.anchor_sampler
for sampler in self.anchor_sampler:
found = 0
for i in range(sampler[1]):
if found >= sampler[0]:
break
sample_bbox = data_anchor_sampling(
gt_bbox, image_width, image_height,
self.das_anchor_scales, self.target_size)
if sample_bbox == 0:
break
if satisfy_sample_constraint_coverage(sampler, sample_bbox,
gt_bbox):
sampled_bbox.append(sample_bbox)
found = found + 1
im = np.array(im)
while sampled_bbox:
idx = int(np.random.uniform(0, len(sampled_bbox)))
sample_bbox = sampled_bbox.pop(idx)
if 'gt_keypoint' in sample.keys():
keypoints = (sample['gt_keypoint'],
sample['keypoint_ignore'])
crop_bbox, crop_class, crop_score, gt_keypoints = \
filter_and_process(sample_bbox, gt_bbox, gt_class,
scores=gt_score,
keypoints=keypoints)
else:
crop_bbox, crop_class, crop_score = filter_and_process(
sample_bbox, gt_bbox, gt_class, scores=gt_score)
crop_bbox, crop_class, crop_score = bbox_area_sampling(
crop_bbox, crop_class, crop_score, self.target_size,
self.min_size)
if self.avoid_no_bbox:
if len(crop_bbox) < 1:
continue
im = crop_image_sampling(im, sample_bbox, image_width,
image_height, self.target_size)
height, width = im.shape[:2]
crop_bbox[:, 0] *= width
crop_bbox[:, 1] *= height
crop_bbox[:, 2] *= width
crop_bbox[:, 3] *= height
sample['image'] = im
sample['gt_bbox'] = crop_bbox
sample['gt_class'] = crop_class
if 'gt_score' in sample:
sample['gt_score'] = crop_score
if 'gt_keypoint' in sample.keys():
sample['gt_keypoint'] = gt_keypoints[0]
sample['keypoint_ignore'] = gt_keypoints[1]
return sample
return sample
else:
for sampler in self.batch_sampler:
found = 0
for i in range(sampler[1]):
if found >= sampler[0]:
break
sample_bbox = generate_sample_bbox_square(
sampler, image_width, image_height)
if satisfy_sample_constraint_coverage(sampler, sample_bbox,
gt_bbox):
sampled_bbox.append(sample_bbox)
found = found + 1
im = np.array(im)
while sampled_bbox:
idx = int(np.random.uniform(0, len(sampled_bbox)))
sample_bbox = sampled_bbox.pop(idx)
sample_bbox = clip_bbox(sample_bbox)
if 'gt_keypoint' in sample.keys():
keypoints = (sample['gt_keypoint'],
sample['keypoint_ignore'])
crop_bbox, crop_class, crop_score, gt_keypoints = \
filter_and_process(sample_bbox, gt_bbox, gt_class,
scores=gt_score,
keypoints=keypoints)
else:
crop_bbox, crop_class, crop_score = filter_and_process(
sample_bbox, gt_bbox, gt_class, scores=gt_score)
# sampling bbox according the bbox area
crop_bbox, crop_class, crop_score = bbox_area_sampling(
crop_bbox, crop_class, crop_score, self.target_size,
self.min_size)
if self.avoid_no_bbox:
if len(crop_bbox) < 1:
continue
xmin = int(sample_bbox[0] * image_width)
xmax = int(sample_bbox[2] * image_width)
ymin = int(sample_bbox[1] * image_height)
ymax = int(sample_bbox[3] * image_height)
im = im[ymin:ymax, xmin:xmax]
height, width = im.shape[:2]
crop_bbox[:, 0] *= width
crop_bbox[:, 1] *= height
crop_bbox[:, 2] *= width
crop_bbox[:, 3] *= height
sample['image'] = im
sample['gt_bbox'] = crop_bbox
sample['gt_class'] = crop_class
if 'gt_score' in sample:
sample['gt_score'] = crop_score
if 'gt_keypoint' in sample.keys():
sample['gt_keypoint'] = gt_keypoints[0]
sample['keypoint_ignore'] = gt_keypoints[1]
return sample
return sample
@register_op
class RandomCrop(BaseOperator):
"""Random crop image and bboxes.
Args:
aspect_ratio (list): aspect ratio of cropped region.
in [min, max] format.
thresholds (list): iou thresholds for decide a valid bbox crop.
scaling (list): ratio between a cropped region and the original image.
in [min, max] format.
num_attempts (int): number of tries before giving up.
allow_no_crop (bool): allow return without actually cropping them.
cover_all_box (bool): ensure all bboxes are covered in the final crop.
is_mask_crop(bool): whether crop the segmentation.
"""
def __init__(self,
aspect_ratio=[.5, 2.],
thresholds=[.0, .1, .3, .5, .7, .9],
scaling=[.3, 1.],
num_attempts=50,
allow_no_crop=True,
cover_all_box=False,
is_mask_crop=False,
ioumode="iou",
prob=1.0):
super(RandomCrop, self).__init__()
self.aspect_ratio = aspect_ratio
self.thresholds = thresholds
self.scaling = scaling
self.num_attempts = num_attempts
self.allow_no_crop = allow_no_crop
self.cover_all_box = cover_all_box
self.is_mask_crop = is_mask_crop
self.ioumode = ioumode
self.prob = prob
def crop_segms(self, segms, valid_ids, crop, height, width):
def _crop_poly(segm, crop):
xmin, ymin, xmax, ymax = crop
crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin]
crop_p = np.array(crop_coord).reshape(4, 2)
crop_p = Polygon(crop_p)
crop_segm = list()
for poly in segm:
poly = np.array(poly).reshape(len(poly) // 2, 2)
polygon = Polygon(poly)
if not polygon.is_valid:
exterior = polygon.exterior
multi_lines = exterior.intersection(exterior)
polygons = shapely.ops.polygonize(multi_lines)
polygon = MultiPolygon(polygons)
multi_polygon = list()
if isinstance(polygon, MultiPolygon):
multi_polygon = copy.deepcopy(polygon)
else:
multi_polygon.append(copy.deepcopy(polygon))
for per_polygon in multi_polygon:
inter = per_polygon.intersection(crop_p)
if not inter:
continue
if isinstance(inter, (MultiPolygon, GeometryCollection)):
for part in inter:
if not isinstance(part, Polygon):
continue
part = np.squeeze(
np.array(part.exterior.coords[:-1]).reshape(1,
-1))
part[0::2] -= xmin
part[1::2] -= ymin
crop_segm.append(part.tolist())
elif isinstance(inter, Polygon):
crop_poly = np.squeeze(
np.array(inter.exterior.coords[:-1]).reshape(1, -1))
crop_poly[0::2] -= xmin
crop_poly[1::2] -= ymin
crop_segm.append(crop_poly.tolist())
else:
continue
return crop_segm
def _crop_rle(rle, crop, height, width):
if 'counts' in rle and type(rle['counts']) == list:
rle = mask_util.frPyObjects(rle, height, width)
mask = mask_util.decode(rle)
mask = mask[crop[1]:crop[3], crop[0]:crop[2]]
rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
return rle
crop_segms = []
for id in valid_ids:
segm = self.polygon_to_rle(segms[id], height, width)
if is_poly(segm):
import copy
import shapely.ops
from shapely.geometry import Polygon, MultiPolygon, GeometryCollection
logging.getLogger("shapely").setLevel(logging.WARNING)
# Polygon format
crop_segms.append(_crop_poly(segm, crop))
else:
# RLE format
import pycocotools.mask as mask_util
res = _crop_rle(segm, crop, height, width)
crop_segms.append(self.rle_to_polygon(res))
return crop_segms
def polygon_to_rle(self, polygons, height, width):
# Create an empty mask
mask_img = np.zeros((height, width), dtype=np.uint8)
# Fill the polygon in the mask
for polygon in polygons:
contour = np.array(polygon).reshape((-1, 1, 2)).astype(int)
cv2.drawContours(mask_img, [contour], 0, 255, -1)
# Convert binary mask to RLE
rle = mask.encode(np.asfortranarray(mask_img))
return rle
def rle_to_polygon(self, rle_mask, min_area=5):
binary_mask = mask.decode(rle_mask).squeeze()
# Find contours in the binary mask
contours, _ = cv2.findContours(
binary_mask.astype(np.uint8), cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE)
polygons = []
for contour in contours:
# Convert contour to polygon and filter small areas
if cv2.contourArea(contour) >= min_area:
# Flatten list and add to polygons
polygon = contour.flatten().tolist()
if len(polygon) > 4:
polygons.append(polygon)
return polygons
def set_fake_bboxes(self, sample):
sample['gt_bbox'] = np.array(
[
[32, 32, 128, 128],
[32, 32, 128, 256],
[32, 64, 128, 128],
[32, 64, 128, 256],
[64, 64, 128, 256],
[64, 64, 256, 256],
[64, 32, 128, 256],
[64, 32, 128, 256],
[96, 32, 128, 256],
[96, 32, 128, 256],
],
dtype=np.float32)
sample['gt_class'] = np.array(
[[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]], np.int32)
return sample
def apply(self, sample, context=None):
if random.random() > self.prob:
return sample
if 'gt_bbox' not in sample:
# only used in semi-det as unsup data
sample = self.set_fake_bboxes(sample)
sample = self.random_crop(sample, fake_bboxes=True)
del sample['gt_bbox']
del sample['gt_class']
return sample
if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0:
return sample
sample = self.random_crop(sample)
return sample
def random_crop(self, sample, fake_bboxes=False):
h, w = sample['image'].shape[:2]
gt_bbox = sample['gt_bbox']
# NOTE Original method attempts to generate one candidate for each
# threshold then randomly sample one from the resulting list.
# Here a short circuit approach is taken, i.e., randomly choose a
# threshold and attempt to find a valid crop, and simply return the
# first one found.
# The resulting probability is not exactly the same, somewhat resembling
# the "Monty Hall" problem. Actually carrying out the attempts would affect
# observability (just like opening doors in the "Monty Hall" game).
thresholds = list(self.thresholds)
if self.allow_no_crop:
thresholds.append('no_crop')
np.random.shuffle(thresholds)
for thresh in thresholds:
if thresh == 'no_crop':
return sample
found = False
for i in range(self.num_attempts):
scale = np.random.uniform(*self.scaling)
if self.aspect_ratio is not None:
min_ar, max_ar = self.aspect_ratio
aspect_ratio = np.random.uniform(
max(min_ar, scale**2), min(max_ar, scale**-2))
h_scale = scale / np.sqrt(aspect_ratio)
w_scale = scale * np.sqrt(aspect_ratio)
else:
h_scale = np.random.uniform(*self.scaling)
w_scale = np.random.uniform(*self.scaling)
crop_h = h * h_scale
crop_w = w * w_scale
if self.aspect_ratio is None:
if crop_h / crop_w < 0.5 or crop_h / crop_w > 2.0:
continue
crop_h = int(crop_h)
crop_w = int(crop_w)
crop_y = np.random.randint(0, h - crop_h)
crop_x = np.random.randint(0, w - crop_w)
crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h]
if self.ioumode == "iof":
iou = self._gtcropiou_matrix(
gt_bbox, np.array(
[crop_box], dtype=np.float32))
elif self.ioumode == "iou":
iou = self._iou_matrix(
gt_bbox, np.array(
[crop_box], dtype=np.float32))
if iou.max() < thresh:
continue
if self.cover_all_box and iou.min() < thresh:
continue
cropped_box, valid_ids = self._crop_box_with_center_constraint(
gt_bbox, np.array(
crop_box, dtype=np.float32))
if valid_ids.size > 0:
found = True
break
if found:
if self.is_mask_crop and 'gt_poly' in sample and len(sample[
'gt_poly']) > 0:
crop_polys = self.crop_segms(
sample['gt_poly'],
valid_ids,
np.array(
crop_box, dtype=np.int64),
h,
w)
if [] in crop_polys:
delete_id = list()
valid_polys = list()
for id, crop_poly in enumerate(crop_polys):
if crop_poly == []:
delete_id.append(id)
else:
valid_polys.append(crop_poly)
valid_ids = np.delete(valid_ids, delete_id)
if len(valid_polys) == 0:
return sample
sample['gt_poly'] = valid_polys
else:
sample['gt_poly'] = crop_polys
if 'gt_segm' in sample:
sample['gt_segm'] = self._crop_segm(sample['gt_segm'],
crop_box)
sample['gt_segm'] = np.take(
sample['gt_segm'], valid_ids, axis=0)
sample['image'] = self._crop_image(sample['image'], crop_box)
if fake_bboxes:
return sample
sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0)
sample['gt_class'] = np.take(
sample['gt_class'], valid_ids, axis=0)
if 'gt_score' in sample:
sample['gt_score'] = np.take(
sample['gt_score'], valid_ids, axis=0)
if 'is_crowd' in sample:
sample['is_crowd'] = np.take(
sample['is_crowd'], valid_ids, axis=0)
if 'difficult' in sample:
sample['difficult'] = np.take(
sample['difficult'], valid_ids, axis=0)
if 'gt_joints' in sample:
sample['gt_joints'] = self._crop_joints(sample['gt_joints'],
crop_box)
return sample
return sample
def _iou_matrix(self, a, b):
tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2])
br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2)
area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
area_o = (area_a[:, np.newaxis] + area_b - area_i)
return area_i / (area_o + 1e-10)
def _gtcropiou_matrix(self, a, b):
tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2])
br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2)
area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
area_o = (area_a[:, np.newaxis] + area_b - area_i)
return area_i / (area_a + 1e-10)
def _crop_box_with_center_constraint(self, box, crop):
cropped_box = box.copy()
cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2])
cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:])
cropped_box[:, :2] -= crop[:2]
cropped_box[:, 2:] -= crop[:2]
centers = (box[:, :2] + box[:, 2:]) / 2
valid = np.logical_and(crop[:2] <= centers,
centers < crop[2:]).all(axis=1)
valid = np.logical_and(
valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1))
return cropped_box, np.where(valid)[0]
def _crop_image(self, img, crop):
x1, y1, x2, y2 = crop
return img[y1:y2, x1:x2, :]
def _crop_segm(self, segm, crop):
x1, y1, x2, y2 = crop
return segm[:, y1:y2, x1:x2]
def _crop_joints(self, joints, crop):
x1, y1, x2, y2 = crop
joints[joints[..., 0] > x2, :] = 0
joints[joints[..., 1] > y2, :] = 0
joints[joints[..., 0] < x1, :] = 0
joints[joints[..., 1] < y1, :] = 0
joints[..., 0] -= x1
joints[..., 1] -= y1
return joints
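# Illustrative usage sketch (hypothetical `_demo_*` helper, not part of the
# pipeline): RandomCrop operates on pixel-coordinate xyxy boxes; with the
# default allow_no_crop=True, drawing the 'no_crop' threshold returns the
# sample unchanged.
def _demo_random_crop():
    sample = {
        'image': np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8),
        'gt_bbox': np.array([[100., 100., 300., 300.]], dtype=np.float32),
        'gt_class': np.array([[0]], dtype=np.int32),
    }
    out = RandomCrop(aspect_ratio=[.5, 2.], scaling=[.3, 1.])(sample)
    return out['image'].shape, out['gt_bbox']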
@register_op
class RandomScaledCrop(BaseOperator):
"""Resize image and bbox based on long side (with optional random scaling),
then crop or pad image to target size.
Args:
target_size (int|list): target size, "hw" format.
scale_range (list): random scale range.
interp (int): interpolation method, default to `cv2.INTER_LINEAR`.
fill_value (float|list|tuple): color value used to fill the canvas,
in RGB order.
"""
def __init__(self,
target_size=512,
scale_range=[.1, 2.],
interp=cv2.INTER_LINEAR,
fill_value=(123.675, 116.28, 103.53)):
super(RandomScaledCrop, self).__init__()
assert isinstance(target_size, (
Integral, Sequence)), "target_size must be Integer, List or Tuple"
if isinstance(target_size, Integral):
target_size = [target_size, ] * 2
self.target_size = target_size
self.scale_range = scale_range
self.interp = interp
assert isinstance(fill_value, (Number, Sequence)), \
"fill value must be either float or sequence"
if isinstance(fill_value, Number):
fill_value = (fill_value, ) * 3
if not isinstance(fill_value, tuple):
fill_value = tuple(fill_value)
self.fill_value = fill_value
def apply_image(self, img, output_size, offset_x, offset_y):
th, tw = self.target_size
rh, rw = output_size
img = cv2.resize(
img, (rw, rh), interpolation=self.interp).astype(np.float32)
canvas = np.ones([th, tw, 3], dtype=np.float32)
canvas *= np.array(self.fill_value, dtype=np.float32)
canvas[:min(th, rh), :min(tw, rw)] = \
img[offset_y:offset_y + th, offset_x:offset_x + tw]
return canvas
def apply_bbox(self, gt_bbox, gt_class, scale, offset_x, offset_y):
th, tw = self.target_size
shift_array = np.array(
[
offset_x,
offset_y,
] * 2, dtype=np.float32)
boxes = gt_bbox * scale - shift_array
boxes[:, 0::2] = np.clip(boxes[:, 0::2], 0, tw)
boxes[:, 1::2] = np.clip(boxes[:, 1::2], 0, th)
# filter boxes with no area
area = np.prod(boxes[..., 2:] - boxes[..., :2], axis=1)
valid = (area > 1.).nonzero()[0]
return boxes[valid], gt_class[valid], valid
def apply_segm(self, segms, output_size, offset_x, offset_y, valid=None):
th, tw = self.target_size
rh, rw = output_size
out_segms = []
for segm in segms:
segm = cv2.resize(segm, (rw, rh), interpolation=cv2.INTER_NEAREST)
segm = segm.astype(np.float32)
canvas = np.zeros([th, tw], dtype=segm.dtype)
canvas[:min(th, rh), :min(tw, rw)] = \
segm[offset_y:offset_y + th, offset_x:offset_x + tw]
out_segms.append(canvas)
out_segms = np.stack(out_segms)
return out_segms if valid is None else out_segms[valid]
def apply(self, sample, context=None):
img = sample['image']
h, w = img.shape[:2]
random_scale = np.random.uniform(*self.scale_range)
target_scale_size = [t * random_scale for t in self.target_size]
# Compute actual rescaling applied to image.
scale = min(target_scale_size[0] / h, target_scale_size[1] / w)
output_size = [int(round(h * scale)), int(round(w * scale))]
# get offset
offset_x = int(
max(0, np.random.uniform(0., output_size[1] - self.target_size[1])))
offset_y = int(
max(0, np.random.uniform(0., output_size[0] - self.target_size[0])))
# apply to image
sample['image'] = self.apply_image(img, output_size, offset_x, offset_y)
# apply to bbox
valid = None
if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
sample['gt_bbox'], sample['gt_class'], valid = self.apply_bbox(
sample['gt_bbox'], sample['gt_class'], scale, offset_x,
offset_y)
# apply to segm
if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
sample['gt_segm'] = self.apply_segm(sample['gt_segm'], output_size,
offset_x, offset_y, valid)
sample['im_shape'] = np.asarray(output_size, dtype=np.float32)
scale_factor = sample['scale_factor']
sample['scale_factor'] = np.asarray(
[scale_factor[0] * scale, scale_factor[1] * scale],
dtype=np.float32)
return sample
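# Illustrative usage sketch (hypothetical `_demo_*` helper, not part of the
# pipeline): with scale_range=[1., 1.] the long side is resized to fit
# target_size and the result lands on a fixed square canvas, so the output
# shape is deterministic.
def _demo_random_scaled_crop():
    sample = {
        'image': np.zeros((300, 400, 3), dtype=np.uint8),
        'gt_bbox': np.array([[20., 30., 200., 250.]], dtype=np.float32),
        'gt_class': np.array([[2]], dtype=np.int32),
        'scale_factor': np.array([1., 1.], dtype=np.float32),
    }
    out = RandomScaledCrop(target_size=512, scale_range=[1., 1.])(sample)
    return out['image'].shape  # (512, 512, 3)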
@register_op
class Cutmix(BaseOperator):
def __init__(self, alpha=1.5, beta=1.5):
"""
CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features, see https://arxiv.org/abs/1905.04899
Cutmix image and gt_bbox/gt_score
Args:
alpha (float): alpha parameter of the beta distribution
beta (float): beta parameter of the beta distribution
"""
super(Cutmix, self).__init__()
self.alpha = alpha
self.beta = beta
if self.alpha <= 0.0:
raise ValueError("alpha should be positive in {}".format(self))
if self.beta <= 0.0:
raise ValueError("beta should be positive in {}".format(self))
def apply_image(self, img1, img2, factor):
""" _rand_bbox """
h = max(img1.shape[0], img2.shape[0])
w = max(img1.shape[1], img2.shape[1])
cut_rat = np.sqrt(1. - factor)
cut_w = np.int32(w * cut_rat)
cut_h = np.int32(h * cut_rat)
# uniform
cx = np.random.randint(w)
cy = np.random.randint(h)
bbx1 = np.clip(cx - cut_w // 2, 0, w - 1)
bby1 = np.clip(cy - cut_h // 2, 0, h - 1)
bbx2 = np.clip(cx + cut_w // 2, 0, w - 1)
bby2 = np.clip(cy + cut_h // 2, 0, h - 1)
img_1_pad = np.zeros((h, w, img1.shape[2]), 'float32')
img_1_pad[:img1.shape[0], :img1.shape[1], :] = \
img1.astype('float32')
img_2_pad = np.zeros((h, w, img2.shape[2]), 'float32')
img_2_pad[:img2.shape[0], :img2.shape[1], :] = \
img2.astype('float32')
img_1_pad[bby1:bby2, bbx1:bbx2, :] = img_2_pad[bby1:bby2, bbx1:bbx2, :]
return img_1_pad
def __call__(self, sample, context=None):
if not isinstance(sample, Sequence):
return sample
assert len(sample) == 2, 'cutmix needs two samples'
factor = np.random.beta(self.alpha, self.beta)
factor = max(0.0, min(1.0, factor))
if factor >= 1.0:
return sample[0]
if factor <= 0.0:
return sample[1]
img1 = sample[0]['image']
img2 = sample[1]['image']
img = self.apply_image(img1, img2, factor)
gt_bbox1 = sample[0]['gt_bbox']
gt_bbox2 = sample[1]['gt_bbox']
gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0)
gt_class1 = sample[0]['gt_class']
gt_class2 = sample[1]['gt_class']
gt_class = np.concatenate((gt_class1, gt_class2), axis=0)
gt_score1 = np.ones_like(sample[0]['gt_class'])
gt_score2 = np.ones_like(sample[1]['gt_class'])
gt_score = np.concatenate(
(gt_score1 * factor, gt_score2 * (1. - factor)), axis=0)
result = copy.deepcopy(sample[0])
result['image'] = img
result['gt_bbox'] = gt_bbox
result['gt_score'] = gt_score
result['gt_class'] = gt_class
if 'is_crowd' in sample[0]:
is_crowd1 = sample[0]['is_crowd']
is_crowd2 = sample[1]['is_crowd']
is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0)
result['is_crowd'] = is_crowd
if 'difficult' in sample[0]:
is_difficult1 = sample[0]['difficult']
is_difficult2 = sample[1]['difficult']
is_difficult = np.concatenate(
(is_difficult1, is_difficult2), axis=0)
result['difficult'] = is_difficult
return result
@register_op
class Mixup(BaseOperator):
def __init__(self, alpha=1.5, beta=1.5):
""" Mixup image and gt_bbbox/gt_score
Args:
alpha (float): alpha parameter of beta distribute
beta (float): beta parameter of beta distribute
"""
super(Mixup, self).__init__()
self.alpha = alpha
self.beta = beta
if self.alpha <= 0.0:
raise ValueError("alpha should be positive in {}".format(self))
if self.beta <= 0.0:
raise ValueError("beta should be positive in {}".format(self))
def apply_image(self, img1, img2, factor):
h = max(img1.shape[0], img2.shape[0])
w = max(img1.shape[1], img2.shape[1])
img = np.zeros((h, w, img1.shape[2]), 'float32')
img[:img1.shape[0], :img1.shape[1], :] = \
img1.astype('float32') * factor
img[:img2.shape[0], :img2.shape[1], :] += \
img2.astype('float32') * (1.0 - factor)
return img.astype('uint8')
def __call__(self, sample, context=None):
if not isinstance(sample, Sequence):
return sample
assert len(sample) == 2, 'mixup needs two samples'
factor = np.random.beta(self.alpha, self.beta)
factor = max(0.0, min(1.0, factor))
if factor >= 1.0:
return sample[0]
if factor <= 0.0:
return sample[1]
im = self.apply_image(sample[0]['image'], sample[1]['image'], factor)
result = copy.deepcopy(sample[0])
result['image'] = im
# apply bbox and score
if 'gt_bbox' in sample[0]:
gt_bbox1 = sample[0]['gt_bbox']
gt_bbox2 = sample[1]['gt_bbox']
gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0)
result['gt_bbox'] = gt_bbox
if 'gt_class' in sample[0]:
gt_class1 = sample[0]['gt_class']
gt_class2 = sample[1]['gt_class']
gt_class = np.concatenate((gt_class1, gt_class2), axis=0)
result['gt_class'] = gt_class
gt_score1 = np.ones_like(sample[0]['gt_class'])
gt_score2 = np.ones_like(sample[1]['gt_class'])
gt_score = np.concatenate(
(gt_score1 * factor, gt_score2 * (1. - factor)), axis=0)
result['gt_score'] = gt_score.astype('float32')
if 'is_crowd' in sample[0]:
is_crowd1 = sample[0]['is_crowd']
is_crowd2 = sample[1]['is_crowd']
is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0)
result['is_crowd'] = is_crowd
if 'difficult' in sample[0]:
is_difficult1 = sample[0]['difficult']
is_difficult2 = sample[1]['difficult']
is_difficult = np.concatenate(
(is_difficult1, is_difficult2), axis=0)
result['difficult'] = is_difficult
if 'gt_ide' in sample[0]:
gt_ide1 = sample[0]['gt_ide']
gt_ide2 = sample[1]['gt_ide']
gt_ide = np.concatenate((gt_ide1, gt_ide2), axis=0)
result['gt_ide'] = gt_ide
return result
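# Illustrative usage sketch (hypothetical `_demo_*` helper, not part of the
# pipeline): both Mixup and Cutmix consume a pair of samples and emit a
# single sample whose gt_score carries the mixing factor for each source's
# boxes (factor and 1 - factor respectively).
def _demo_mix_ops():
    def _make(h, w):
        return {
            'image': np.full((h, w, 3), 128, dtype=np.uint8),
            'gt_bbox': np.array([[10., 10., 50., 50.]], dtype=np.float32),
            'gt_class': np.array([[0]], dtype=np.int32),
        }
    mixed = Mixup(alpha=1.5, beta=1.5)([_make(240, 320), _make(200, 300)])
    cut = Cutmix(alpha=1.5, beta=1.5)([_make(240, 320), _make(200, 300)])
    return mixed, cut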
@register_op
class NormalizeBox(BaseOperator):
"""Transform the bounding box's coornidates to [0,1]."""
def __init__(self, retain_origin_box=False):
super(NormalizeBox, self).__init__()
self.retain_origin_box = retain_origin_box
def apply(self, sample, context):
im = sample['image']
if 'gt_bbox' in sample.keys():
if self.retain_origin_box:
sample['origin_gt_bbox'] = sample['gt_bbox'].copy()
sample['origin_gt_class'] = sample['gt_class'].copy()
gt_bbox = sample['gt_bbox']
height, width, _ = im.shape
for i in range(gt_bbox.shape[0]):
gt_bbox[i][0] = gt_bbox[i][0] / width
gt_bbox[i][1] = gt_bbox[i][1] / height
gt_bbox[i][2] = gt_bbox[i][2] / width
gt_bbox[i][3] = gt_bbox[i][3] / height
sample['gt_bbox'] = gt_bbox
if 'gt_keypoint' in sample.keys():
gt_keypoint = sample['gt_keypoint']
for i in range(gt_keypoint.shape[1]):
if i % 2:
gt_keypoint[:, i] = gt_keypoint[:, i] / height
else:
gt_keypoint[:, i] = gt_keypoint[:, i] / width
sample['gt_keypoint'] = gt_keypoint
return sample
else:
return sample
@register_op
class BboxXYXY2XYWH(BaseOperator):
"""
Convert bbox XYXY format to XYWH format.
"""
def __init__(self):
super(BboxXYXY2XYWH, self).__init__()
def apply(self, sample, context=None):
if 'gt_bbox' in sample.keys():
bbox = sample['gt_bbox']
bbox[:, 2:4] = bbox[:, 2:4] - bbox[:, :2]
bbox[:, :2] = bbox[:, :2] + bbox[:, 2:4] / 2.
sample['gt_bbox'] = bbox
return sample
else:
return sample
@register_op
class PadBox(BaseOperator):
def __init__(self, num_max_boxes=50):
"""
Pad zeros to bboxes if number of bboxes is less than num_max_boxes.
Args:
num_max_boxes (int): the max number of bboxes
"""
self.num_max_boxes = num_max_boxes
super(PadBox, self).__init__()
def apply(self, sample, context=None):
assert 'gt_bbox' in sample
bbox = sample['gt_bbox']
gt_num = min(self.num_max_boxes, len(bbox))
num_max = self.num_max_boxes
# fields = context['fields'] if context else []
pad_bbox = np.zeros((num_max, 4), dtype=np.float32)
if gt_num > 0:
pad_bbox[:gt_num, :] = bbox[:gt_num, :]
sample['gt_bbox'] = pad_bbox
if 'gt_class' in sample:
pad_class = np.zeros((num_max, ), dtype=np.int32)
if gt_num > 0:
pad_class[:gt_num] = sample['gt_class'][:gt_num, 0]
sample['gt_class'] = pad_class
if 'gt_score' in sample:
pad_score = np.zeros((num_max, ), dtype=np.float32)
if gt_num > 0:
pad_score[:gt_num] = sample['gt_score'][:gt_num, 0]
sample['gt_score'] = pad_score
# in training, for example in op ExpandImage,
# the bbox and gt_class are expanded, but the difficult is not,
# so judge by its length
if 'difficult' in sample:
pad_diff = np.zeros((num_max, ), dtype=np.int32)
if gt_num > 0:
pad_diff[:gt_num] = sample['difficult'][:gt_num, 0]
sample['difficult'] = pad_diff
if 'is_crowd' in sample:
pad_crowd = np.zeros((num_max, ), dtype=np.int32)
if gt_num > 0:
pad_crowd[:gt_num] = sample['is_crowd'][:gt_num, 0]
sample['is_crowd'] = pad_crowd
if 'gt_ide' in sample:
pad_ide = np.zeros((num_max, ), dtype=np.int32)
if gt_num > 0:
pad_ide[:gt_num] = sample['gt_ide'][:gt_num, 0]
sample['gt_ide'] = pad_ide
return sample
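# Illustrative usage sketch (hypothetical `_demo_*` helper, not part of the
# pipeline): PadBox zero-pads per-image annotations to a fixed count and
# flattens the per-box class/score columns to 1-D.
def _demo_pad_box():
    sample = {
        'gt_bbox': np.array([[1., 2., 3., 4.]], dtype=np.float32),
        'gt_class': np.array([[7]], dtype=np.int32),
    }
    out = PadBox(num_max_boxes=4)(sample)
    return out['gt_bbox'].shape, out['gt_class'].shape  # (4, 4), (4,)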
@register_op
class DebugVisibleImage(BaseOperator):
"""
In debug mode, visualize images according to `gt_bbox`.
(Currently only supported when not cropping and flipping image.)
"""
def __init__(self, output_dir='output/debug', is_normalized=False):
super(DebugVisibleImage, self).__init__()
self.is_normalized = is_normalized
self.output_dir = output_dir
if not os.path.isdir(output_dir):
os.makedirs(output_dir)
if not isinstance(self.is_normalized, bool):
raise TypeError("{}: input type is invalid.".format(self))
def apply(self, sample, context=None):
image = Image.fromarray(sample['image'].astype(np.uint8))
out_file_name = '{:012d}.jpg'.format(sample['im_id'][0])
width = sample['w']
height = sample['h']
gt_bbox = sample['gt_bbox']
gt_class = sample['gt_class']
draw = ImageDraw.Draw(image)
for i in range(gt_bbox.shape[0]):
if self.is_normalized:
gt_bbox[i][0] = gt_bbox[i][0] * width
gt_bbox[i][1] = gt_bbox[i][1] * height
gt_bbox[i][2] = gt_bbox[i][2] * width
gt_bbox[i][3] = gt_bbox[i][3] * height
xmin, ymin, xmax, ymax = gt_bbox[i]
draw.line(
[(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin),
(xmin, ymin)],
width=2,
fill='green')
# draw label
text = str(gt_class[i][0])
tw, th = imagedraw_textsize_c(draw, text)
draw.rectangle(
[(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green')
draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))
if 'gt_keypoint' in sample.keys():
gt_keypoint = sample['gt_keypoint']
if self.is_normalized:
for i in range(gt_keypoint.shape[1]):
if i % 2:
gt_keypoint[:, i] = gt_keypoint[:, i] * height
else:
gt_keypoint[:, i] = gt_keypoint[:, i] * width
for i in range(gt_keypoint.shape[0]):
keypoint = gt_keypoint[i]
for j in range(int(keypoint.shape[0] / 2)):
x1 = round(keypoint[2 * j]).astype(np.int32)
y1 = round(keypoint[2 * j + 1]).astype(np.int32)
draw.ellipse(
(x1, y1, x1 + 5, y1 + 5), fill='green', outline='green')
save_path = os.path.join(self.output_dir, out_file_name)
image.save(save_path, quality=95)
return sample
@register_op
class Pad(BaseOperator):
def __init__(self,
size=None,
size_divisor=32,
pad_mode=0,
offsets=None,
fill_value=(127.5, 127.5, 127.5)):
"""
Pad image to a specified size or multiple of size_divisor.
Args:
size (int, Sequence): image target size, if None, pad to multiple of size_divisor, default None
size_divisor (int): size divisor, default 32
pad_mode (int): pad mode, currently only supports four modes [-1, 0, 1, 2]. if -1, use specified offsets
if 0, only pad to right and bottom. if 1, pad according to center. if 2, only pad left and top
offsets (list): [offset_x, offset_y], specifies the offset while padding, only supported when pad_mode=-1
fill_value (float|Sequence): RGB value of the padded area, default (127.5, 127.5, 127.5)
"""
super(Pad, self).__init__()
if size is not None and not isinstance(size, (int, Sequence)):
raise TypeError(
"Type of size is invalid. Must be int, List or Tuple, "
"now is {}".format(type(size)))
if isinstance(size, int):
size = [size, size]
assert pad_mode in [
-1, 0, 1, 2
], 'currently only supports four modes [-1, 0, 1, 2]'
if pad_mode == -1:
assert offsets, 'if pad_mode is -1, offsets should not be None'
self.size = size
self.size_divisor = size_divisor
self.pad_mode = pad_mode
self.fill_value = fill_value
self.offsets = offsets
def apply_segm(self, segms, offsets, im_size, size):
def _expand_poly(poly, x, y):
expanded_poly = np.array(poly)
expanded_poly[0::2] += x
expanded_poly[1::2] += y
return expanded_poly.tolist()
def _expand_rle(rle, x, y, height, width, h, w):
if 'counts' in rle and type(rle['counts']) == list:
rle = mask_util.frPyObjects(rle, height, width)
mask = mask_util.decode(rle)
expanded_mask = np.full((h, w), 0).astype(mask.dtype)
expanded_mask[y:y + height, x:x + width] = mask
rle = mask_util.encode(
np.array(
expanded_mask, order='F', dtype=np.uint8))
return rle
x, y = offsets
height, width = im_size
h, w = size
expanded_segms = []
for segm in segms:
if is_poly(segm):
# Polygon format
expanded_segms.append(
[_expand_poly(poly, x, y) for poly in segm])
else:
# RLE format
import pycocotools.mask as mask_util
expanded_segms.append(
_expand_rle(segm, x, y, height, width, h, w))
return expanded_segms
def apply_bbox(self, bbox, offsets):
return bbox + np.array(offsets * 2, dtype=np.float32)
def apply_keypoint(self, keypoints, offsets):
n = len(keypoints[0]) // 2
return keypoints + np.array(offsets * n, dtype=np.float32)
def apply_image(self, image, offsets, im_size, size):
x, y = offsets
im_h, im_w = im_size
h, w = size
canvas = np.ones((h, w, 3), dtype=np.float32)
canvas *= np.array(self.fill_value, dtype=np.float32)
canvas[y:y + im_h, x:x + im_w, :] = image.astype(np.float32)
return canvas
def apply(self, sample, context=None):
im = sample['image']
im_h, im_w = im.shape[:2]
if self.size:
h, w = self.size
assert (
im_h <= h and im_w <= w
), '(h, w) of target size should be greater than (im_h, im_w)'
else:
h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor)
w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor)
if h == im_h and w == im_w:
sample['image'] = im.astype(np.float32)
return sample
if self.pad_mode == -1:
offset_x, offset_y = self.offsets
elif self.pad_mode == 0:
offset_y, offset_x = 0, 0
elif self.pad_mode == 1:
offset_y, offset_x = (h - im_h) // 2, (w - im_w) // 2
else:
offset_y, offset_x = h - im_h, w - im_w
offsets, im_size, size = [offset_x, offset_y], [im_h, im_w], [h, w]
sample['image'] = self.apply_image(im, offsets, im_size, size)
if self.pad_mode == 0:
return sample
if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], offsets)
if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
sample['gt_poly'] = self.apply_segm(sample['gt_poly'], offsets,
im_size, size)
if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0:
sample['gt_keypoint'] = self.apply_keypoint(sample['gt_keypoint'],
offsets)
if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
masks = [
cv2.copyMakeBorder(
gt_segm,
offset_y, h - (offset_y + im_h),
offset_x, w - (offset_x + im_w),
borderType=cv2.BORDER_CONSTANT,
value=0)
for gt_segm in sample['gt_segm']
]
sample['gt_segm'] = np.asarray(masks, dtype=np.uint8)
return sample
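# Illustrative usage sketch (hypothetical `_demo_*` helper, not part of the
# pipeline): pad_mode=1 centers the image on the target canvas, so boxes are
# shifted by the computed (offset_x, offset_y) = (20, 14) here.
def _demo_pad_center():
    sample = {
        'image': np.zeros((100, 120, 3), dtype=np.uint8),
        'gt_bbox': np.array([[5., 5., 60., 80.]], dtype=np.float32),
    }
    out = Pad(size=[128, 160], pad_mode=1)(sample)
    return out['image'].shape, out['gt_bbox']  # (128, 160, 3), shifted boxes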
@register_op
class Poly2Mask(BaseOperator):
"""
gt poly to mask annotations.
Args:
del_poly (bool): Whether to delete poly after generating mask. Default: False.
"""
def __init__(self, del_poly=False):
super(Poly2Mask, self).__init__()
import pycocotools.mask as maskUtils
self.maskutils = maskUtils
self.del_poly = del_poly
def _poly2mask(self, mask_ann, img_h, img_w):
if isinstance(mask_ann, list):
# polygon -- a single object might consist of multiple parts
# we merge all parts into one mask rle code
rles = self.maskutils.frPyObjects(mask_ann, img_h, img_w)
rle = self.maskutils.merge(rles)
elif isinstance(mask_ann['counts'], list):
# uncompressed RLE
rle = self.maskutils.frPyObjects(mask_ann, img_h, img_w)
else:
# rle
rle = mask_ann
mask = self.maskutils.decode(rle)
return mask
def apply(self, sample, context=None):
assert 'gt_poly' in sample
im_h, im_w = sample['im_shape']
masks = [
self._poly2mask(gt_poly, im_h, im_w)
for gt_poly in sample['gt_poly']
]
sample['gt_segm'] = np.asarray(masks).astype(np.uint8)
if self.del_poly:
del (sample['gt_poly'])
return sample
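# Illustrative usage sketch (hypothetical `_demo_*` helper, not part of the
# pipeline; requires pycocotools): each gt_poly entry is a list of flattened
# polygons for one object, rasterized into a (num_objects, im_h, im_w) uint8
# mask stack.
def _demo_poly2mask():
    sample = {
        'im_shape': np.array([120, 160]),
        'gt_poly': [[[10., 10., 100., 10., 100., 80., 10., 80.]]],
    }
    out = Poly2Mask()(sample)
    return out['gt_segm'].shape  # (1, 120, 160)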
@register_op
class AugmentHSV(BaseOperator):
"""
Augment the SV channel of image data.
Args:
fraction (float): the jitter fraction for the S and V channels. Default: 0.5.
is_bgr (bool): whether the image is BGR mode. Default: True.
hgain (float): H channel gains
sgain (float): S channel gains
vgain (float): V channel gains
"""
def __init__(self,
fraction=0.50,
is_bgr=True,
hgain=None,
sgain=None,
vgain=None):
super(AugmentHSV, self).__init__()
self.fraction = fraction
self.is_bgr = is_bgr
self.hgain = hgain
self.sgain = sgain
self.vgain = vgain
self.use_hsvgain = False if hgain is None else True
def apply(self, sample, context=None):
img = sample['image']
if self.is_bgr:
img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
else:
img_hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
if self.use_hsvgain:
hsv_augs = np.random.uniform(
-1, 1, 3) * [self.hgain, self.sgain, self.vgain]
# random selection of h, s, v
hsv_augs *= np.random.randint(0, 2, 3)
img_hsv[..., 0] = (img_hsv[..., 0] + hsv_augs[0]) % 180
img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_augs[1], 0, 255)
img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_augs[2], 0, 255)
else:
S = img_hsv[:, :, 1].astype(np.float32)
V = img_hsv[:, :, 2].astype(np.float32)
a = (random.random() * 2 - 1) * self.fraction + 1
S *= a
if a > 1:
np.clip(S, a_min=0, a_max=255, out=S)
a = (random.random() * 2 - 1) * self.fraction + 1
V *= a
if a > 1:
np.clip(V, a_min=0, a_max=255, out=V)
img_hsv[:, :, 1] = S.astype(np.uint8)
img_hsv[:, :, 2] = V.astype(np.uint8)
if self.is_bgr:
cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img)
else:
cv2.cvtColor(img_hsv, cv2.COLOR_HSV2RGB, dst=img)
sample['image'] = img.astype(np.float32)
return sample
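# Illustrative usage sketch (hypothetical `_demo_*` helper, not part of the
# pipeline): without explicit h/s/v gains the op jitters only the S and V
# channels by a random factor within +/- fraction; the output image is cast
# to float32.
def _demo_augment_hsv():
    sample = {'image': np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8)}
    out = AugmentHSV(fraction=0.5, is_bgr=True)(sample)
    return out['image'].dtype  # float32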
@register_op
class Norm2PixelBbox(BaseOperator):
"""
Transform the bounding box's coordinates from the normalized [0, 1] range to pixels.
"""
def __init__(self):
super(Norm2PixelBbox, self).__init__()
def apply(self, sample, context=None):
assert 'gt_bbox' in sample
bbox = sample['gt_bbox']
height, width = sample['image'].shape[:2]
bbox[:, 0::2] = bbox[:, 0::2] * width
bbox[:, 1::2] = bbox[:, 1::2] * height
sample['gt_bbox'] = bbox
return sample
@register_op
class BboxCXCYWH2XYXY(BaseOperator):
"""
Convert bbox CXCYWH format to XYXY format.
[center_x, center_y, width, height] -> [x0, y0, x1, y1]
"""
def __init__(self):
super(BboxCXCYWH2XYXY, self).__init__()
def apply(self, sample, context=None):
assert 'gt_bbox' in sample
bbox0 = sample['gt_bbox']
bbox = bbox0.copy()
bbox[:, :2] = bbox0[:, :2] - bbox0[:, 2:4] / 2.
bbox[:, 2:4] = bbox0[:, :2] + bbox0[:, 2:4] / 2.
sample['gt_bbox'] = bbox
return sample
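# Illustrative round trip (hypothetical `_demo_*` helper, not part of the
# pipeline): BboxXYXY2XYWH emits center-based boxes, which BboxCXCYWH2XYXY
# inverts exactly.
def _demo_bbox_format_roundtrip():
    sample = {'gt_bbox': np.array([[10., 20., 50., 60.]], dtype=np.float32)}
    sample = BboxXYXY2XYWH()(sample)    # -> [[30., 40., 40., 40.]]
    sample = BboxCXCYWH2XYXY()(sample)  # -> [[10., 20., 50., 60.]]
    return sample['gt_bbox']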
@register_op
class RandomResizeCrop(BaseOperator):
"""Random resize and crop image and bboxes.
Args:
resizes (list): resize image to one of resizes. If keep_ratio is True and mode is
'long', resize the image's long side to the maximum of the chosen size; if keep_ratio is
True and mode is 'short', resize the image's short side to the minimum of the chosen size.
cropsizes (list): crop sizes after resize, [(min_crop_1, max_crop_1), ...]
mode (str): resize mode, `long` or `short`. Details see resizes.
prob (float): probability of this op.
keep_ratio (bool): whether to keep the aspect ratio, default True
interp (int): the interpolation method
thresholds (list): iou thresholds for decide a valid bbox crop.
num_attempts (int): number of tries before giving up.
allow_no_crop (bool): allow return without actually cropping them.
cover_all_box (bool): ensure all bboxes are covered in the final crop.
is_mask_crop(bool): whether crop the segmentation.
"""
def __init__(self,
resizes,
cropsizes,
prob=0.5,
mode='short',
keep_ratio=True,
interp=cv2.INTER_LINEAR,
num_attempts=3,
cover_all_box=False,
allow_no_crop=False,
thresholds=[0.3, 0.5, 0.7],
is_mask_crop=False,
ioumode="iou"):
super(RandomResizeCrop, self).__init__()
self.resizes = resizes
self.cropsizes = cropsizes
self.prob = prob
self.mode = mode
self.ioumode = ioumode
self.resizer = Resize(0, keep_ratio=keep_ratio, interp=interp)
self.croper = RandomCrop(
num_attempts=num_attempts,
cover_all_box=cover_all_box,
thresholds=thresholds,
allow_no_crop=allow_no_crop,
is_mask_crop=is_mask_crop)
def _format_size(self, size):
if isinstance(size, Integral):
size = (size, size)
return size
def apply(self, sample, context=None):
if random.random() < self.prob:
_resize = self._format_size(random.choice(self.resizes))
_cropsize = self._format_size(random.choice(self.cropsizes))
sample = self._resize(
self.resizer,
sample,
size=_resize,
mode=self.mode,
context=context)
sample = self._random_crop(
self.croper, sample, size=_cropsize, context=context)
return sample
@staticmethod
def _random_crop(croper, sample, size, context=None):
if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0:
return sample
self = croper
h, w = sample['image'].shape[:2]
gt_bbox = sample['gt_bbox']
cropsize = size
min_crop = min(cropsize)
max_crop = max(cropsize)
thresholds = list(self.thresholds)
np.random.shuffle(thresholds)
for thresh in thresholds:
found = False
for _ in range(self.num_attempts):
crop_h = random.randint(min_crop, min(h, max_crop))
crop_w = random.randint(min_crop, min(w, max_crop))
crop_y = random.randint(0, h - crop_h)
crop_x = random.randint(0, w - crop_w)
crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h]
if self.ioumode == "iof":
iou = self._gtcropiou_matrix(
gt_bbox, np.array(
[crop_box], dtype=np.float32))
elif self.ioumode == "iou":
iou = self._iou_matrix(
gt_bbox, np.array(
[crop_box], dtype=np.float32))
if iou.max() < thresh:
continue
if self.cover_all_box and iou.min() < thresh:
continue
cropped_box, valid_ids = self._crop_box_with_center_constraint(
gt_bbox, np.array(
crop_box, dtype=np.float32))
if valid_ids.size > 0:
found = True
break
if found:
if self.is_mask_crop and 'gt_poly' in sample and len(sample[
'gt_poly']) > 0:
crop_polys = self.crop_segms(
sample['gt_poly'],
valid_ids,
np.array(
crop_box, dtype=np.int64),
h,
w)
if [] in crop_polys:
delete_id = list()
valid_polys = list()
for id, crop_poly in enumerate(crop_polys):
if crop_poly == []:
delete_id.append(id)
else:
valid_polys.append(crop_poly)
valid_ids = np.delete(valid_ids, delete_id)
if len(valid_polys) == 0:
return sample
sample['gt_poly'] = valid_polys
else:
sample['gt_poly'] = crop_polys
if 'gt_segm' in sample:
sample['gt_segm'] = self._crop_segm(sample['gt_segm'],
crop_box)
sample['gt_segm'] = np.take(
sample['gt_segm'], valid_ids, axis=0)
sample['image'] = self._crop_image(sample['image'], crop_box)
sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0)
sample['gt_class'] = np.take(
sample['gt_class'], valid_ids, axis=0)
if 'gt_score' in sample:
sample['gt_score'] = np.take(
sample['gt_score'], valid_ids, axis=0)
if 'is_crowd' in sample:
sample['is_crowd'] = np.take(
sample['is_crowd'], valid_ids, axis=0)
if 'gt_areas' in sample:
sample['gt_areas'] = np.take(
sample['gt_areas'], valid_ids, axis=0)
if 'gt_joints' in sample:
gt_joints = self._crop_joints(sample['gt_joints'], crop_box)
sample['gt_joints'] = gt_joints[valid_ids]
return sample
return sample
@staticmethod
def _resize(resizer, sample, size, mode='short', context=None):
self = resizer
im = sample['image']
target_size = size
if not isinstance(im, np.ndarray):
raise TypeError("{}: image type is not numpy.".format(self))
if len(im.shape) != 3:
raise ImageError('{}: image is not 3-dimensional.'.format(self))
# apply image
im_shape = im.shape
if self.keep_ratio:
im_size_min = np.min(im_shape[0:2])
im_size_max = np.max(im_shape[0:2])
target_size_min = np.min(target_size)
target_size_max = np.max(target_size)
if mode == 'long':
im_scale = min(target_size_min / im_size_min,
target_size_max / im_size_max)
else:
im_scale = max(target_size_min / im_size_min,
target_size_max / im_size_max)
resize_h = int(im_scale * float(im_shape[0]) + 0.5)
resize_w = int(im_scale * float(im_shape[1]) + 0.5)
im_scale_x = im_scale
im_scale_y = im_scale
else:
resize_h, resize_w = target_size
im_scale_y = resize_h / im_shape[0]
im_scale_x = resize_w / im_shape[1]
im = self.apply_image(sample['image'], [im_scale_x, im_scale_y])
sample['image'] = im
sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)
if 'scale_factor' in sample:
scale_factor = sample['scale_factor']
sample['scale_factor'] = np.asarray(
[scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
dtype=np.float32)
else:
sample['scale_factor'] = np.asarray(
[im_scale_y, im_scale_x], dtype=np.float32)
# apply bbox
if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'],
[im_scale_x, im_scale_y],
[resize_w, resize_h])
# apply polygon
if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2],
[im_scale_x, im_scale_y])
# apply semantic
if 'semantic' in sample and sample['semantic'] is not None:
semantic = sample['semantic']
semantic = cv2.resize(
semantic.astype('float32'),
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=self.interp)
semantic = np.asarray(semantic).astype('int32')
semantic = np.expand_dims(semantic, 0)
sample['semantic'] = semantic
# apply gt_segm
if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
masks = [
cv2.resize(
gt_segm,
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=cv2.INTER_NEAREST)
for gt_segm in sample['gt_segm']
]
sample['gt_segm'] = np.asarray(masks).astype(np.uint8)
if 'gt_joints' in sample:
sample['gt_joints'] = self.apply_joints(sample['gt_joints'],
[im_scale_x, im_scale_y],
[resize_w, resize_h])
return sample
@register_op
class RandomSelect(BaseOperator):
"""
Randomly choose a transformation between transforms1 and transforms2,
and the probability of choosing transforms1 is p.
The code is based on https://github.com/facebookresearch/detr/blob/main/datasets/transforms.py
"""
def __init__(self, transforms1, transforms2, p=0.5):
super(RandomSelect, self).__init__()
self.transforms1 = Compose(transforms1)
self.transforms2 = Compose(transforms2)
self.p = p
def apply(self, sample, context=None):
if random.random() < self.p:
return self.transforms1(sample)
return self.transforms2(sample)
@register_op
class RandomSelects(BaseOperator):
"""
Randomly choose one transformation from transforms_list.
If p is given, it is treated as a list of cumulative probability
thresholds (ascending, with the last entry 1.0 so that a transform
is always chosen).
The code is based on https://github.com/facebookresearch/detr/blob/main/datasets/transforms.py
"""
def __init__(self, transforms_list, p=None):
super(RandomSelects, self).__init__()
if p is not None:
assert isinstance(p, (list, tuple))
assert len(transforms_list) == len(p)
else:
assert len(transforms_list) > 0
self.transforms = [Compose(t) for t in transforms_list]
self.p = p
def apply(self, sample, context=None):
if self.p is None:
return random.choice(self.transforms)(sample)
else:
prob = random.random()
for p, t in zip(self.p, self.transforms):
if prob <= p:
return t(sample)
@register_op
class RandomShortSideResize(BaseOperator):
def __init__(self,
short_side_sizes,
max_size=None,
interp=cv2.INTER_LINEAR,
random_interp=False):
"""
Resize the image randomly according to the short side. If max_size is not None,
the long side is capped at max_size. The whole process keeps the aspect ratio.
Args:
short_side_sizes (list|tuple): Image target short side size.
max_size (int): The size of the longest side of image after resize.
interp (int): The interpolation method.
random_interp (bool): Whether to randomly select the interpolation method.
"""
super(RandomShortSideResize, self).__init__()
assert isinstance(short_side_sizes,
Sequence), "short_side_sizes must be List or Tuple"
self.short_side_sizes = short_side_sizes
self.max_size = max_size
self.interp = interp
self.random_interp = random_interp
self.interps = [
cv2.INTER_NEAREST,
cv2.INTER_LINEAR,
cv2.INTER_AREA,
cv2.INTER_CUBIC,
cv2.INTER_LANCZOS4,
]
def get_size_with_aspect_ratio(self, image_shape, size, max_size=None):
h, w = image_shape
max_clip = False
if max_size is not None:
min_original_size = float(min((w, h)))
max_original_size = float(max((w, h)))
if max_original_size / min_original_size * size > max_size:
size = int(max_size * min_original_size / max_original_size)
max_clip = True
if (w <= h and w == size) or (h <= w and h == size):
return (w, h)
if w < h:
ow = size
oh = int(round(size * h / w)) if not max_clip else max_size
else:
oh = size
ow = int(round(size * w / h)) if not max_clip else max_size
return (ow, oh)
def resize(self,
sample,
target_size,
max_size=None,
interp=cv2.INTER_LINEAR):
im = sample['image']
if not isinstance(im, np.ndarray):
raise TypeError("{}: image type is not numpy.".format(self))
if len(im.shape) != 3:
raise ImageError('{}: image is not 3-dimensional.'.format(self))
target_size = self.get_size_with_aspect_ratio(im.shape[:2], target_size,
max_size)
im_scale_y, im_scale_x = target_size[1] / im.shape[0], target_size[
0] / im.shape[1]
sample['image'] = cv2.resize(im, target_size, interpolation=interp)
sample['im_shape'] = np.asarray(target_size[::-1], dtype=np.float32)
if 'scale_factor' in sample:
scale_factor = sample['scale_factor']
sample['scale_factor'] = np.asarray(
[scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
dtype=np.float32)
else:
sample['scale_factor'] = np.asarray(
[im_scale_y, im_scale_x], dtype=np.float32)
# apply bbox
if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
sample['gt_bbox'] = self.apply_bbox(
sample['gt_bbox'], [im_scale_x, im_scale_y], target_size)
# apply polygon
if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im.shape[:2],
[im_scale_x, im_scale_y])
# apply semantic
if 'semantic' in sample and sample['semantic'] is not None:
semantic = sample['semantic']
semantic = cv2.resize(
semantic.astype('float32'),
target_size,
interpolation=self.interp)
semantic = np.asarray(semantic).astype('int32')
semantic = np.expand_dims(semantic, 0)
sample['semantic'] = semantic
# apply gt_segm
if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
masks = [
cv2.resize(
gt_segm, target_size, interpolation=cv2.INTER_NEAREST)
for gt_segm in sample['gt_segm']
]
sample['gt_segm'] = np.asarray(masks).astype(np.uint8)
if 'gt_joints' in sample:
sample['gt_joints'] = self.apply_joints(
sample['gt_joints'], [im_scale_x, im_scale_y], target_size)
# apply areas
if 'gt_areas' in sample:
sample['gt_areas'] = self.apply_area(sample['gt_areas'],
[im_scale_x, im_scale_y])
return sample
def apply_bbox(self, bbox, scale, size):
im_scale_x, im_scale_y = scale
resize_w, resize_h = size
bbox[:, 0::2] *= im_scale_x
bbox[:, 1::2] *= im_scale_y
bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w)
bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h)
return bbox.astype('float32')
def apply_joints(self, joints, scale, size):
im_scale_x, im_scale_y = scale
resize_w, resize_h = size
joints[..., 0] *= im_scale_x
joints[..., 1] *= im_scale_y
# joints[joints[..., 0] >= resize_w, :] = 0
# joints[joints[..., 1] >= resize_h, :] = 0
# joints[joints[..., 0] < 0, :] = 0
# joints[joints[..., 1] < 0, :] = 0
joints[..., 0] = np.clip(joints[..., 0], 0, resize_w)
joints[..., 1] = np.clip(joints[..., 1], 0, resize_h)
return joints
def apply_area(self, area, scale):
im_scale_x, im_scale_y = scale
return area * im_scale_x * im_scale_y
def apply_segm(self, segms, im_size, scale):
def _resize_poly(poly, im_scale_x, im_scale_y):
resized_poly = np.array(poly).astype('float32')
resized_poly[0::2] *= im_scale_x
resized_poly[1::2] *= im_scale_y
return resized_poly.tolist()
def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y):
if 'counts' in rle and type(rle['counts']) == list:
rle = mask_util.frPyObjects(rle, im_h, im_w)
mask = mask_util.decode(rle)
mask = cv2.resize(
mask,
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=self.interp)
rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
return rle
im_h, im_w = im_size
im_scale_x, im_scale_y = scale
resized_segms = []
for segm in segms:
if is_poly(segm):
# Polygon format
resized_segms.append([
_resize_poly(poly, im_scale_x, im_scale_y) for poly in segm
])
else:
# RLE format
import pycocotools.mask as mask_util
resized_segms.append(
_resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y))
return resized_segms
def apply(self, sample, context=None):
target_size = random.choice(self.short_side_sizes)
interp = random.choice(
self.interps) if self.random_interp else self.interp
return self.resize(sample, target_size, self.max_size, interp)
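# Illustrative sizing check (hypothetical `_demo_*` helper, not part of the
# pipeline): for a 600x400 (h, w) image with target short side 480 and
# max_size 800, the scaled long side round(480 * 600 / 400) = 720 fits under
# max_size, so no clipping occurs.
def _demo_short_side_resize_size():
    op = RandomShortSideResize(short_side_sizes=[480, 512], max_size=800)
    return op.get_size_with_aspect_ratio((600, 400), 480, 800)  # (480, 720)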
@register_op
class RandomShortSideRangeResize(RandomShortSideResize):
def __init__(self, scales, interp=cv2.INTER_LINEAR, random_interp=False):
"""
Resize the image randomly: the short side is sampled from the range of the
pairwise minima of `scales` and the long side (used as max_size) from the
range of the pairwise maxima. The whole process keeps the aspect ratio.
Args:
scales (list[tuple]): candidate size pairs to sample from.
interp (int): The interpolation method.
random_interp (bool): Whether to randomly select the interpolation method.
"""
super(RandomShortSideRangeResize, self).__init__(scales, None, interp,
random_interp)
assert isinstance(scales,
Sequence), "scales must be List or Tuple"
self.scales = scales
def random_sample(self, img_scales):
img_scale_long = [max(s) for s in img_scales]
img_scale_short = [min(s) for s in img_scales]
long_edge = np.random.randint(
min(img_scale_long), max(img_scale_long) + 1)
short_edge = np.random.randint(
min(img_scale_short), max(img_scale_short) + 1)
img_scale = (long_edge, short_edge)
return img_scale
def apply(self, sample, context=None):
long_edge, short_edge = self.random_sample(self.short_side_sizes)
# print("target size:{}".format((long_edge, short_edge)))
interp = random.choice(
self.interps) if self.random_interp else self.interp
return self.resize(sample, short_edge, long_edge, interp)
@register_op
class RandomSizeCrop(BaseOperator):
"""
Cut the image randomly according to `min_size` and `max_size`
Args:
min_size (int): Min size for edges of cropped image.
max_size (int): Max size for edges of cropped image. If it
is set larger than the length of the input image,
the output keeps the original length.
keep_empty (bool): Whether to keep the cropped result with no object.
If it is set to False, the no-object result will not
be returned, replaced by the original input.
"""
def __init__(self, min_size, max_size, keep_empty=True):
super(RandomSizeCrop, self).__init__()
self.min_size = min_size
self.max_size = max_size
self.keep_empty = keep_empty
from paddle.vision.transforms.functional import crop as paddle_crop
self.paddle_crop = paddle_crop
@staticmethod
def get_crop_params(img_shape, output_size):
"""Get parameters for ``crop`` for a random crop.
Args:
img_shape (list|tuple): Image's height and width.
output_size (list|tuple): Expected output size of the crop.
Returns:
tuple: params (i, j, h, w) to be passed to ``crop`` for random crop.
"""
h, w = img_shape
th, tw = output_size
if h + 1 < th or w + 1 < tw:
raise ValueError(
"Required crop size {} is larger then input image size {}".
format((th, tw), (h, w)))
if w == tw and h == th:
return 0, 0, h, w
# random.randint is inclusive of both endpoints, so the upper bound is
# h - th; the original h - th + 1 could overshoot the image by one pixel.
i = random.randint(0, max(0, h - th))
j = random.randint(0, max(0, w - tw))
return i, j, th, tw
def crop(self, sample, region):
keep_index = None
# apply bbox and check whether the cropped result is valid
if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
croped_bbox = self.apply_bbox(sample['gt_bbox'], region)
bbox = croped_bbox.reshape([-1, 2, 2])
area = (bbox[:, 1, :] - bbox[:, 0, :]).prod(axis=1)
keep_index = np.where(area > 0)[0]
if not self.keep_empty and len(keep_index) == 0:
# When keep_empty is set to False, cropped with no-object will
# not be used and return the origin content.
return sample
sample['gt_bbox'] = croped_bbox[keep_index] if len(
keep_index) > 0 else np.zeros(
[0, 4], dtype=np.float32)
sample['gt_class'] = sample['gt_class'][keep_index] if len(
keep_index) > 0 else np.zeros(
[0, 1], dtype=np.float32)
if 'gt_score' in sample:
sample['gt_score'] = sample['gt_score'][keep_index] if len(
keep_index) > 0 else np.zeros(
[0, 1], dtype=np.float32)
if 'is_crowd' in sample:
sample['is_crowd'] = sample['is_crowd'][keep_index] if len(
keep_index) > 0 else np.zeros(
[0, 1], dtype=np.float32)
if 'gt_areas' in sample:
sample['gt_areas'] = np.take(
sample['gt_areas'], keep_index, axis=0)
image_shape = sample['image'].shape[:2]
sample['image'] = self.paddle_crop(sample['image'], *region)
sample['im_shape'] = np.array(
sample['image'].shape[:2], dtype=np.float32)
# apply polygon
if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
sample['gt_poly'] = self.apply_segm(sample['gt_poly'], region,
image_shape)
sample['gt_poly'] = np.array(sample['gt_poly'])
if keep_index is not None and len(keep_index) > 0:
sample['gt_poly'] = sample['gt_poly'][keep_index]
sample['gt_poly'] = sample['gt_poly'].tolist()
# apply gt_segm
if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
i, j, h, w = region
sample['gt_segm'] = sample['gt_segm'][:, i:i + h, j:j + w]
if keep_index is not None and len(keep_index) > 0:
sample['gt_segm'] = sample['gt_segm'][keep_index]
if 'gt_joints' in sample:
gt_joints = self._crop_joints(sample['gt_joints'], region)
sample['gt_joints'] = gt_joints
if keep_index is not None:
sample['gt_joints'] = sample['gt_joints'][keep_index]
return sample
def apply_bbox(self, bbox, region):
i, j, h, w = region
region_size = np.asarray([w, h])
crop_bbox = bbox - np.asarray([j, i, j, i])
crop_bbox = np.minimum(crop_bbox.reshape([-1, 2, 2]), region_size)
crop_bbox = crop_bbox.clip(min=0)
return crop_bbox.reshape([-1, 4]).astype('float32')
def _crop_joints(self, joints, region):
y1, x1, h, w = region
x2 = x1 + w
y2 = y1 + h
# x1, y1, x2, y2 = crop
joints[..., 0] -= x1
joints[..., 1] -= y1
joints[joints[..., 0] > w, :] = 0
joints[joints[..., 1] > h, :] = 0
joints[joints[..., 0] < 0, :] = 0
joints[joints[..., 1] < 0, :] = 0
return joints
def apply_segm(self, segms, region, image_shape):
def _crop_poly(segm, crop):
xmin, ymin, xmax, ymax = crop
crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin]
crop_p = np.array(crop_coord).reshape(4, 2)
crop_p = Polygon(crop_p)
crop_segm = list()
for poly in segm:
poly = np.array(poly).reshape(len(poly) // 2, 2)
polygon = Polygon(poly)
if not polygon.is_valid:
exterior = polygon.exterior
multi_lines = exterior.intersection(exterior)
polygons = shapely.ops.polygonize(multi_lines)
polygon = MultiPolygon(polygons)
multi_polygon = list()
if isinstance(polygon, MultiPolygon):
multi_polygon = copy.deepcopy(polygon)
else:
multi_polygon.append(copy.deepcopy(polygon))
for per_polygon in multi_polygon:
inter = per_polygon.intersection(crop_p)
if not inter:
continue
if isinstance(inter, (MultiPolygon, GeometryCollection)):
for part in inter:
if not isinstance(part, Polygon):
continue
part = np.squeeze(
np.array(part.exterior.coords[:-1]).reshape(1,
-1))
part[0::2] -= xmin
part[1::2] -= ymin
crop_segm.append(part.tolist())
elif isinstance(inter, Polygon):
crop_poly = np.squeeze(
np.array(inter.exterior.coords[:-1]).reshape(1, -1))
crop_poly[0::2] -= xmin
crop_poly[1::2] -= ymin
crop_segm.append(crop_poly.tolist())
else:
continue
return crop_segm
def _crop_rle(rle, crop, height, width):
if 'counts' in rle and type(rle['counts']) == list:
rle = mask_util.frPyObjects(rle, height, width)
mask = mask_util.decode(rle)
mask = mask[crop[1]:crop[3], crop[0]:crop[2]]
rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
return rle
i, j, h, w = region
crop = [j, i, j + w, i + h]
height, width = image_shape
crop_segms = []
for segm in segms:
if is_poly(segm):
import copy
import shapely.ops
from shapely.geometry import Polygon, MultiPolygon, GeometryCollection
# Polygon format
crop_segms.append(_crop_poly(segm, crop))
else:
# RLE format
import pycocotools.mask as mask_util
crop_segms.append(_crop_rle(segm, crop, height, width))
return crop_segms
def apply(self, sample, context=None):
h = random.randint(self.min_size,
min(sample['image'].shape[0], self.max_size))
w = random.randint(self.min_size,
min(sample['image'].shape[1], self.max_size))
region = self.get_crop_params(sample['image'].shape[:2], [h, w])
return self.crop(sample, region)
@register_op
class WarpAffine(BaseOperator):
def __init__(self,
keep_res=False,
pad=31,
input_h=512,
input_w=512,
scale=0.4,
shift=0.1,
down_ratio=4):
"""WarpAffine
Warp affine the image
The code is based on https://github.com/xingyizhou/CenterNet/blob/master/src/lib/datasets/sample/ctdet.py
"""
super(WarpAffine, self).__init__()
self.keep_res = keep_res
self.pad = pad
self.input_h = input_h
self.input_w = input_w
self.scale = scale
self.shift = shift
self.down_ratio = down_ratio
def apply(self, sample, context=None):
img = sample['image']
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
h, w = img.shape[:2]
if self.keep_res:
# True in detection eval/infer
input_h = (h | self.pad) + 1
input_w = (w | self.pad) + 1
s = np.array([input_w, input_h], dtype=np.float32)
c = np.array([w // 2, h // 2], dtype=np.float32)
else:
            # False in CenterTrack MOT eval/infer
s = max(h, w) * 1.0
input_h, input_w = self.input_h, self.input_w
c = np.array([w / 2., h / 2.], dtype=np.float32)
trans_input = get_affine_transform(c, s, 0, [input_w, input_h])
        img = cv2.resize(img, (w, h))  # no-op resize, kept from the original CenterNet code
inp = cv2.warpAffine(
img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR)
sample['image'] = inp
if not self.keep_res:
out_h = input_h // self.down_ratio
out_w = input_w // self.down_ratio
trans_output = get_affine_transform(c, s, 0, [out_w, out_h])
sample.update({
'center': c,
'scale': s,
'out_height': out_h,
'out_width': out_w,
'inp_height': input_h,
'inp_width': input_w,
'trans_input': trans_input,
'trans_output': trans_output,
})
return sample
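# A minimal usage sketch (illustrative only; `_demo_warp_affine` is not part
# of the repo's API). It assumes an RGB uint8 HWC image and relies on the
# module-level numpy/cv2 imports.
def _demo_warp_affine():
    sample = {'image': np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)}
    op = WarpAffine(keep_res=False, input_h=512, input_w=512)
    out = op(sample)
    # with keep_res=False the image is warped to the fixed 512x512 input and
    # the center/scale used for the warp are stored for target generation
    assert out['image'].shape[:2] == (512, 512)
    return out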
@register_op
class FlipWarpAffine(BaseOperator):
def __init__(self,
keep_res=False,
pad=31,
input_h=512,
input_w=512,
not_rand_crop=False,
scale=0.4,
shift=0.1,
flip=0.5,
is_scale=True,
use_random=True,
add_pre_img=False):
"""FlipWarpAffine
1. Random Crop
2. Flip the image horizontal
3. Warp affine the image
4. (Optinal) Add previous image
"""
super(FlipWarpAffine, self).__init__()
self.keep_res = keep_res
self.pad = pad
self.input_h = input_h
self.input_w = input_w
self.not_rand_crop = not_rand_crop
self.scale = scale
self.shift = shift
self.flip = flip
self.is_scale = is_scale
self.use_random = use_random
self.add_pre_img = add_pre_img
def __call__(self, samples, context=None):
if self.add_pre_img:
assert isinstance(samples, Sequence) and len(samples) == 2
sample, pre_sample = samples[0], samples[1]
else:
sample = samples
img = sample['image']
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0:
return sample
h, w = img.shape[:2]
flipped = 0
if self.keep_res:
input_h = (h | self.pad) + 1
input_w = (w | self.pad) + 1
s = np.array([input_w, input_h], dtype=np.float32)
c = np.array([w // 2, h // 2], dtype=np.float32)
else:
# centernet training default
s = max(h, w) * 1.0
input_h, input_w = self.input_h, self.input_w
c = np.array([w / 2., h / 2.], dtype=np.float32)
if self.use_random:
gt_bbox = sample['gt_bbox']
if not self.not_rand_crop:
# centernet default
s = s * np.random.choice(np.arange(0.6, 1.4, 0.1))
w_border = get_border(128, w)
h_border = get_border(128, h)
c[0] = np.random.randint(low=w_border, high=w - w_border)
c[1] = np.random.randint(low=h_border, high=h - h_border)
else:
sf = self.scale
cf = self.shift
c[0] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf)
c[1] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf)
s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
if np.random.random() < self.flip:
img = img[:, ::-1, :]
c[0] = w - c[0] - 1
oldx1 = gt_bbox[:, 0].copy()
oldx2 = gt_bbox[:, 2].copy()
gt_bbox[:, 0] = w - oldx2 - 1
gt_bbox[:, 2] = w - oldx1 - 1
flipped = 1
sample['gt_bbox'] = gt_bbox
trans_input = get_affine_transform(c, s, 0, [input_w, input_h])
inp = cv2.warpAffine(
img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR)
if self.is_scale:
inp = (inp.astype(np.float32) / 255.)
sample['image'] = inp
sample['center'] = c
sample['scale'] = s
if self.add_pre_img:
sample['trans_input'] = trans_input
# previous image, use same aug trans_input as current image
pre_img = pre_sample['image']
pre_img = cv2.cvtColor(pre_img, cv2.COLOR_RGB2BGR)
if flipped:
pre_img = pre_img[:, ::-1, :].copy()
pre_inp = cv2.warpAffine(
pre_img,
trans_input, (input_w, input_h),
flags=cv2.INTER_LINEAR)
if self.is_scale:
pre_inp = (pre_inp.astype(np.float32) / 255.)
sample['pre_image'] = pre_inp
            # return early if the previous frame has no gt boxes
if 'gt_bbox' in pre_sample and len(pre_sample['gt_bbox']) == 0:
return sample
pre_gt_bbox = pre_sample['gt_bbox']
if flipped:
pre_oldx1 = pre_gt_bbox[:, 0].copy()
pre_oldx2 = pre_gt_bbox[:, 2].copy()
                pre_gt_bbox[:, 0] = w - pre_oldx2 - 1
                pre_gt_bbox[:, 2] = w - pre_oldx1 - 1
sample['pre_gt_bbox'] = pre_gt_bbox
sample['pre_gt_class'] = pre_sample['gt_class']
sample['pre_gt_track_id'] = pre_sample['gt_track_id']
del pre_sample
return sample
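# A minimal usage sketch (illustrative only; `_demo_flip_warp_affine` is not
# repo API). `use_random=False` disables the random crop/flip branch so the
# call is deterministic; the gt boxes are consumed by later target ops.
def _demo_flip_warp_affine():
    sample = {
        'image': np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8),
        'gt_bbox': np.array([[10., 20., 200., 220.]], dtype=np.float32),
    }
    out = FlipWarpAffine(use_random=False)(sample)
    # is_scale=True by default, so the warped image is float32 in [0, 1]
    return out['image'].shape  # (512, 512, 3)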
@register_op
class CenterRandColor(BaseOperator):
"""Random color for CenterNet series models.
Args:
saturation (float): saturation settings.
contrast (float): contrast settings.
brightness (float): brightness settings.
"""
def __init__(self, saturation=0.4, contrast=0.4, brightness=0.4):
super(CenterRandColor, self).__init__()
self.saturation = saturation
self.contrast = contrast
self.brightness = brightness
def apply_saturation(self, img, img_gray):
alpha = 1. + np.random.uniform(
low=-self.saturation, high=self.saturation)
self._blend(alpha, img, img_gray[:, :, None])
return img
def apply_contrast(self, img, img_gray):
alpha = 1. + np.random.uniform(low=-self.contrast, high=self.contrast)
img_mean = img_gray.mean()
self._blend(alpha, img, img_mean)
return img
def apply_brightness(self, img, img_gray):
alpha = 1 + np.random.uniform(
low=-self.brightness, high=self.brightness)
img *= alpha
return img
def _blend(self, alpha, img, img_mean):
img *= alpha
img_mean *= (1 - alpha)
img += img_mean
def apply(self, sample, context=None):
functions = [
self.apply_brightness,
self.apply_contrast,
self.apply_saturation,
]
img = sample['image']
img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
distortions = np.random.permutation(functions)
for func in distortions:
img = func(img, img_gray)
sample['image'] = img
if 'pre_image' in sample:
pre_img = sample['pre_image']
pre_img_gray = cv2.cvtColor(pre_img, cv2.COLOR_BGR2GRAY)
pre_distortions = np.random.permutation(functions)
for func in pre_distortions:
pre_img = func(pre_img, pre_img_gray)
sample['pre_image'] = pre_img
return sample
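# A minimal usage sketch (illustrative only). The in-place `img *= alpha`
# blends require a float image, so a float32 sample is assumed here.
def _demo_center_rand_color():
    sample = {'image': np.random.rand(64, 64, 3).astype(np.float32)}
    out = CenterRandColor()(sample)
    # brightness/contrast/saturation are applied in random order, each as
    # img = alpha * img + (1 - alpha) * reference (gray image or its mean)
    return out['image'].dtype  # float32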
@register_op
class Mosaic(BaseOperator):
""" Mosaic operator for image and gt_bboxes
The code is based on https://github.com/Megvii-BaseDetection/YOLOX/blob/main/yolox/data/datasets/mosaicdetection.py
1. get mosaic coords
2. clip bbox and get mosaic_labels
3. random_affine augment
    4. Mixup augment as copypaste (optional), not used in tiny/nano
Args:
prob (float): probability of using Mosaic, 1.0 as default
input_dim (list[int]): input shape
degrees (list[2]): the rotate range to apply, transform range is [min, max]
translate (list[2]): the translate range to apply, transform range is [min, max]
scale (list[2]): the scale range to apply, transform range is [min, max]
shear (list[2]): the shear range to apply, transform range is [min, max]
enable_mixup (bool): whether to enable Mixup or not
mixup_prob (float): probability of using Mixup, 1.0 as default
        mixup_scale (list[float]): scale range of Mixup
remove_outside_box (bool): whether remove outside boxes, False as
default in COCO dataset, True in MOT dataset
"""
def __init__(self,
prob=1.0,
input_dim=[640, 640],
degrees=[-10, 10],
translate=[-0.1, 0.1],
scale=[0.1, 2],
shear=[-2, 2],
enable_mixup=True,
mixup_prob=1.0,
mixup_scale=[0.5, 1.5],
remove_outside_box=False):
super(Mosaic, self).__init__()
self.prob = prob
if isinstance(input_dim, Integral):
input_dim = [input_dim, input_dim]
self.input_dim = input_dim
self.degrees = degrees
self.translate = translate
self.scale = scale
self.shear = shear
self.enable_mixup = enable_mixup
self.mixup_prob = mixup_prob
self.mixup_scale = mixup_scale
self.remove_outside_box = remove_outside_box
def get_mosaic_coords(self, mosaic_idx, xc, yc, w, h, input_h, input_w):
# (x1, y1, x2, y2) means coords in large image,
# small_coords means coords in small image in mosaic aug.
if mosaic_idx == 0:
# top left
x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc
small_coords = w - (x2 - x1), h - (y2 - y1), w, h
elif mosaic_idx == 1:
# top right
x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc
small_coords = 0, h - (y2 - y1), min(w, x2 - x1), h
elif mosaic_idx == 2:
# bottom left
x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h)
small_coords = w - (x2 - x1), 0, w, min(y2 - y1, h)
elif mosaic_idx == 3:
# bottom right
x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(input_h * 2,
yc + h)
small_coords = 0, 0, min(w, x2 - x1), min(y2 - y1, h)
return (x1, y1, x2, y2), small_coords
def random_affine_augment(self,
img,
labels=[],
input_dim=[640, 640],
degrees=[-10, 10],
scales=[0.1, 2],
shears=[-2, 2],
translates=[-0.1, 0.1]):
# random rotation and scale
degree = random.uniform(degrees[0], degrees[1])
scale = random.uniform(scales[0], scales[1])
assert scale > 0, "Argument scale should be positive."
R = cv2.getRotationMatrix2D(angle=degree, center=(0, 0), scale=scale)
M = np.ones([2, 3])
# random shear
shear = random.uniform(shears[0], shears[1])
shear_x = math.tan(shear * math.pi / 180)
shear_y = math.tan(shear * math.pi / 180)
M[0] = R[0] + shear_y * R[1]
M[1] = R[1] + shear_x * R[0]
# random translation
translate = random.uniform(translates[0], translates[1])
translation_x = translate * input_dim[0]
translation_y = translate * input_dim[1]
M[0, 2] = translation_x
M[1, 2] = translation_y
# warpAffine
img = cv2.warpAffine(
img, M, dsize=tuple(input_dim), borderValue=(114, 114, 114))
num_gts = len(labels)
if num_gts > 0:
# warp corner points
corner_points = np.ones((4 * num_gts, 3))
corner_points[:, :2] = labels[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(
4 * num_gts, 2) # x1y1, x2y2, x1y2, x2y1
# apply affine transform
            corner_points = corner_points @ M.T
corner_points = corner_points.reshape(num_gts, 8)
# create new boxes
corner_xs = corner_points[:, 0::2]
corner_ys = corner_points[:, 1::2]
new_bboxes = np.concatenate((corner_xs.min(1), corner_ys.min(1),
corner_xs.max(1), corner_ys.max(1)))
new_bboxes = new_bboxes.reshape(4, num_gts).T
# clip boxes
new_bboxes[:, 0::2] = np.clip(new_bboxes[:, 0::2], 0, input_dim[0])
new_bboxes[:, 1::2] = np.clip(new_bboxes[:, 1::2], 0, input_dim[1])
labels[:, :4] = new_bboxes
return img, labels
def __call__(self, sample, context=None):
if not isinstance(sample, Sequence):
return sample
assert len(
sample) == 5, "Mosaic needs 5 samples, 4 for mosaic and 1 for mixup."
if np.random.uniform(0., 1.) > self.prob:
return sample[0]
mosaic_gt_bbox, mosaic_gt_class, mosaic_is_crowd, mosaic_difficult = [], [], [], []
input_h, input_w = self.input_dim
yc = int(random.uniform(0.5 * input_h, 1.5 * input_h))
xc = int(random.uniform(0.5 * input_w, 1.5 * input_w))
mosaic_img = np.full((input_h * 2, input_w * 2, 3), 114, dtype=np.uint8)
# 1. get mosaic coords
for mosaic_idx, sp in enumerate(sample[:4]):
img = sp['image']
gt_bbox = sp['gt_bbox']
h0, w0 = img.shape[:2]
scale = min(1. * input_h / h0, 1. * input_w / w0)
img = cv2.resize(
img, (int(w0 * scale), int(h0 * scale)),
interpolation=cv2.INTER_LINEAR)
(h, w, c) = img.shape[:3]
# suffix l means large image, while s means small image in mosaic aug.
(l_x1, l_y1, l_x2, l_y2), (
s_x1, s_y1, s_x2, s_y2) = self.get_mosaic_coords(
mosaic_idx, xc, yc, w, h, input_h, input_w)
mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2]
padw, padh = l_x1 - s_x1, l_y1 - s_y1
            # rescale gt boxes (pixel xyxy) and shift them into the mosaic canvas
_gt_bbox = gt_bbox.copy()
if len(gt_bbox) > 0:
_gt_bbox[:, 0] = scale * gt_bbox[:, 0] + padw
_gt_bbox[:, 1] = scale * gt_bbox[:, 1] + padh
_gt_bbox[:, 2] = scale * gt_bbox[:, 2] + padw
_gt_bbox[:, 3] = scale * gt_bbox[:, 3] + padh
mosaic_gt_bbox.append(_gt_bbox)
mosaic_gt_class.append(sp['gt_class'])
if 'is_crowd' in sp:
mosaic_is_crowd.append(sp['is_crowd'])
if 'difficult' in sp:
mosaic_difficult.append(sp['difficult'])
# 2. clip bbox and get mosaic_labels([gt_bbox, gt_class, is_crowd])
if len(mosaic_gt_bbox):
mosaic_gt_bbox = np.concatenate(mosaic_gt_bbox, 0)
mosaic_gt_class = np.concatenate(mosaic_gt_class, 0)
if mosaic_is_crowd:
mosaic_is_crowd = np.concatenate(mosaic_is_crowd, 0)
mosaic_labels = np.concatenate([
mosaic_gt_bbox,
mosaic_gt_class.astype(mosaic_gt_bbox.dtype),
mosaic_is_crowd.astype(mosaic_gt_bbox.dtype)
], 1)
elif mosaic_difficult:
mosaic_difficult = np.concatenate(mosaic_difficult, 0)
mosaic_labels = np.concatenate([
mosaic_gt_bbox,
mosaic_gt_class.astype(mosaic_gt_bbox.dtype),
mosaic_difficult.astype(mosaic_gt_bbox.dtype)
], 1)
else:
mosaic_labels = np.concatenate([
mosaic_gt_bbox, mosaic_gt_class.astype(mosaic_gt_bbox.dtype)
], 1)
if self.remove_outside_box:
# for MOT dataset
flag1 = mosaic_gt_bbox[:, 0] < 2 * input_w
flag2 = mosaic_gt_bbox[:, 2] > 0
flag3 = mosaic_gt_bbox[:, 1] < 2 * input_h
flag4 = mosaic_gt_bbox[:, 3] > 0
flag_all = flag1 * flag2 * flag3 * flag4
mosaic_labels = mosaic_labels[flag_all]
else:
mosaic_labels[:, 0] = np.clip(mosaic_labels[:, 0], 0,
2 * input_w)
mosaic_labels[:, 1] = np.clip(mosaic_labels[:, 1], 0,
2 * input_h)
mosaic_labels[:, 2] = np.clip(mosaic_labels[:, 2], 0,
2 * input_w)
mosaic_labels[:, 3] = np.clip(mosaic_labels[:, 3], 0,
2 * input_h)
else:
mosaic_labels = np.zeros((1, 6))
# 3. random_affine augment
mosaic_img, mosaic_labels = self.random_affine_augment(
mosaic_img,
mosaic_labels,
input_dim=self.input_dim,
degrees=self.degrees,
translates=self.translate,
scales=self.scale,
shears=self.shear)
# 4. Mixup augment as copypaste, https://arxiv.org/abs/2012.07177
        # optional, not used (enable_mixup=False) in tiny/nano
if (self.enable_mixup and not len(mosaic_labels) == 0 and
random.random() < self.mixup_prob):
sample_mixup = sample[4]
mixup_img = sample_mixup['image']
if 'is_crowd' in sample_mixup:
cp_labels = np.concatenate([
sample_mixup['gt_bbox'],
sample_mixup['gt_class'].astype(mosaic_labels.dtype),
sample_mixup['is_crowd'].astype(mosaic_labels.dtype)
], 1)
elif 'difficult' in sample_mixup:
cp_labels = np.concatenate([
sample_mixup['gt_bbox'],
sample_mixup['gt_class'].astype(mosaic_labels.dtype),
sample_mixup['difficult'].astype(mosaic_labels.dtype)
], 1)
else:
cp_labels = np.concatenate([
sample_mixup['gt_bbox'],
sample_mixup['gt_class'].astype(mosaic_labels.dtype)
], 1)
mosaic_img, mosaic_labels = self.mixup_augment(
mosaic_img, mosaic_labels, self.input_dim, cp_labels, mixup_img)
sample0 = sample[0]
sample0['image'] = mosaic_img.astype(np.uint8) # can not be float32
sample0['h'] = float(mosaic_img.shape[0])
sample0['w'] = float(mosaic_img.shape[1])
sample0['im_shape'][0] = sample0['h']
sample0['im_shape'][1] = sample0['w']
sample0['gt_bbox'] = mosaic_labels[:, :4].astype(np.float32)
sample0['gt_class'] = mosaic_labels[:, 4:5].astype(np.float32)
if 'is_crowd' in sample[0]:
sample0['is_crowd'] = mosaic_labels[:, 5:6].astype(np.float32)
if 'difficult' in sample[0]:
sample0['difficult'] = mosaic_labels[:, 5:6].astype(np.float32)
return sample0
def mixup_augment(self, origin_img, origin_labels, input_dim, cp_labels,
img):
jit_factor = random.uniform(*self.mixup_scale)
FLIP = random.uniform(0, 1) > 0.5
if len(img.shape) == 3:
cp_img = np.ones(
(input_dim[0], input_dim[1], 3), dtype=np.uint8) * 114
else:
cp_img = np.ones(input_dim, dtype=np.uint8) * 114
cp_scale_ratio = min(input_dim[0] / img.shape[0],
input_dim[1] / img.shape[1])
resized_img = cv2.resize(
img, (int(img.shape[1] * cp_scale_ratio),
int(img.shape[0] * cp_scale_ratio)),
interpolation=cv2.INTER_LINEAR)
cp_img[:int(img.shape[0] * cp_scale_ratio), :int(img.shape[
1] * cp_scale_ratio)] = resized_img
cp_img = cv2.resize(cp_img, (int(cp_img.shape[1] * jit_factor),
int(cp_img.shape[0] * jit_factor)))
cp_scale_ratio *= jit_factor
if FLIP:
cp_img = cp_img[:, ::-1, :]
origin_h, origin_w = cp_img.shape[:2]
target_h, target_w = origin_img.shape[:2]
padded_img = np.zeros(
(max(origin_h, target_h), max(origin_w, target_w), 3),
dtype=np.uint8)
padded_img[:origin_h, :origin_w] = cp_img
x_offset, y_offset = 0, 0
if padded_img.shape[0] > target_h:
y_offset = random.randint(0, padded_img.shape[0] - target_h - 1)
if padded_img.shape[1] > target_w:
x_offset = random.randint(0, padded_img.shape[1] - target_w - 1)
padded_cropped_img = padded_img[y_offset:y_offset + target_h, x_offset:
x_offset + target_w]
# adjust boxes
cp_bboxes_origin_np = cp_labels[:, :4].copy()
cp_bboxes_origin_np[:, 0::2] = np.clip(cp_bboxes_origin_np[:, 0::2] *
cp_scale_ratio, 0, origin_w)
cp_bboxes_origin_np[:, 1::2] = np.clip(cp_bboxes_origin_np[:, 1::2] *
cp_scale_ratio, 0, origin_h)
if FLIP:
cp_bboxes_origin_np[:, 0::2] = (
origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1])
cp_bboxes_transformed_np = cp_bboxes_origin_np.copy()
if self.remove_outside_box:
# for MOT dataset
cp_bboxes_transformed_np[:, 0::2] -= x_offset
cp_bboxes_transformed_np[:, 1::2] -= y_offset
else:
cp_bboxes_transformed_np[:, 0::2] = np.clip(
cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w)
cp_bboxes_transformed_np[:, 1::2] = np.clip(
cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h)
cls_labels = cp_labels[:, 4:5].copy()
box_labels = cp_bboxes_transformed_np
if cp_labels.shape[-1] == 6:
crd_labels = cp_labels[:, 5:6].copy()
labels = np.hstack((box_labels, cls_labels, crd_labels))
else:
labels = np.hstack((box_labels, cls_labels))
if self.remove_outside_box:
labels = labels[labels[:, 0] < target_w]
labels = labels[labels[:, 2] > 0]
labels = labels[labels[:, 1] < target_h]
labels = labels[labels[:, 3] > 0]
origin_labels = np.vstack((origin_labels, labels))
origin_img = origin_img.astype(np.float32)
origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype(
np.float32)
return origin_img.astype(np.uint8), origin_labels
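# A minimal usage sketch (illustrative only; `_demo_mosaic` is not repo API).
# Mosaic expects a list of 5 samples: 4 tiles plus 1 mixup candidate (unused
# here since enable_mixup=False).
def _demo_mosaic():
    def make_sample():
        return {
            'image': np.random.randint(0, 255, (320, 320, 3), dtype=np.uint8),
            'gt_bbox': np.array([[10., 10., 100., 100.]], dtype=np.float32),
            'gt_class': np.array([[0]], dtype=np.int32),
            'im_shape': np.array([320., 320.], dtype=np.float32),
        }
    samples = [make_sample() for _ in range(5)]
    out = Mosaic(input_dim=[640, 640], enable_mixup=False)(samples)
    # the 2x mosaic canvas is warped back to input_dim by random_affine_augment
    return out['image'].shape  # (640, 640, 3)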
@register_op
class PadResize(BaseOperator):
""" PadResize for image and gt_bbbox
Args:
target_size (list[int]): input shape
fill_value (float): pixel value of padded image
"""
def __init__(self, target_size, fill_value=114):
super(PadResize, self).__init__()
if isinstance(target_size, Integral):
target_size = [target_size, target_size]
self.target_size = target_size
self.fill_value = fill_value
def _resize(self, img, bboxes, labels):
ratio = min(self.target_size[0] / img.shape[0],
self.target_size[1] / img.shape[1])
w, h = int(img.shape[1] * ratio), int(img.shape[0] * ratio)
resized_img = cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR)
if len(bboxes) > 0:
bboxes *= ratio
mask = np.minimum(bboxes[:, 2] - bboxes[:, 0],
bboxes[:, 3] - bboxes[:, 1]) > 1
bboxes = bboxes[mask]
labels = labels[mask]
return resized_img, bboxes, labels
def _pad(self, img):
h, w, _ = img.shape
if h == self.target_size[0] and w == self.target_size[1]:
return img
padded_img = np.full(
(self.target_size[0], self.target_size[1], 3),
self.fill_value,
dtype=np.uint8)
padded_img[:h, :w] = img
return padded_img
def apply(self, sample, context=None):
image = sample['image']
bboxes = sample['gt_bbox']
labels = sample['gt_class']
image, bboxes, labels = self._resize(image, bboxes, labels)
sample['image'] = self._pad(image).astype(np.float32)
sample['gt_bbox'] = bboxes
sample['gt_class'] = labels
return sample
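# A minimal usage sketch (illustrative only). The image is resized with the
# aspect ratio kept, then padded bottom/right with fill_value to target_size.
def _demo_pad_resize():
    sample = {
        'image': np.random.randint(0, 255, (240, 320, 3), dtype=np.uint8),
        'gt_bbox': np.array([[10., 10., 100., 100.]], dtype=np.float32),
        'gt_class': np.array([[0]], dtype=np.int32),
    }
    out = PadResize(target_size=416)(sample)
    return out['image'].shape  # (416, 416, 3)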
@register_op
class RandomShift(BaseOperator):
"""
Randomly shift image
Args:
prob (float): probability to do random shift.
max_shift (int): max shift pixels
filter_thr (int): filter gt bboxes if one side is smaller than this
"""
def __init__(self, prob=0.5, max_shift=32, filter_thr=1):
super(RandomShift, self).__init__()
self.prob = prob
self.max_shift = max_shift
self.filter_thr = filter_thr
def calc_shift_coor(self, im_h, im_w, shift_h, shift_w):
return [
max(0, shift_w), max(0, shift_h), min(im_w, im_w + shift_w),
min(im_h, im_h + shift_h)
]
def apply(self, sample, context=None):
if random.random() > self.prob:
return sample
im = sample['image']
gt_bbox = sample['gt_bbox']
gt_class = sample['gt_class']
im_h, im_w = im.shape[:2]
shift_h = random.randint(-self.max_shift, self.max_shift)
shift_w = random.randint(-self.max_shift, self.max_shift)
gt_bbox[:, 0::2] += shift_w
gt_bbox[:, 1::2] += shift_h
gt_bbox[:, 0::2] = np.clip(gt_bbox[:, 0::2], 0, im_w)
gt_bbox[:, 1::2] = np.clip(gt_bbox[:, 1::2], 0, im_h)
        gt_bbox_w = gt_bbox[:, 2] - gt_bbox[:, 0]
        gt_bbox_h = gt_bbox[:, 3] - gt_bbox[:, 1]
        keep = (gt_bbox_w > self.filter_thr) & (gt_bbox_h > self.filter_thr)
if not keep.any():
return sample
gt_bbox = gt_bbox[keep]
gt_class = gt_class[keep]
# shift image
coor_new = self.calc_shift_coor(im_h, im_w, shift_h, shift_w)
# shift frame to the opposite direction
coor_old = self.calc_shift_coor(im_h, im_w, -shift_h, -shift_w)
canvas = np.zeros_like(im)
canvas[coor_new[1]:coor_new[3], coor_new[0]:coor_new[2]] \
= im[coor_old[1]:coor_old[3], coor_old[0]:coor_old[2]]
sample['image'] = canvas
sample['gt_bbox'] = gt_bbox
sample['gt_class'] = gt_class
return sample
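# A minimal usage sketch (illustrative only). prob=1.0 forces the shift so the
# call always moves the canvas and the boxes together.
def _demo_random_shift():
    sample = {
        'image': np.random.randint(0, 255, (240, 320, 3), dtype=np.uint8),
        'gt_bbox': np.array([[50., 50., 150., 150.]], dtype=np.float32),
        'gt_class': np.array([[0]], dtype=np.int32),
    }
    out = RandomShift(prob=1.0, max_shift=16)(sample)
    return out['gt_bbox']  # shifted with the image and clipped to its bounds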
@register_op
class StrongAugImage(BaseOperator):
def __init__(self, transforms):
super(StrongAugImage, self).__init__()
self.transforms = Compose(transforms)
    def apply(self, sample, context=None):
        sample['image'] = sample['image'].astype('uint8')
        results = self.transforms(sample)
        sample['image'] = results['image'].astype('uint8')
        return sample
@register_op
class RandomColorJitter(BaseOperator):
def __init__(self,
prob=0.8,
brightness=0.4,
contrast=0.4,
saturation=0.4,
hue=0.1):
super(RandomColorJitter, self).__init__()
self.prob = prob
self.brightness = brightness
self.contrast = contrast
self.saturation = saturation
self.hue = hue
def apply(self, sample, context=None):
if np.random.uniform(0, 1) < self.prob:
from paddle.vision.transforms import ColorJitter
transform = ColorJitter(self.brightness, self.contrast,
self.saturation, self.hue)
sample['image'] = transform(sample['image'].astype(np.uint8))
sample['image'] = sample['image'].astype(np.float32)
return sample
@register_op
class RandomGrayscale(BaseOperator):
def __init__(self, prob=0.2):
super(RandomGrayscale, self).__init__()
self.prob = prob
def apply(self, sample, context=None):
if np.random.uniform(0, 1) < self.prob:
from paddle.vision.transforms import Grayscale
transform = Grayscale(num_output_channels=3)
sample['image'] = transform(sample['image'])
return sample
@register_op
class RandomGaussianBlur(BaseOperator):
def __init__(self, prob=0.5, sigma=[0.1, 2.0]):
super(RandomGaussianBlur, self).__init__()
self.prob = prob
self.sigma = sigma
def apply(self, sample, context=None):
if np.random.uniform(0, 1) < self.prob:
sigma = np.random.uniform(self.sigma[0], self.sigma[1])
im = cv2.GaussianBlur(sample['image'], (23, 23), sigma)
sample['image'] = im
return sample
@register_op
class RandomErasing(BaseOperator):
def __init__(self,
prob=0.5,
scale=(0.02, 0.33),
ratio=(0.3, 3.3),
value=0,
inplace=False):
super(RandomErasing, self).__init__()
assert isinstance(scale,
(tuple, list)), "scale should be a tuple or list"
assert (scale[0] >= 0 and scale[1] <= 1 and scale[0] <= scale[1]
), "scale should be of kind (min, max) and in range [0, 1]"
assert isinstance(ratio,
(tuple, list)), "ratio should be a tuple or list"
assert (ratio[0] >= 0 and
ratio[0] <= ratio[1]), "ratio should be of kind (min, max)"
assert isinstance(
value, (Number, str, tuple,
list)), "value should be a number, tuple, list or str"
if isinstance(value, str) and value != "random":
raise ValueError("value must be 'random' when type is str")
self.prob = prob
self.scale = scale
self.ratio = ratio
self.value = value
self.inplace = inplace
def _erase(self, img, i, j, h, w, v, inplace=False):
if not inplace:
img = img.copy()
img[i:i + h, j:j + w, ...] = v
return img
def _get_param(self, img, scale, ratio, value):
shape = np.asarray(img).astype(np.uint8).shape
h, w, c = shape[-3], shape[-2], shape[-1]
img_area = h * w
log_ratio = np.log(ratio)
        for _ in range(1):  # a single sampling attempt; fall through to the no-op below
erase_area = np.random.uniform(*scale) * img_area
aspect_ratio = np.exp(np.random.uniform(*log_ratio))
erase_h = int(round(np.sqrt(erase_area * aspect_ratio)))
erase_w = int(round(np.sqrt(erase_area / aspect_ratio)))
if erase_h >= h or erase_w >= w:
continue
if value is None:
v = np.random.normal(size=[erase_h, erase_w, c]) * 255
else:
v = np.array(value)[None, None, :]
top = np.random.randint(0, h - erase_h + 1)
left = np.random.randint(0, w - erase_w + 1)
return top, left, erase_h, erase_w, v
        return 0, 0, h, w, img  # no valid region sampled: "erase" the image with itself (no-op)
def apply(self, sample, context=None):
if random.random() < self.prob:
if isinstance(self.value, Number):
value = [self.value]
elif isinstance(self.value, str):
value = None
else:
value = self.value
if value is not None and not (len(value) == 1 or len(value) == 3):
raise ValueError(
"Value should be a single number or a sequence with length equals to image's channel."
)
im = sample['image']
top, left, erase_h, erase_w, v = self._get_param(im, self.scale,
self.ratio, value)
im = self._erase(im, top, left, erase_h, erase_w, v, self.inplace)
sample['image'] = im
return sample
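# A minimal usage sketch (illustrative only). With a numeric `value` the
# sampled rectangle is filled with that constant; value="random" fills it
# with Gaussian noise instead.
def _demo_random_erasing():
    sample = {'image': np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8)}
    out = RandomErasing(prob=1.0, value=0)(sample)
    return out['image']  # one rectangle zeroed out when a valid region was sampled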
@register_op
class RandomErasingCrop(BaseOperator):
def __init__(self):
super(RandomErasingCrop, self).__init__()
self.transform1 = RandomErasing(
prob=0.7, scale=(0.05, 0.2), ratio=(0.3, 3.3), value="random")
self.transform2 = RandomErasing(
prob=0.5, scale=(0.05, 0.2), ratio=(0.1, 6), value="random")
self.transform3 = RandomErasing(
prob=0.3, scale=(0.05, 0.2), ratio=(0.05, 8), value="random")
def apply(self, sample, context=None):
sample = self.transform1(sample)
sample = self.transform2(sample)
sample = self.transform3(sample)
return sample
================================================
FILE: ppdet/data/transform/rotated_operators.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
from numbers import Number, Integral
import cv2
import numpy as np
import math
import copy
import os

from PIL import Image, ImageDraw

from .operators import register_op, BaseOperator
from ppdet.modeling.rbox_utils import poly2rbox_le135_np, poly2rbox_oc_np, rbox2poly_np
from ppdet.utils.logger import setup_logger
from ppdet.utils.compact import imagedraw_textsize_c
logger = setup_logger(__name__)
@register_op
class RRotate(BaseOperator):
""" Rotate Image, Polygon, Box
Args:
scale (float): rotate scale
angle (float): rotate angle
fill_value (int, tuple): fill color
auto_bound (bool): whether auto bound or not
"""
def __init__(self, scale=1.0, angle=0., fill_value=0., auto_bound=True):
super(RRotate, self).__init__()
self.scale = scale
self.angle = angle
self.fill_value = fill_value
self.auto_bound = auto_bound
def get_rotated_matrix(self, angle, scale, h, w):
center = ((w - 1) * 0.5, (h - 1) * 0.5)
matrix = cv2.getRotationMatrix2D(center, -angle, scale)
# calculate the new size
cos = np.abs(matrix[0, 0])
sin = np.abs(matrix[0, 1])
new_w = h * sin + w * cos
new_h = h * cos + w * sin
# calculate offset
n_w = int(np.round(new_w))
n_h = int(np.round(new_h))
if self.auto_bound:
ratio = min(w / n_w, h / n_h)
matrix = cv2.getRotationMatrix2D(center, -angle, ratio)
else:
matrix[0, 2] += (new_w - w) * 0.5
matrix[1, 2] += (new_h - h) * 0.5
w = n_w
h = n_h
return matrix, h, w
def get_rect_from_pts(self, pts, h, w):
""" get minimum rectangle of points
"""
assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct'
min_x, min_y = np.min(pts[:, 0::2], axis=1), np.min(pts[:, 1::2],
axis=1)
max_x, max_y = np.max(pts[:, 0::2], axis=1), np.max(pts[:, 1::2],
axis=1)
min_x, min_y = np.clip(min_x, 0, w), np.clip(min_y, 0, h)
max_x, max_y = np.clip(max_x, 0, w), np.clip(max_y, 0, h)
boxes = np.stack([min_x, min_y, max_x, max_y], axis=-1)
return boxes
def apply_image(self, image, matrix, h, w):
return cv2.warpAffine(
image, matrix, (w, h), borderValue=self.fill_value)
def apply_pts(self, pts, matrix, h, w):
assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct'
# n is number of samples and m is two times the number of points due to (x, y)
_, m = pts.shape
# transpose points
pts_ = pts.reshape(-1, 2).T
# pad 1 to convert the points to homogeneous coordinates
padding = np.ones((1, pts_.shape[1]), pts.dtype)
rotated_pts = np.matmul(matrix, np.concatenate((pts_, padding), axis=0))
return rotated_pts[:2, :].T.reshape(-1, m)
def apply(self, sample, context=None):
image = sample['image']
h, w = image.shape[:2]
matrix, h, w = self.get_rotated_matrix(self.angle, self.scale, h, w)
sample['image'] = self.apply_image(image, matrix, h, w)
polys = sample['gt_poly']
# TODO: segment or keypoint to be processed
if len(polys) > 0:
pts = self.apply_pts(polys, matrix, h, w)
sample['gt_poly'] = pts
sample['gt_bbox'] = self.get_rect_from_pts(pts, h, w)
return sample
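# A minimal usage sketch (illustrative only). gt_poly is an (N, 8) array of
# corner points; apply_pts rotates them via homogeneous coordinates and
# get_rect_from_pts recomputes the axis-aligned gt_bbox.
def _demo_rrotate():
    sample = {
        'image': np.random.randint(0, 255, (200, 300, 3), dtype=np.uint8),
        'gt_poly': np.array([[10., 10., 90., 10., 90., 60., 10., 60.]],
                            dtype=np.float32),
    }
    out = RRotate(angle=30., auto_bound=True)(sample)
    return out['gt_bbox']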
@register_op
class RandomRRotate(BaseOperator):
""" Random Rotate Image
Args:
scale (float, tuple, list): rotate scale
scale_mode (str): mode of scale, [range, value, None]
angle (float, tuple, list): rotate angle
angle_mode (str): mode of angle, [range, value, None]
fill_value (float, tuple, list): fill value
rotate_prob (float): probability of rotation
auto_bound (bool): whether auto bound or not
"""
def __init__(self,
scale=1.0,
scale_mode=None,
angle=0.,
angle_mode=None,
fill_value=0.,
rotate_prob=1.0,
auto_bound=True):
super(RandomRRotate, self).__init__()
self.scale = scale
self.scale_mode = scale_mode
self.angle = angle
self.angle_mode = angle_mode
self.fill_value = fill_value
self.rotate_prob = rotate_prob
self.auto_bound = auto_bound
def get_angle(self, angle, angle_mode):
assert not angle_mode or angle_mode in [
'range', 'value'
], 'angle mode should be in [range, value, None]'
if not angle_mode:
return angle
elif angle_mode == 'range':
low, high = angle
return np.random.rand() * (high - low) + low
elif angle_mode == 'value':
return np.random.choice(angle)
def get_scale(self, scale, scale_mode):
assert not scale_mode or scale_mode in [
'range', 'value'
], 'scale mode should be in [range, value, None]'
if not scale_mode:
return scale
elif scale_mode == 'range':
low, high = scale
return np.random.rand() * (high - low) + low
elif scale_mode == 'value':
return np.random.choice(scale)
def apply(self, sample, context=None):
if np.random.rand() > self.rotate_prob:
return sample
angle = self.get_angle(self.angle, self.angle_mode)
scale = self.get_scale(self.scale, self.scale_mode)
rotator = RRotate(scale, angle, self.fill_value, self.auto_bound)
return rotator(sample)
@register_op
class Poly2RBox(BaseOperator):
""" Polygon to Rotated Box, using new OpenCV definition since 4.5.1
Args:
filter_threshold (int, float): threshold to filter annotations
filter_mode (str): filter mode, ['area', 'edge']
rbox_type (str): rbox type, ['le135', 'oc']
"""
def __init__(self, filter_threshold=4, filter_mode=None, rbox_type='le135'):
super(Poly2RBox, self).__init__()
self.filter_fn = lambda size: self.filter(size, filter_threshold, filter_mode)
self.rbox_fn = poly2rbox_le135_np if rbox_type == 'le135' else poly2rbox_oc_np
def filter(self, size, threshold, mode):
if mode == 'area':
if size[0] * size[1] < threshold:
return True
elif mode == 'edge':
if min(size) < threshold:
return True
return False
def get_rbox(self, polys):
valid_ids, rboxes, bboxes = [], [], []
for i, poly in enumerate(polys):
cx, cy, w, h, angle = self.rbox_fn(poly)
if self.filter_fn((w, h)):
continue
rboxes.append(np.array([cx, cy, w, h, angle], dtype=np.float32))
valid_ids.append(i)
xmin, ymin = min(poly[0::2]), min(poly[1::2])
xmax, ymax = max(poly[0::2]), max(poly[1::2])
bboxes.append(np.array([xmin, ymin, xmax, ymax], dtype=np.float32))
if len(valid_ids) == 0:
rboxes = np.zeros((0, 5), dtype=np.float32)
bboxes = np.zeros((0, 4), dtype=np.float32)
else:
rboxes = np.stack(rboxes)
bboxes = np.stack(bboxes)
return rboxes, bboxes, valid_ids
def apply(self, sample, context=None):
rboxes, bboxes, valid_ids = self.get_rbox(sample['gt_poly'])
sample['gt_rbox'] = rboxes
sample['gt_bbox'] = bboxes
for k in ['gt_class', 'gt_score', 'gt_poly', 'is_crowd', 'difficult']:
if k in sample:
sample[k] = sample[k][valid_ids]
return sample
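# A minimal usage sketch (illustrative only). An axis-aligned 40x20 rectangle
# given as an 8-coordinate polygon is converted to (cx, cy, w, h, angle) in
# the le135 definition; filter_mode='edge' drops boxes whose short side is
# below filter_threshold.
def _demo_poly2rbox():
    sample = {
        'gt_poly': np.array([[0., 0., 40., 0., 40., 20., 0., 20.]],
                            dtype=np.float32),
        'gt_class': np.array([[0]], dtype=np.int32),
    }
    out = Poly2RBox(filter_threshold=4, filter_mode='edge')(sample)
    return out['gt_rbox']  # shape (1, 5)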
@register_op
class Poly2Array(BaseOperator):
""" convert gt_poly to np.array for rotated bboxes
"""
def __init__(self):
super(Poly2Array, self).__init__()
def apply(self, sample, context=None):
if 'gt_poly' in sample:
sample['gt_poly'] = np.array(
sample['gt_poly'], dtype=np.float32).reshape((-1, 8))
return sample
@register_op
class RResize(BaseOperator):
def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR):
"""
Resize image to target size. if keep_ratio is True,
resize the image's long side to the maximum of target_size
if keep_ratio is False, resize the image to target size(h, w)
Args:
target_size (int|list): image target size
keep_ratio (bool): whether keep_ratio or not, default true
interp (int): the interpolation method
"""
super(RResize, self).__init__()
self.keep_ratio = keep_ratio
self.interp = interp
if not isinstance(target_size, (Integral, Sequence)):
raise TypeError(
"Type of target_size is invalid. Must be Integer or List or Tuple, now is {}".
format(type(target_size)))
if isinstance(target_size, Integral):
target_size = [target_size, target_size]
self.target_size = target_size
def apply_image(self, image, scale):
im_scale_x, im_scale_y = scale
return cv2.resize(
image,
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=self.interp)
def apply_pts(self, pts, scale, size):
im_scale_x, im_scale_y = scale
resize_w, resize_h = size
pts[:, 0::2] *= im_scale_x
pts[:, 1::2] *= im_scale_y
pts[:, 0::2] = np.clip(pts[:, 0::2], 0, resize_w)
pts[:, 1::2] = np.clip(pts[:, 1::2], 0, resize_h)
return pts
def apply(self, sample, context=None):
""" Resize the image numpy.
"""
im = sample['image']
if not isinstance(im, np.ndarray):
raise TypeError("{}: image type is not numpy.".format(self))
        if len(im.shape) != 3:
            raise ValueError('{}: image is not 3-dimensional.'.format(self))
# apply image
im_shape = im.shape
if self.keep_ratio:
im_size_min = np.min(im_shape[0:2])
im_size_max = np.max(im_shape[0:2])
target_size_min = np.min(self.target_size)
target_size_max = np.max(self.target_size)
im_scale = min(target_size_min / im_size_min,
target_size_max / im_size_max)
resize_h = im_scale * float(im_shape[0])
resize_w = im_scale * float(im_shape[1])
im_scale_x = im_scale
im_scale_y = im_scale
else:
resize_h, resize_w = self.target_size
im_scale_y = resize_h / im_shape[0]
im_scale_x = resize_w / im_shape[1]
im = self.apply_image(sample['image'], [im_scale_x, im_scale_y])
sample['image'] = im.astype(np.float32)
sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)
if 'scale_factor' in sample:
scale_factor = sample['scale_factor']
sample['scale_factor'] = np.asarray(
[scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
dtype=np.float32)
else:
sample['scale_factor'] = np.asarray(
[im_scale_y, im_scale_x], dtype=np.float32)
# apply bbox
if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'],
[im_scale_x, im_scale_y],
[resize_w, resize_h])
# apply polygon
if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
sample['gt_poly'] = self.apply_pts(sample['gt_poly'],
[im_scale_x, im_scale_y],
[resize_w, resize_h])
return sample
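# A minimal usage sketch (illustrative only). With keep_ratio=True one scalar
# scale min(min_target/min_side, max_target/max_side) is applied on both axes
# and recorded in scale_factor.
def _demo_rresize():
    sample = {
        'image': np.random.randint(0, 255, (200, 300, 3), dtype=np.uint8),
        'gt_bbox': np.array([[10., 10., 90., 60.]], dtype=np.float32),
        'gt_poly': np.array([[10., 10., 90., 10., 90., 60., 10., 60.]],
                            dtype=np.float32),
    }
    out = RResize(target_size=[256, 256], keep_ratio=True)(sample)
    return out['im_shape'], out['scale_factor']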
@register_op
class RandomRFlip(BaseOperator):
def __init__(self, prob=0.5):
"""
Args:
prob (float): the probability of flipping image
"""
super(RandomRFlip, self).__init__()
self.prob = prob
if not (isinstance(self.prob, float)):
raise TypeError("{}: input type is invalid.".format(self))
def apply_image(self, image):
return image[:, ::-1, :]
def apply_pts(self, pts, width):
oldx = pts[:, 0::2].copy()
pts[:, 0::2] = width - oldx - 1
return pts
def apply(self, sample, context=None):
"""Filp the image and bounding box.
Operators:
1. Flip the image numpy.
2. Transform the bboxes' x coordinates.
(Must judge whether the coordinates are normalized!)
3. Transform the segmentations' x coordinates.
(Must judge whether the coordinates are normalized!)
Output:
sample: the image, bounding box and segmentation part
in sample are flipped.
"""
if np.random.uniform(0, 1) < self.prob:
im = sample['image']
height, width = im.shape[:2]
im = self.apply_image(im)
if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'], width)
if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
sample['gt_poly'] = self.apply_pts(sample['gt_poly'], width)
sample['flipped'] = True
sample['image'] = im
return sample
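# A minimal usage sketch (illustrative only). Every x coordinate of the
# polygon corners is mirrored as x' = width - x - 1; y stays unchanged.
def _demo_random_rflip():
    sample = {
        'image': np.random.randint(0, 255, (100, 200, 3), dtype=np.uint8),
        'gt_bbox': np.array([[10., 10., 50., 50.]], dtype=np.float32),
        'gt_poly': np.array([[10., 10., 50., 10., 50., 50., 10., 50.]],
                            dtype=np.float32),
    }
    out = RandomRFlip(prob=1.0)(sample)
    return out['gt_poly']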
@register_op
class VisibleRBox(BaseOperator):
"""
    In debug mode, visualize images according to `gt_poly`.
(Currently only supported when not cropping and flipping image.)
"""
    def __init__(self, output_dir='debug'):
        super(VisibleRBox, self).__init__()
        self.output_dir = output_dir
        # keypoints are drawn in pixel coordinates by default
        self.is_normalized = False
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
def apply(self, sample, context=None):
image = Image.fromarray(sample['image'].astype(np.uint8))
out_file_name = '{:012d}.jpg'.format(sample['im_id'][0])
width = sample['w']
height = sample['h']
# gt_poly = sample['gt_rbox']
gt_poly = sample['gt_poly']
gt_class = sample['gt_class']
draw = ImageDraw.Draw(image)
for i in range(gt_poly.shape[0]):
x1, y1, x2, y2, x3, y3, x4, y4 = gt_poly[i]
draw.line(
[(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)],
width=2,
fill='green')
# draw label
xmin = min(x1, x2, x3, x4)
ymin = min(y1, y2, y3, y4)
text = str(gt_class[i][0])
tw, th = imagedraw_textsize_c(draw, text)
draw.rectangle(
[(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green')
draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))
if 'gt_keypoint' in sample.keys():
gt_keypoint = sample['gt_keypoint']
if self.is_normalized:
for i in range(gt_keypoint.shape[1]):
if i % 2:
gt_keypoint[:, i] = gt_keypoint[:, i] * height
else:
gt_keypoint[:, i] = gt_keypoint[:, i] * width
for i in range(gt_keypoint.shape[0]):
keypoint = gt_keypoint[i]
for j in range(int(keypoint.shape[0] / 2)):
                    x1 = int(round(keypoint[2 * j]))
                    y1 = int(round(keypoint[2 * j + 1]))
draw.ellipse(
(x1, y1, x1 + 5, y1 + 5), fill='green', outline='green')
save_path = os.path.join(self.output_dir, out_file_name)
image.save(save_path, quality=95)
return sample
@register_op
class Rbox2Poly(BaseOperator):
"""
Convert rbbox format to poly format.
"""
def __init__(self):
super(Rbox2Poly, self).__init__()
def apply(self, sample, context=None):
assert 'gt_rbox' in sample
assert sample['gt_rbox'].shape[1] == 5
rboxes = sample['gt_rbox']
polys = rbox2poly_np(rboxes)
sample['gt_poly'] = polys
xmin, ymin = polys[:, 0::2].min(1), polys[:, 1::2].min(1)
xmax, ymax = polys[:, 0::2].max(1), polys[:, 1::2].max(1)
        sample['gt_bbox'] = np.stack([xmin, ymin, xmax, ymax], axis=1)
return sample
================================================
FILE: ppdet/data/utils.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import numbers
import numpy as np
try:
from collections.abc import Sequence, Mapping
except Exception:
from collections import Sequence, Mapping
def default_collate_fn(batch):
"""
Default batch collating function for :code:`paddle.io.DataLoader`,
get input data as a list of sample datas, each element in list
if the data of a sample, and sample data should composed of list,
dictionary, string, number, numpy array, this
function will parse input data recursively and stack number,
numpy array and paddle.Tensor datas as batch datas. e.g. for
following input data:
[{'image': np.array(shape=[3, 224, 224]), 'label': 1},
{'image': np.array(shape=[3, 224, 224]), 'label': 3},
{'image': np.array(shape=[3, 224, 224]), 'label': 4},
{'image': np.array(shape=[3, 224, 224]), 'label': 5},]
This default collate function zipped each number and numpy array
field together and stack each field as the batch field as follows:
{'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])}
Args:
batch(list of sample data): batch should be a list of sample data.
Returns:
Batched data: batched each number, numpy array and paddle.Tensor
in input data.
"""
sample = batch[0]
if isinstance(sample, np.ndarray):
batch = np.stack(batch, axis=0)
return batch
elif isinstance(sample, numbers.Number):
batch = np.array(batch)
return batch
elif isinstance(sample, (str, bytes)):
return batch
elif isinstance(sample, Mapping):
return {
key: default_collate_fn([d[key] for d in batch])
for key in sample
}
elif isinstance(sample, Sequence):
sample_fields_num = len(sample)
if not all(len(sample) == sample_fields_num for sample in iter(batch)):
raise RuntimeError(
"fileds number not same among samples in a batch")
return [default_collate_fn(fields) for fields in zip(*batch)]
    raise TypeError("batch data can only contain: tensor, numpy.ndarray, "
"dict, list, number, but got {}".format(type(sample)))
================================================
FILE: ppdet/engine/__init__.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import trainer
from .trainer import *
from . import trainer_cot
from .trainer_cot import *
from . import callbacks
from .callbacks import *
from . import env
from .env import *
__all__ = trainer.__all__ \
+ callbacks.__all__ \
+ env.__all__
from . import tracker
from .tracker import *
__all__ = __all__ + tracker.__all__
from . import trainer_ssod
from .trainer_ssod import *
__all__ = __all__ + trainer_ssod.__all__
================================================
FILE: ppdet/engine/callbacks.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import gc
import sys
import datetime
import six
import copy
import json
import paddle
import paddle.distributed as dist
from ppdet.utils.checkpoint import save_model, save_semi_model, save_model_info, update_train_results
from ppdet.metrics import get_infer_results
from ppdet.utils.logger import setup_logger
logger = setup_logger('ppdet.engine')
__all__ = [
'Callback', 'ComposeCallback', 'LogPrinter', 'Checkpointer',
'VisualDLWriter', 'SniperProposalsGenerator'
]
class Callback(object):
def __init__(self, model):
self.model = model
log_ranks = self.model.cfg.get("log_ranks", '0')
if isinstance(log_ranks, str):
self.log_ranks = [int(i) for i in log_ranks.split(',')]
elif isinstance(log_ranks, int):
self.log_ranks = [log_ranks]
        self.logger = setup_logger('ppdet.engine.callbacks', log_ranks=self.log_ranks)
def on_step_begin(self, status):
pass
def on_step_end(self, status):
pass
def on_epoch_begin(self, status):
pass
def on_epoch_end(self, status):
pass
def on_train_begin(self, status):
pass
def on_train_end(self, status):
pass
class ComposeCallback(object):
def __init__(self, callbacks):
callbacks = [c for c in list(callbacks) if c is not None]
for c in callbacks:
assert isinstance(
c, Callback), "callback should be subclass of Callback"
self._callbacks = callbacks
def on_step_begin(self, status):
for c in self._callbacks:
c.on_step_begin(status)
def on_step_end(self, status):
for c in self._callbacks:
c.on_step_end(status)
def on_epoch_begin(self, status):
for c in self._callbacks:
c.on_epoch_begin(status)
def on_epoch_end(self, status):
for c in self._callbacks:
c.on_epoch_end(status)
def on_train_begin(self, status):
for c in self._callbacks:
c.on_train_begin(status)
def on_train_end(self, status):
for c in self._callbacks:
c.on_train_end(status)
class LogPrinter(Callback):
def __init__(self, model):
super(LogPrinter, self).__init__(model)
def on_step_end(self, status):
if dist.get_world_size() < 2 or dist.get_rank() in self.log_ranks:
mode = status['mode']
if mode == 'train':
epoch_id = status['epoch_id']
step_id = status['step_id']
steps_per_epoch = status['steps_per_epoch']
training_staus = status['training_staus']
batch_time = status['batch_time']
data_time = status['data_time']
epoches = self.model.cfg.epoch
batch_size = self.model.cfg['{}Reader'.format(mode.capitalize(
))]['batch_size']
logs = training_staus.log()
space_fmt = ':' + str(len(str(steps_per_epoch))) + 'd'
if step_id % self.model.cfg.log_iter == 0:
eta_steps = (epoches - epoch_id) * steps_per_epoch - step_id
eta_sec = eta_steps * batch_time.global_avg
eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
ips = float(batch_size) / batch_time.avg
max_mem_reserved_str = ""
max_mem_allocated_str = ""
print_mem_info = self.model.cfg.get("print_mem_info", True)
if paddle.device.is_compiled_with_cuda() and print_mem_info:
max_mem_reserved_str = f", max_mem_reserved: {paddle.device.cuda.max_memory_reserved() // (1024 ** 2)} MB"
max_mem_allocated_str = f", max_mem_allocated: {paddle.device.cuda.max_memory_allocated() // (1024 ** 2)} MB"
fmt = ' '.join([
'Epoch: [{}]',
'[{' + space_fmt + '}/{}]',
'learning_rate: {lr:.6f}',
'{meters}',
'eta: {eta}',
'batch_cost: {btime}',
'data_cost: {dtime}',
'ips: {ips:.4f} images/s'
'{max_mem_reserved_str}'
'{max_mem_allocated_str}'
])
fmt = fmt.format(
epoch_id,
step_id,
steps_per_epoch,
lr=status['learning_rate'],
meters=logs,
eta=eta_str,
btime=str(batch_time),
dtime=str(data_time),
ips=ips,
max_mem_reserved_str=max_mem_reserved_str,
max_mem_allocated_str=max_mem_allocated_str)
self.logger.info(fmt)
if mode == 'eval':
step_id = status['step_id']
if step_id % 100 == 0:
self.logger.info("Eval iter: {}".format(step_id))
def on_epoch_end(self, status):
if dist.get_world_size() < 2 or dist.get_rank() == 0:
mode = status['mode']
if mode == 'eval':
sample_num = status['sample_num']
cost_time = status['cost_time']
self.logger.info('Total sample number: {}, average FPS: {}'.format(
sample_num, sample_num / cost_time))
class Checkpointer(Callback):
def __init__(self, model):
super(Checkpointer, self).__init__(model)
self.best_ap = -1000.
self.save_dir = self.model.cfg.save_dir
self.uniform_output_enabled = self.model.cfg.get("uniform_output_enabled", False)
if hasattr(self.model.model, 'student_model'):
self.weight = self.model.model.student_model
else:
self.weight = self.model.model
def on_epoch_end(self, status):
# Checkpointer only performed during training
mode = status['mode']
epoch_id = status['epoch_id']
weight = None
save_name = None
if dist.get_world_size() < 2 or dist.get_rank() == 0:
end_epoch = self.model.cfg.epoch
save_name = str(epoch_id) if epoch_id != end_epoch - 1 else "model_final"
if mode == 'train':
end_epoch = self.model.cfg.epoch
if (
epoch_id + 1
) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1:
save_name = str(
epoch_id) if epoch_id != end_epoch - 1 else "model_final"
weight = self.weight.state_dict()
elif mode == 'eval':
for metric in self.model._metrics:
map_res = metric.get_results()
eval_func = "ap"
if 'pose3d' in map_res:
key = 'pose3d'
eval_func = "mpjpe"
elif 'bbox' in map_res:
key = 'bbox'
elif 'keypoint' in map_res:
key = 'keypoint'
else:
key = 'mask'
key = self.model.cfg.get('target_metrics', key)
if key not in map_res:
logger.warning("Evaluation results empty, this may be due to " \
"training iterations being too few or not " \
"loading the correct weights.")
return
epoch_ap = map_res[key][0]
epoch_metric = {
'metric': abs(epoch_ap),
'epoch': epoch_id + 1
}
save_path = os.path.join(os.path.join(self.save_dir, save_name) if self.uniform_output_enabled else self.save_dir, f"{save_name}.pdstates")
paddle.save(epoch_metric, save_path)
if self.uniform_output_enabled:
save_model_info(epoch_metric, self.save_dir, save_name)
update_train_results(self.model.cfg, save_name, epoch_metric, done_flag=epoch_id + 1 == self.model.cfg.epoch, ema=self.model.use_ema)
if 'save_best_model' in status and status['save_best_model']:
if epoch_ap >= self.best_ap:
self.best_ap = epoch_ap
save_name = 'best_model'
weight = self.weight.state_dict()
best_metric = {
'metric': abs(self.best_ap),
'epoch': epoch_id + 1
}
save_path = os.path.join(os.path.join(self.save_dir, save_name) if self.uniform_output_enabled else self.save_dir, "best_model.pdstates")
paddle.save(best_metric, save_path)
if self.uniform_output_enabled:
save_model_info(best_metric, self.save_dir, save_name)
update_train_results(self.model.cfg, save_name, best_metric, done_flag=epoch_id + 1 == self.model.cfg.epoch, ema=self.model.use_ema)
logger.info("Best test {} {} is {:0.3f}.".format(
key, eval_func, abs(self.best_ap)))
if weight:
if self.model.use_ema:
exchange_save_model = status.get('exchange_save_model',
False)
if not exchange_save_model:
# save model and ema_model
save_model(
status['weight'],
self.model.optimizer,
os.path.join(self.save_dir, save_name) if self.uniform_output_enabled else self.save_dir,
save_name,
epoch_id + 1,
ema_model=weight)
if self.uniform_output_enabled:
self.model.export(output_dir=os.path.join(self.save_dir, save_name, "inference"), for_fd=True)
gc.collect()
else:
                        # save model (student model) and ema_model (teacher model);
                        # in DenseTeacher SSOD the teacher model usually scores higher,
                        # so the two are exchanged when saving pdparams
student_model = status['weight'] # model
teacher_model = weight # ema_model
save_model(
teacher_model,
self.model.optimizer,
self.save_dir,
save_name,
epoch_id + 1,
ema_model=student_model)
del teacher_model
del student_model
else:
save_model(weight, self.model.optimizer, os.path.join(self.save_dir, save_name) if self.uniform_output_enabled else self.save_dir,
save_name, epoch_id + 1)
if self.uniform_output_enabled:
self.model.export(output_dir=os.path.join(self.save_dir, save_name, "inference"), for_fd=True)
gc.collect()
class WiferFaceEval(Callback):
def __init__(self, model):
super(WiferFaceEval, self).__init__(model)
def on_epoch_begin(self, status):
assert self.model.mode == 'eval', \
"WiferFaceEval can only be set during evaluation"
for metric in self.model._metrics:
metric.update(self.model.model)
sys.exit()
class VisualDLWriter(Callback):
"""
Use VisualDL to log data or image
"""
def __init__(self, model):
super(VisualDLWriter, self).__init__(model)
assert six.PY3, "VisualDL requires Python >= 3.5"
try:
from visualdl import LogWriter
except Exception as e:
            logger.error('visualdl not found, please install visualdl. '
                         'For example: `pip install visualdl`.')
raise e
self.vdl_writer = LogWriter(
model.cfg.get('vdl_log_dir', 'vdl_log_dir/scalar'))
self.vdl_loss_step = 0
self.vdl_mAP_step = 0
self.vdl_image_step = 0
self.vdl_image_frame = 0
def on_step_end(self, status):
mode = status['mode']
if dist.get_world_size() < 2 or dist.get_rank() == 0:
if mode == 'train':
training_staus = status['training_staus']
for loss_name, loss_value in training_staus.get().items():
self.vdl_writer.add_scalar(loss_name, loss_value,
self.vdl_loss_step)
self.vdl_loss_step += 1
elif mode == 'test':
ori_image = status['original_image']
result_image = status['result_image']
self.vdl_writer.add_image(
"original/frame_{}".format(self.vdl_image_frame), ori_image,
self.vdl_image_step)
self.vdl_writer.add_image(
"result/frame_{}".format(self.vdl_image_frame),
result_image, self.vdl_image_step)
self.vdl_image_step += 1
# each frame can display ten pictures at most.
if self.vdl_image_step % 10 == 0:
self.vdl_image_step = 0
self.vdl_image_frame += 1
def on_epoch_end(self, status):
mode = status['mode']
if dist.get_world_size() < 2 or dist.get_rank() == 0:
if mode == 'eval':
for metric in self.model._metrics:
for key, map_value in metric.get_results().items():
self.vdl_writer.add_scalar("{}-mAP".format(key),
map_value[0],
self.vdl_mAP_step)
self.vdl_mAP_step += 1
class WandbCallback(Callback):
def __init__(self, model):
super(WandbCallback, self).__init__(model)
try:
import wandb
self.wandb = wandb
except Exception as e:
logger.error('wandb not found, please install wandb. '
'Use: `pip install wandb`.')
raise e
self.wandb_params = model.cfg.get('wandb', None)
self.save_dir = self.model.cfg.save_dir
if self.wandb_params is None:
self.wandb_params = {}
for k, v in model.cfg.items():
if k.startswith("wandb_"):
                # str.lstrip strips a character set, not a prefix; slice instead
                self.wandb_params.update({k[len("wandb_"):]: v})
self._run = None
if dist.get_world_size() < 2 or dist.get_rank() == 0:
_ = self.run
self.run.config.update(self.model.cfg)
self.run.define_metric("epoch")
self.run.define_metric("eval/*", step_metric="epoch")
self.best_ap = -1000.
self.fps = []
@property
def run(self):
if self._run is None:
if self.wandb.run is not None:
                logger.info(
                    "There is an ongoing wandb run which will be used "
                    "for logging. Please use `wandb.finish()` to end that "
                    "if the behaviour is not intended")
self._run = self.wandb.run
else:
self._run = self.wandb.init(**self.wandb_params)
return self._run
def save_model(self,
optimizer,
save_dir,
save_name,
last_epoch,
ema_model=None,
ap=None,
fps=None,
tags=None):
if dist.get_world_size() < 2 or dist.get_rank() == 0:
model_path = os.path.join(save_dir, save_name)
metadata = {}
metadata["last_epoch"] = last_epoch
if ap:
metadata["ap"] = ap
if fps:
metadata["fps"] = fps
if ema_model is None:
ema_artifact = self.wandb.Artifact(
name="ema_model-{}".format(self.run.id),
type="model",
metadata=metadata)
model_artifact = self.wandb.Artifact(
name="model-{}".format(self.run.id),
type="model",
metadata=metadata)
ema_artifact.add_file(model_path + ".pdema", name="model_ema")
model_artifact.add_file(model_path + ".pdparams", name="model")
self.run.log_artifact(ema_artifact, aliases=tags)
                self.run.log_artifact(model_artifact, aliases=tags)
else:
model_artifact = self.wandb.Artifact(
name="model-{}".format(self.run.id),
type="model",
metadata=metadata)
model_artifact.add_file(model_path + ".pdparams", name="model")
self.run.log_artifact(model_artifact, aliases=tags)
def on_step_end(self, status):
mode = status['mode']
if dist.get_world_size() < 2 or dist.get_rank() == 0:
if mode == 'train':
training_status = status['training_staus'].get()
for k, v in training_status.items():
training_status[k] = float(v)
# calculate ips, data_cost, batch_cost
batch_time = status['batch_time']
data_time = status['data_time']
batch_size = self.model.cfg['{}Reader'.format(mode.capitalize(
))]['batch_size']
ips = float(batch_size) / float(batch_time.avg)
data_cost = float(data_time.avg)
batch_cost = float(batch_time.avg)
metrics = {"train/" + k: v for k, v in training_status.items()}
metrics["train/ips"] = ips
metrics["train/data_cost"] = data_cost
metrics["train/batch_cost"] = batch_cost
self.fps.append(ips)
self.run.log(metrics)
def on_epoch_end(self, status):
mode = status['mode']
epoch_id = status['epoch_id']
save_name = None
if dist.get_world_size() < 2 or dist.get_rank() == 0:
if mode == 'train':
fps = sum(self.fps) / len(self.fps)
self.fps = []
end_epoch = self.model.cfg.epoch
                if (epoch_id + 1) % self.model.cfg.snapshot_epoch == 0 \
                        or epoch_id == end_epoch - 1:
save_name = str(
epoch_id) if epoch_id != end_epoch - 1 else "model_final"
tags = ["latest", "epoch_{}".format(epoch_id)]
self.save_model(
self.model.optimizer,
self.save_dir,
save_name,
epoch_id + 1,
self.model.use_ema,
fps=fps,
tags=tags)
if mode == 'eval':
sample_num = status['sample_num']
cost_time = status['cost_time']
fps = sample_num / cost_time
merged_dict = {}
for metric in self.model._metrics:
for key, map_value in metric.get_results().items():
merged_dict["eval/{}-mAP".format(key)] = map_value[0]
merged_dict["epoch"] = status["epoch_id"]
merged_dict["eval/fps"] = sample_num / cost_time
self.run.log(merged_dict)
if 'save_best_model' in status and status['save_best_model']:
for metric in self.model._metrics:
map_res = metric.get_results()
if 'pose3d' in map_res:
key = 'pose3d'
elif 'bbox' in map_res:
key = 'bbox'
elif 'keypoint' in map_res:
key = 'keypoint'
else:
key = 'mask'
if key not in map_res:
logger.warning("Evaluation results empty, this may be due to " \
"training iterations being too few or not " \
"loading the correct weights.")
return
if map_res[key][0] >= self.best_ap:
self.best_ap = map_res[key][0]
save_name = 'best_model'
tags = ["best", "epoch_{}".format(epoch_id)]
self.save_model(
self.model.optimizer,
self.save_dir,
save_name,
last_epoch=epoch_id + 1,
ema_model=self.model.use_ema,
ap=abs(self.best_ap),
fps=fps,
tags=tags)
def on_train_end(self, status):
self.run.finish()
class SniperProposalsGenerator(Callback):
def __init__(self, model):
super(SniperProposalsGenerator, self).__init__(model)
ori_dataset = self.model.dataset
self.dataset = self._create_new_dataset(ori_dataset)
self.loader = self.model.loader
self.cfg = self.model.cfg
self.infer_model = self.model.model
def _create_new_dataset(self, ori_dataset):
dataset = copy.deepcopy(ori_dataset)
# init anno_cropper
dataset.init_anno_cropper()
# generate infer roidbs
ori_roidbs = dataset.get_ori_roidbs()
roidbs = dataset.anno_cropper.crop_infer_anno_records(ori_roidbs)
# set new roidbs
dataset.set_roidbs(roidbs)
return dataset
def _eval_with_loader(self, loader):
results = []
with paddle.no_grad():
self.infer_model.eval()
for step_id, data in enumerate(loader):
outs = self.infer_model(data)
for key in ['im_shape', 'scale_factor', 'im_id']:
outs[key] = data[key]
for key, value in outs.items():
if hasattr(value, 'numpy'):
outs[key] = value.numpy()
results.append(outs)
return results
def on_train_end(self, status):
self.loader.dataset = self.dataset
results = self._eval_with_loader(self.loader)
results = self.dataset.anno_cropper.aggregate_chips_detections(results)
# sniper
proposals = []
clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()}
for outs in results:
batch_res = get_infer_results(outs, clsid2catid)
start = 0
for i, im_id in enumerate(outs['im_id']):
bbox_num = outs['bbox_num']
end = start + bbox_num[i]
bbox_res = batch_res['bbox'][start:end] \
if 'bbox' in batch_res else None
if bbox_res:
proposals += bbox_res
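        # each proposal emitted by get_infer_results is a COCO-style dict,
        # e.g. (illustrative): {"image_id": 1, "category_id": 1,
        #                       "bbox": [x, y, w, h], "score": 0.9}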
logger.info("save proposals in {}".format(self.cfg.proposals_path))
with open(self.cfg.proposals_path, 'w') as f:
json.dump(proposals, f)
class SemiLogPrinter(LogPrinter):
def __init__(self, model):
super(SemiLogPrinter, self).__init__(model)
def on_step_end(self, status):
if dist.get_world_size() < 2 or dist.get_rank() == 0:
mode = status['mode']
if mode == 'train':
epoch_id = status['epoch_id']
step_id = status['step_id']
iter_id = status['iter_id']
steps_per_epoch = status['steps_per_epoch']
training_staus = status['training_staus']
batch_time = status['batch_time']
data_time = status['data_time']
                epochs = self.model.cfg.epoch
                batch_size = self.model.cfg['{}Reader'.format(mode.capitalize(
                ))]['batch_size']
                iters = epochs * steps_per_epoch
                logs = training_staus.log()
                iter_space_fmt = ':' + str(len(str(iters))) + 'd'
                # width of the per-epoch step field should track steps_per_epoch
                space_fmt = ':' + str(len(str(steps_per_epoch))) + 'd'
                if step_id % self.model.cfg.log_iter == 0:
                    eta_steps = (epochs - epoch_id) * steps_per_epoch - step_id
                    eta_sec = eta_steps * batch_time.global_avg
eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
ips = float(batch_size) / batch_time.avg
fmt = ' '.join([
'{' + iter_space_fmt + '}/{} iters',
'Epoch: [{}]',
'[{' + space_fmt + '}/{}]',
'learning_rate: {lr:.6f}',
'{meters}',
'eta: {eta}',
'batch_cost: {btime}',
'data_cost: {dtime}',
'ips: {ips:.4f} images/s',
])
fmt = fmt.format(
iter_id,
iters,
epoch_id,
step_id,
steps_per_epoch,
lr=status['learning_rate'],
meters=logs,
eta=eta_str,
btime=str(batch_time),
dtime=str(data_time),
ips=ips)
logger.info(fmt)
if mode == 'eval':
step_id = status['step_id']
if step_id % 100 == 0:
logger.info("Eval iter: {}".format(step_id))
class SemiCheckpointer(Checkpointer):
def __init__(self, model):
super(SemiCheckpointer, self).__init__(model)
cfg = self.model.cfg
self.best_ap = 0.
self.save_dir = os.path.join(self.model.cfg.save_dir,
self.model.cfg.filename)
if hasattr(self.model.model, 'student') and hasattr(self.model.model,
'teacher'):
self.weight = (self.model.model.teacher, self.model.model.student)
        elif hasattr(self.model.model, 'student') or hasattr(self.model.model,
                                                             'teacher'):
            raise AttributeError(
                "model must have both 'student' and 'teacher' attributes")
        else:
            raise AttributeError(
                "model has neither 'student' nor 'teacher' attribute")
def every_n_iters(self, iter_id, n):
return (iter_id + 1) % n == 0 if n > 0 else False
def on_step_end(self, status):
# Checkpointer only performed during training
mode = status['mode']
eval_interval = status['eval_interval']
save_interval = status['save_interval']
iter_id = status['iter_id']
epoch_id = status['epoch_id']
t_weight = None
s_weight = None
save_name = None
if dist.get_world_size() < 2 or dist.get_rank() == 0:
if self.every_n_iters(iter_id, save_interval) and mode == 'train':
save_name = "last_epoch"
# save_name = str(iter_id + 1)
t_weight = self.weight[0].state_dict()
s_weight = self.weight[1].state_dict()
save_semi_model(t_weight, s_weight, self.model.optimizer,
self.save_dir, save_name, epoch_id + 1,
iter_id + 1)
def on_epoch_end(self, status):
# Checkpointer only performed during training
mode = status['mode']
eval_interval = status['eval_interval']
save_interval = status['save_interval']
iter_id = status['iter_id']
epoch_id = status['epoch_id']
t_weight = None
s_weight = None
save_name = None
if dist.get_world_size() < 2 or dist.get_rank() == 0:
if self.every_n_iters(iter_id, eval_interval) and mode == 'eval':
if 'save_best_model' in status and status['save_best_model']:
for metric in self.model._metrics:
map_res = metric.get_results()
if 'bbox' in map_res:
key = 'bbox'
elif 'keypoint' in map_res:
key = 'keypoint'
else:
key = 'mask'
if key not in map_res:
logger.warning("Evaluation results empty, this may be due to " \
"training iterations being too few or not " \
"loading the correct weights.")
return
if map_res[key][0] > self.best_ap:
self.best_ap = map_res[key][0]
save_name = 'best_model'
t_weight = self.weight[0].state_dict()
s_weight = self.weight[1].state_dict()
logger.info("Best teacher test {} ap is {:0.3f}.".
format(key, self.best_ap))
if t_weight and s_weight:
save_semi_model(t_weight, s_weight,
self.model.optimizer, self.save_dir,
save_name, epoch_id + 1, iter_id + 1)
================================================
FILE: ppdet/engine/env.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import random
import numpy as np
import paddle
from paddle.distributed import fleet
__all__ = ['init_parallel_env', 'set_random_seed', 'init_fleet_env']
def init_fleet_env(find_unused_parameters=False):
strategy = fleet.DistributedStrategy()
strategy.find_unused_parameters = find_unused_parameters
fleet.init(is_collective=True, strategy=strategy)
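# init_fleet_env is the fleet-mode counterpart of init_parallel_env below;
# find_unused_parameters must be True for models whose forward pass skips
# some parameters (e.g. frozen branches), or gradient synchronization can stall.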
def init_parallel_env():
env = os.environ
dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
if dist:
trainer_id = int(env['PADDLE_TRAINER_ID'])
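        # offset the seed by trainer rank so each worker draws different
        # augmentation randomness while staying reproducible across restarts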
local_seed = (99 + trainer_id)
random.seed(local_seed)
np.random.seed(local_seed)
paddle.distributed.init_parallel_env()
def set_random_seed(seed):
paddle.seed(seed)
random.seed(seed)
np.random.seed(seed)
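# Usage sketch: call once per process before building model/dataloaders, e.g.
#   set_random_seed(42)  # seeds paddle, random and numpy together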
================================================
FILE: ppdet/engine/export_utils.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import yaml
from collections import OrderedDict
import paddle
from ppdet.data.source.category import get_categories
from ppdet.core.workspace import load_config
from ppdet.utils.logger import setup_logger
logger = setup_logger('ppdet.engine')
# Minimum TensorRT subgraph size per architecture: when exporting for TRT,
# a subgraph is offloaded to TensorRT only if it contains at least this many ops.
TRT_MIN_SUBGRAPH = {
'YOLO': 3,
'PPYOLOE': 3,
'SSD': 60,
'RCNN': 40,
'RetinaNet': 40,
'S2ANet': 80,
'EfficientDet': 40,
'Face': 3,
'TTFNet': 60,
'FCOS': 16,
'SOLOv2': 60,
'HigherHRNet': 3,
'HRNet': 3,
'DeepSORT': 3,
'ByteTrack': 10,
'CenterTrack': 5,
'JDE': 10,
'FairMOT': 5,
'GFL': 16,
'PicoDet': 3,
'CenterNet': 5,
'TOOD': 5,
'YOLOX': 8,
'YOLOF': 40,
'METRO_Body': 3,
'DETR': 3,
'CLRNet': 3
}
KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet']
MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']
LANE_ARCH = ['CLRNet']
TO_STATIC_SPEC = {
'yolov3_darknet53_270e_coco': [{
'im_id': paddle.static.InputSpec(
name='im_id', shape=[-1, 1], dtype='float32'),
'is_crowd': paddle.static.InputSpec(
name='is_crowd', shape=[-1, 50], dtype='float32'),
'gt_bbox': paddle.static.InputSpec(
name='gt_bbox', shape=[-1, 50, 4], dtype='float32'),
'curr_iter': paddle.static.InputSpec(
name='curr_iter', shape=[-1], dtype='float32'),
'curr_epoch': paddle.static.InputSpec(
name='curr_epoch', shape=[-1], dtype='int64'),
'image': paddle.static.InputSpec(
name='image', shape=[-1, 3, -1, -1], dtype='float32'),
'im_shape': paddle.static.InputSpec(
name='im_shape', shape=[-1, 2], dtype='float32'),
'scale_factor': paddle.static.InputSpec(
name='scale_factor', shape=[-1, 2], dtype='float32'),
'target0': paddle.static.InputSpec(
name='target0', shape=[-1, 3, 86, -1, -1], dtype='float32'),
'target1': paddle.static.InputSpec(
name='target1', shape=[-1, 3, 86, -1, -1], dtype='float32'),
'target2': paddle.static.InputSpec(
name='target2', shape=[-1, 3, 86, -1, -1], dtype='float32'),
}],
'tinypose_128x96': [{
'center': paddle.static.InputSpec(
name='center', shape=[-1, 2], dtype='float32'),
'scale': paddle.static.InputSpec(
name='scale', shape=[-1, 2], dtype='float32'),
'im_id': paddle.static.InputSpec(
name='im_id', shape=[-1, 1], dtype='float32'),
'image': paddle.static.InputSpec(
name='image', shape=[-1, 3, 128, 96], dtype='float32'),
'score': paddle.static.InputSpec(
name='score', shape=[-1], dtype='float32'),
'rotate': paddle.static.InputSpec(
name='rotate', shape=[-1], dtype='float32'),
'target': paddle.static.InputSpec(
name='target', shape=[-1, 17, 32, 24], dtype='float32'),
'target_weight': paddle.static.InputSpec(
name='target_weight', shape=[-1, 17, 1], dtype='float32'),
}],
'fcos_r50_fpn_1x_coco': [{
'im_id': paddle.static.InputSpec(
name='im_id', shape=[-1, 1], dtype='float32'),
'curr_iter': paddle.static.InputSpec(
name='curr_iter', shape=[-1], dtype='float32'),
'curr_epoch': paddle.static.InputSpec(
name='curr_epoch', shape=[-1], dtype='int64'),
'image': paddle.static.InputSpec(
name='image', shape=[-1, 3, -1, -1], dtype='float32'),
'im_shape': paddle.static.InputSpec(
name='im_shape', shape=[-1, 2], dtype='float32'),
'scale_factor': paddle.static.InputSpec(
name='scale_factor', shape=[-1, 2], dtype='float32'),
'reg_target0': paddle.static.InputSpec(
name='reg_target0', shape=[-1, 160, 160, 4], dtype='float32'),
'labels0': paddle.static.InputSpec(
name='labels0', shape=[-1, 160, 160, 1], dtype='int32'),
'centerness0': paddle.static.InputSpec(
name='centerness0', shape=[-1, 160, 160, 1], dtype='float32'),
'reg_target1': paddle.static.InputSpec(
name='reg_target1', shape=[-1, 80, 80, 4], dtype='float32'),
'labels1': paddle.static.InputSpec(
name='labels1', shape=[-1, 80, 80, 1], dtype='int32'),
'centerness1': paddle.static.InputSpec(
name='centerness1', shape=[-1, 80, 80, 1], dtype='float32'),
'reg_target2': paddle.static.InputSpec(
name='reg_target2', shape=[-1, 40, 40, 4], dtype='float32'),
'labels2': paddle.static.InputSpec(
name='labels2', shape=[-1, 40, 40, 1], dtype='int32'),
'centerness2': paddle.static.InputSpec(
name='centerness2', shape=[-1, 40, 40, 1], dtype='float32'),
'reg_target3': paddle.static.InputSpec(
name='reg_target3', shape=[-1, 20, 20, 4], dtype='float32'),
'labels3': paddle.static.InputSpec(
name='labels3', shape=[-1, 20, 20, 1], dtype='int32'),
'centerness3': paddle.static.InputSpec(
name='centerness3', shape=[-1, 20, 20, 1], dtype='float32'),
'reg_target4': paddle.static.InputSpec(
name='reg_target4', shape=[-1, 10, 10, 4], dtype='float32'),
'labels4': paddle.static.InputSpec(
name='labels4', shape=[-1, 10, 10, 1], dtype='int32'),
'centerness4': paddle.static.InputSpec(
name='centerness4', shape=[-1, 10, 10, 1], dtype='float32'),
}],
'picodet_s_320_coco_lcnet': [{
'im_id': paddle.static.InputSpec(
name='im_id', shape=[-1, 1], dtype='float32'),
'is_crowd': paddle.static.InputSpec(
name='is_crowd', shape=[-1, -1, 1], dtype='float32'),
'gt_class': paddle.static.InputSpec(
name='gt_class', shape=[-1, -1, 1], dtype='int32'),
'gt_bbox': paddle.static.InputSpec(
name='gt_bbox', shape=[-1, -1, 4], dtype='float32'),
'curr_iter': paddle.static.InputSpec(
name='curr_iter', shape=[-1], dtype='float32'),
'curr_epoch': paddle.static.InputSpec(
name='curr_epoch', shape=[-1], dtype='int64'),
'image': paddle.static.InputSpec(
name='image', shape=[-1, 3, -1, -1], dtype='float32'),
'im_shape': paddle.static.InputSpec(
name='im_shape', shape=[-1, 2], dtype='float32'),
'scale_factor': paddle.static.InputSpec(
name='scale_factor', shape=[-1, 2], dtype='float32'),
'pad_gt_mask': paddle.static.InputSpec(
name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'),
}],
'ppyoloe_crn_s_300e_coco': [{
'im_id': paddle.static.InputSpec(
name='im_id', shape=[-1, 1], dtype='float32'),
'is_crowd': paddle.static.InputSpec(
name='is_crowd', shape=[-1, -1, 1], dtype='float32'),
'gt_class': paddle.static.InputSpec(
name='gt_class', shape=[-1, -1, 1], dtype='int32'),
'gt_bbox': paddle.static.InputSpec(
name='gt_bbox', shape=[-1, -1, 4], dtype='float32'),
'curr_iter': paddle.static.InputSpec(
name='curr_iter', shape=[-1], dtype='float32'),
'curr_epoch': paddle.static.InputSpec(
name='curr_epoch', shape=[-1], dtype='int64'),
'image': paddle.static.InputSpec(
name='image', shape=[-1, 3, -1, -1], dtype='float32'),
'im_shape': paddle.static.InputSpec(
name='im_shape', shape=[-1, 2], dtype='float32'),
'scale_factor': paddle.static.InputSpec(
name='scale_factor', shape=[-1, 2], dtype='float32'),
'pad_gt_mask': paddle.static.InputSpec(
name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'),
}],
}
def apply_to_static(config, model):
filename = config.get('filename', None)
spec = TO_STATIC_SPEC.get(filename, None)
model = paddle.jit.to_static(model, input_spec=spec)
logger.info("Successfully to apply @to_static with specs: {}".format(spec))
return model
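# A minimal sketch of the lookup above (config names are illustrative): a
# config whose `filename` is not registered in TO_STATIC_SPEC, e.g.
# 'rtdetrv3_r50vd_6x_coco', yields spec=None, and paddle.jit.to_static then
# derives the input shapes by tracing the first forward call instead.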
def _prune_input_spec(input_spec, program, targets):
# try to prune static program to figure out pruned input spec
# so we perform following operations in static mode
device = paddle.get_device()
paddle.enable_static()
paddle.set_device(device)
pruned_input_spec = [{}]
program = program.clone()
program = program._prune(targets=targets)
global_block = program.global_block()
pir_value_set = set()
if paddle.framework.use_pir_api():
for op in global_block.ops:
if op.name() == 'pd_op.data':
                pir_value_set.add(op.attrs()["name"])
for name, spec in input_spec[0].items():
if paddle.framework.use_pir_api():
if name in pir_value_set:
pruned_input_spec[0][name] = spec
else:
try:
v = global_block.var(name)
pruned_input_spec[0][name] = spec
except Exception:
pass
paddle.disable_static(place=device)
return pruned_input_spec
def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape):
preprocess_list = []
label_list = []
if arch != "lane_arch":
anno_file = dataset_cfg.get_anno()
clsid2catid, catid2name = get_categories(metric, anno_file, arch)
label_list = [str(cat) for cat in catid2name.values()]
fuse_normalize = reader_cfg.get('fuse_normalize', False)
sample_transforms = reader_cfg['sample_transforms']
hpi_dynamic_shape = None
for st in sample_transforms[1:]:
for key, value in st.items():
p = {'type': key}
if key == 'Resize':
if int(image_shape[1]) != -1:
value['target_size'] = image_shape[1:]
hpi_dynamic_shape = image_shape[1:]
value['interp'] = value.get('interp', 1) # cv2.INTER_LINEAR
if fuse_normalize and key == 'NormalizeImage':
continue
p.update(value)
preprocess_list.append(p)
batch_transforms = reader_cfg.get('batch_transforms', None)
if batch_transforms:
for bt in batch_transforms:
for key, value in bt.items():
                # for deploy/infer, use PadStride(stride) instead of PadBatch(pad_to_stride)
if key == 'PadBatch':
preprocess_list.append({
'type': 'PadStride',
'stride': value['pad_to_stride']
})
break
elif key == "CULaneResize":
# cut and resize
p = {'type': key}
p.update(value)
p.update({"cut_height": dataset_cfg.cut_height})
preprocess_list.append(p)
break
return preprocess_list, label_list, hpi_dynamic_shape
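# A sketch of the output for a typical detection reader (values illustrative):
#   preprocess_list = [{'type': 'Resize', 'target_size': [640, 640], 'interp': 2},
#                      {'type': 'NormalizeImage', ...}, {'type': 'Permute'}]
#   label_list = ['person', 'bicycle', ...]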
def _parse_tracker(tracker_cfg):
    # shallow-copy the tracker section into a plain dict for the infer config
    return dict(tracker_cfg)
def _dump_infer_config(config, path, image_shape, model):
arch_state = False
from ppdet.core.config.yaml_helpers import setup_orderdict
setup_orderdict()
use_dynamic_shape = True if image_shape[2] == -1 else False
infer_cfg = OrderedDict({
'mode': 'paddle',
'draw_threshold': 0.5,
'metric': config['metric'],
'use_dynamic_shape': use_dynamic_shape
})
if config.get('pdx_model_name', None):
infer_cfg["Global"] = {"model_name": config["pdx_model_name"]}
export_onnx = config.get('export_onnx', False)
export_eb = config.get('export_eb', False)
infer_arch = config['architecture']
if 'RCNN' in infer_arch and export_onnx:
        logger.warning(
            "Exporting RCNN model to ONNX only supports batch_size = 1")
infer_cfg['export_onnx'] = True
infer_cfg['export_eb'] = export_eb
if infer_arch in MOT_ARCH:
if infer_arch == 'DeepSORT':
tracker_cfg = config['DeepSORTTracker']
elif infer_arch == 'CenterTrack':
tracker_cfg = config['CenterTracker']
else:
tracker_cfg = config['JDETracker']
infer_cfg['tracker'] = _parse_tracker(tracker_cfg)
for arch, min_subgraph_size in TRT_MIN_SUBGRAPH.items():
if arch in infer_arch:
infer_cfg['arch'] = arch
infer_cfg['min_subgraph_size'] = min_subgraph_size
arch_state = True
break
if infer_arch == 'PPYOLOEWithAuxHead':
infer_arch = 'PPYOLOE'
if infer_arch in ['PPYOLOE', 'YOLOX', 'YOLOF']:
infer_cfg['arch'] = infer_arch
infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch]
arch_state = True
if infer_arch == 'DETR' and config.get('with_mask', False):
infer_cfg['mask'] = True
if not arch_state:
logger.error(
'Architecture: {} is not supported for exporting model now.\n'.
format(infer_arch) +
'Please set TRT_MIN_SUBGRAPH in ppdet/engine/export_utils.py')
os._exit(0)
if 'mask_head' in config[config['architecture']] and config[config[
'architecture']]['mask_head']:
infer_cfg['mask'] = True
if 'with_mask' in config[config['architecture']] and config[config[
'architecture']]['with_mask']:
infer_cfg['mask'] = True
label_arch = 'detection_arch'
if infer_arch in KEYPOINT_ARCH:
label_arch = 'keypoint_arch'
if infer_arch in LANE_ARCH:
infer_cfg['arch'] = infer_arch
infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch]
infer_cfg['img_w'] = config['img_w']
infer_cfg['ori_img_h'] = config['ori_img_h']
infer_cfg['cut_height'] = config['cut_height']
label_arch = 'lane_arch'
head_name = "CLRHead"
infer_cfg['conf_threshold'] = config[head_name]['conf_threshold']
infer_cfg['nms_thres'] = config[head_name]['nms_thres']
infer_cfg['max_lanes'] = config[head_name]['max_lanes']
infer_cfg['num_points'] = config[head_name]['num_points']
arch_state = True
if infer_arch in MOT_ARCH:
if config['metric'] in ['COCO', 'VOC']:
# MOT model run as Detector
reader_cfg = config['TestReader']
dataset_cfg = config['TestDataset']
else:
# 'metric' in ['MOT', 'MCMOT', 'KITTI']
label_arch = 'mot_arch'
reader_cfg = config['TestMOTReader']
dataset_cfg = config['TestMOTDataset']
else:
reader_cfg = config['TestReader']
dataset_cfg = config['TestDataset']
infer_cfg['Preprocess'], infer_cfg['label_list'], hpi_dynamic_shape = _parse_reader(
reader_cfg, dataset_cfg, config['metric'], label_arch, image_shape[1:])
if config.get("uniform_output_enabled", None):
def get_dynamic_shapes(hpi_shape):
return [[1, 3] + hpi_shape, [1, 3] + hpi_shape, [8, 3] + hpi_shape]
dynamic_shapes = get_dynamic_shapes(hpi_dynamic_shape) if hpi_dynamic_shape else [
[1, 3, 320, 320],
[1, 3, 640, 640],
[8, 3, 1280, 1280]
]
shapes = {
"image": dynamic_shapes,
"im_shape": [[1, 2], [1, 2], [8, 2]],
"scale_factor": [[1, 2], [1, 2], [8, 2]]
}
trt_dynamic_shape = [
[dim for _ in range(shape[0]) for dim in shape[2:]]
for shape in dynamic_shapes
]
trt_dynamic_shape_input_data = {
"im_shape": trt_dynamic_shape,
"scale_factor": [
[2, 2],
[1, 1],
[0.67 for _ in range(2 * shapes["scale_factor"][-1][0])]
]
}
hpi_config = OrderedDict({
"backend_configs": OrderedDict({
"paddle_infer": OrderedDict({
"trt_dynamic_shapes": shapes,
"trt_dynamic_shape_input_data": trt_dynamic_shape_input_data
}),
"tensorrt": OrderedDict({
"dynamic_shapes": shapes
})
})
})
infer_cfg["Hpi"] = hpi_config
if infer_arch == 'PicoDet':
if hasattr(config, 'export') and config['export'].get(
'post_process',
False) and not config['export'].get('benchmark', False):
infer_cfg['arch'] = 'GFL'
head_name = 'PicoHeadV2' if config['PicoHeadV2'] else 'PicoHead'
infer_cfg['NMS'] = config[head_name]['nms']
# In order to speed up the prediction, the threshold of nms
# is adjusted here, which can be changed in infer_cfg.yml
config[head_name]['nms']["score_threshold"] = 0.3
config[head_name]['nms']["nms_threshold"] = 0.5
infer_cfg['fpn_stride'] = config[head_name]['fpn_stride']
yaml.dump(infer_cfg, open(path, 'w'))
logger.info("Export inference config file to {}".format(os.path.join(path)))
================================================
FILE: ppdet/engine/naive_sync_bn.py
================================================
import paddle.distributed as dist
import math
import paddle
import paddle.nn as nn
class _AllReduce(paddle.autograd.PyLayer):
@staticmethod
def forward(ctx, input):
        input_list = [
            paddle.zeros_like(input) for _ in range(dist.get_world_size())
        ]
        # use all_gather rather than all_reduce to avoid mutating `input` in place
        dist.all_gather(input_list, input, sync_op=True)
inputs = paddle.stack(input_list, axis=0)
return paddle.sum(inputs, axis=0)
@staticmethod
def backward(ctx, grad_output):
dist.all_reduce(grad_output, sync_op=True)
return grad_output
def differentiable_all_reduce(input):
"""
Differentiable counterpart of `dist.all_reduce`.
"""
if (
not dist.is_available()
or not dist.is_initialized()
or dist.get_world_size() == 1
):
return input
return _AllReduce.apply(input)
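# Gradient-flow sketch (assumes a multi-GPU job launched with
# `python -m paddle.distributed.launch ...`):
#   x = paddle.ones([1]); x.stop_gradient = False
#   y = differentiable_all_reduce(x)  # forward: x summed across ranks
#   y.backward()                      # backward: grads are all-reduced as well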
class NaiveSyncBatchNorm(nn.BatchNorm2D):
def __init__(self, *args, stats_mode="", **kwargs):
super().__init__(*args, **kwargs)
assert stats_mode in ["", "N"]
self._stats_mode = stats_mode
def forward(self, input):
if dist.get_world_size() == 1 or not self.training:
return super(NaiveSyncBatchNorm, self).forward(input)
B, C = input.shape[0], input.shape[1]
mean = paddle.mean(input, axis=[0, 2, 3])
meansqr = paddle.mean(input * input, axis=[0, 2, 3])
if self._stats_mode == "":
assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.'
vec = paddle.concat([mean, meansqr], axis=0)
vec = differentiable_all_reduce(vec) * (1.0 / dist.get_world_size())
mean, meansqr = paddle.split(vec, [C, C])
            momentum = 1 - self._momentum  # NOTE: paddle has a reversed momentum definition
else:
if B == 0:
vec = paddle.zeros([2 * C + 1], dtype=mean.dtype)
vec = vec + input.sum() # make sure there is gradient w.r.t input
else:
vec = paddle.concat(
[
mean,
meansqr,
paddle.ones([1], dtype=mean.dtype),
],
axis=0,
)
vec = differentiable_all_reduce(vec * B)
total_batch = vec[-1].detach()
momentum = total_batch.clip(max=1) * (1 - self._momentum) # no update if total_batch is 0
mean, meansqr, _ = paddle.split(vec / total_batch.clip(min=1), [C, C, int(vec.shape[0] - 2*C)]) # avoid div-by-zero
var = meansqr - mean * mean
invstd = paddle.rsqrt(var + self._epsilon)
scale = self.weight * invstd
bias = self.bias - mean * scale
scale = scale.reshape([1, -1, 1, 1])
bias = bias.reshape([1, -1, 1, 1])
tmp_mean = self._mean + momentum * (mean.detach() - self._mean)
self._mean.set_value(tmp_mean)
tmp_variance = self._variance + (momentum * (var.detach() - self._variance))
self._variance.set_value(tmp_variance)
ret = input * scale + bias
return ret
def convert_syncbn(model):
for n, m in model.named_children():
if isinstance(m, nn.layer.norm._BatchNormBase):
syncbn = NaiveSyncBatchNorm(m._num_features, m._momentum, m._epsilon, m._weight_attr, m._bias_attr)
setattr(model, n, syncbn)
else:
convert_syncbn(m)
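# Usage sketch: call convert_syncbn(model) once after the model is built and
# before weights are loaded; every BatchNorm sublayer is replaced in place
# with NaiveSyncBatchNorm (see the sync_bn handling in trainer.py).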
================================================
FILE: ppdet/engine/tracker.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import glob
import re
import paddle
import paddle.nn as nn
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from ppdet.core.workspace import create
from ppdet.utils.checkpoint import load_weight, load_pretrain_weight
from ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box
from ppdet.modeling.mot.utils import MOTTimer, load_det_results, write_mot_results, save_vis_results
from ppdet.modeling.mot.tracker import JDETracker, CenterTracker
from ppdet.modeling.mot.tracker import DeepSORTTracker, OCSORTTracker, BOTSORTTracker
from ppdet.modeling.architectures import YOLOX
from ppdet.metrics import Metric, MOTMetric, KITTIMOTMetric, MCMOTMetric
from ppdet.data.source.category import get_categories
import ppdet.utils.stats as stats
from .callbacks import Callback, ComposeCallback
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']
MOT_ARCH_JDE = MOT_ARCH[:2]
MOT_ARCH_SDE = MOT_ARCH[2:4]
MOT_DATA_TYPE = ['mot', 'mcmot', 'kitti']
__all__ = ['Tracker']
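# Usage sketch (assumes a loaded MOT config `cfg` with *MOTDataset/*MOTReader
# sections and trained weights on disk; paths are illustrative):
#   tracker = Tracker(cfg, mode='eval')
#   tracker.load_weights_jde('output/fairmot_dla34/model_final')
#   tracker.mot_evaluate(data_root='dataset/mot/MOT17/images/train',
#                        seqs=['MOT17-02-SDP'], output_dir='output')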
class Tracker(object):
def __init__(self, cfg, mode='eval'):
self.cfg = cfg
assert mode.lower() in ['test', 'eval'], \
"mode should be 'test' or 'eval'"
self.mode = mode.lower()
self.optimizer = None
# build MOT data loader
self.dataset = cfg['{}MOTDataset'.format(self.mode.capitalize())]
# build model
self.model = create(cfg.architecture)
if isinstance(self.model.detector, YOLOX):
for k, m in self.model.named_sublayers():
if isinstance(m, nn.BatchNorm2D):
m._epsilon = 1e-3 # for amp(fp16)
m._momentum = 0.97 # 0.03 in pytorch
anno_file = self.dataset.get_anno()
clsid2catid, catid2name = get_categories(
self.cfg.metric, anno_file=anno_file)
self.ids2names = []
for k, v in catid2name.items():
self.ids2names.append(v)
self.status = {}
self.start_epoch = 0
# initial default callbacks
self._init_callbacks()
# initial default metrics
self._init_metrics()
self._reset_metrics()
def _init_callbacks(self):
self._callbacks = []
self._compose_callback = None
def _init_metrics(self):
if self.mode in ['test']:
self._metrics = []
return
if self.cfg.metric == 'MOT':
self._metrics = [MOTMetric(), ]
elif self.cfg.metric == 'MCMOT':
self._metrics = [MCMOTMetric(self.cfg.num_classes), ]
elif self.cfg.metric == 'KITTI':
self._metrics = [KITTIMOTMetric(), ]
else:
logger.warning("Metric not support for metric type {}".format(
self.cfg.metric))
self._metrics = []
def _reset_metrics(self):
for metric in self._metrics:
metric.reset()
def register_callbacks(self, callbacks):
callbacks = [h for h in list(callbacks) if h is not None]
for c in callbacks:
            assert isinstance(c, Callback), \
                "callbacks should be instances of a subclass of Callback"
self._callbacks.extend(callbacks)
self._compose_callback = ComposeCallback(self._callbacks)
def register_metrics(self, metrics):
metrics = [m for m in list(metrics) if m is not None]
for m in metrics:
            assert isinstance(m, Metric), \
                "metrics should be instances of a subclass of Metric"
self._metrics.extend(metrics)
def load_weights_jde(self, weights):
load_weight(self.model, weights, self.optimizer)
def load_weights_sde(self, det_weights, reid_weights):
with_detector = self.model.detector is not None
with_reid = self.model.reid is not None
if with_detector:
load_weight(self.model.detector, det_weights)
if with_reid:
load_weight(self.model.reid, reid_weights)
else:
load_weight(self.model.reid, reid_weights)
def _eval_seq_centertrack(self,
dataloader,
save_dir=None,
show_image=False,
frame_rate=30,
draw_threshold=0):
assert isinstance(self.model.tracker, CenterTracker)
if save_dir:
if not os.path.exists(save_dir): os.makedirs(save_dir)
tracker = self.model.tracker
timer = MOTTimer()
frame_id = 0
self.status['mode'] = 'track'
self.model.eval()
results = defaultdict(list) # only support single class now
for step_id, data in enumerate(tqdm(dataloader)):
self.status['step_id'] = step_id
if step_id == 0:
self.model.reset_tracking()
# forward
timer.tic()
pred_ret = self.model(data)
online_targets = tracker.update(pred_ret)
online_tlwhs, online_scores, online_ids = [], [], []
for t in online_targets:
bbox = t['bbox']
tlwh = [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]]
tscore = float(t['score'])
tid = int(t['tracking_id'])
if tlwh[2] * tlwh[3] > 0:
online_tlwhs.append(tlwh)
online_ids.append(tid)
online_scores.append(tscore)
timer.toc()
# save results
results[0].append(
(frame_id + 1, online_tlwhs, online_scores, online_ids))
save_vis_results(data, frame_id, online_ids, online_tlwhs,
online_scores, timer.average_time, show_image,
save_dir, self.cfg.num_classes, self.ids2names)
frame_id += 1
return results, frame_id, timer.average_time, timer.calls
def _eval_seq_jde(self,
dataloader,
save_dir=None,
show_image=False,
frame_rate=30,
draw_threshold=0):
if save_dir:
if not os.path.exists(save_dir): os.makedirs(save_dir)
tracker = self.model.tracker
tracker.max_time_lost = int(frame_rate / 30.0 * tracker.track_buffer)
timer = MOTTimer()
frame_id = 0
self.status['mode'] = 'track'
self.model.eval()
results = defaultdict(list) # support single class and multi classes
for step_id, data in enumerate(tqdm(dataloader)):
self.status['step_id'] = step_id
# forward
timer.tic()
pred_dets, pred_embs = self.model(data)
pred_dets, pred_embs = pred_dets.numpy(), pred_embs.numpy()
online_targets_dict = self.model.tracker.update(pred_dets,
pred_embs)
online_tlwhs = defaultdict(list)
online_scores = defaultdict(list)
online_ids = defaultdict(list)
for cls_id in range(self.cfg.num_classes):
online_targets = online_targets_dict[cls_id]
for t in online_targets:
tlwh = t.tlwh
tid = t.track_id
tscore = t.score
if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue
if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[
3] > tracker.vertical_ratio:
continue
online_tlwhs[cls_id].append(tlwh)
online_ids[cls_id].append(tid)
online_scores[cls_id].append(tscore)
# save results
results[cls_id].append(
(frame_id + 1, online_tlwhs[cls_id], online_scores[cls_id],
online_ids[cls_id]))
timer.toc()
save_vis_results(data, frame_id, online_ids, online_tlwhs,
online_scores, timer.average_time, show_image,
save_dir, self.cfg.num_classes, self.ids2names)
frame_id += 1
return results, frame_id, timer.average_time, timer.calls
def _eval_seq_sde(self,
dataloader,
save_dir=None,
show_image=False,
frame_rate=30,
seq_name='',
scaled=False,
det_file='',
draw_threshold=0):
if save_dir:
if not os.path.exists(save_dir): os.makedirs(save_dir)
        use_detector = self.model.detector is not None
        use_reid = getattr(self.model, 'reid', None) is not None
timer = MOTTimer()
results = defaultdict(list)
frame_id = 0
self.status['mode'] = 'track'
self.model.eval()
if use_reid:
self.model.reid.eval()
if not use_detector:
dets_list = load_det_results(det_file, len(dataloader))
logger.info('Finish loading detection results file {}.'.format(
det_file))
tracker = self.model.tracker
for step_id, data in enumerate(tqdm(dataloader)):
self.status['step_id'] = step_id
ori_image = data['ori_image'] # [bs, H, W, 3]
ori_image_shape = data['ori_image'].shape[1:3]
# ori_image_shape: [H, W]
input_shape = data['image'].shape[2:]
# input_shape: [h, w], before data transforms, set in model config
im_shape = data['im_shape'][0].numpy()
# im_shape: [new_h, new_w], after data transforms
scale_factor = data['scale_factor'][0].numpy()
empty_detections = False
# when it has no detected bboxes, will not inference reid model
# and if visualize, use original image instead
# forward
timer.tic()
if not use_detector:
dets = dets_list[frame_id]
bbox_tlwh = np.array(dets['bbox'], dtype='float32')
if bbox_tlwh.shape[0] > 0:
# detector outputs: pred_cls_ids, pred_scores, pred_bboxes
pred_cls_ids = np.array(dets['cls_id'], dtype='float32')
pred_scores = np.array(dets['score'], dtype='float32')
pred_bboxes = np.concatenate(
(bbox_tlwh[:, 0:2],
bbox_tlwh[:, 2:4] + bbox_tlwh[:, 0:2]),
axis=1)
else:
                    logger.warning(
                        'Frame {} has no detected object, try to modify score threshold.'.
                        format(frame_id))
empty_detections = True
else:
outs = self.model.detector(data)
outs['bbox'] = outs['bbox'].numpy()
outs['bbox_num'] = outs['bbox_num'].numpy()
if len(outs['bbox']) > 0 and empty_detections == False:
# detector outputs: pred_cls_ids, pred_scores, pred_bboxes
pred_cls_ids = outs['bbox'][:, 0:1]
pred_scores = outs['bbox'][:, 1:2]
if not scaled:
# Note: scaled=False only in JDE YOLOv3 or other detectors
# with LetterBoxResize and JDEBBoxPostProcess.
#
# 'scaled' means whether the coords after detector outputs
# have been scaled back to the original image, set True
# in general detector, set False in JDE YOLOv3.
pred_bboxes = scale_coords(outs['bbox'][:, 2:],
input_shape, im_shape,
scale_factor)
else:
pred_bboxes = outs['bbox'][:, 2:]
pred_dets_old = np.concatenate(
(pred_cls_ids, pred_scores, pred_bboxes), axis=1)
else:
                    logger.warning(
                        'Frame {} has no detected object, try to modify score threshold.'.
                        format(frame_id))
empty_detections = True
if not empty_detections:
pred_xyxys, keep_idx = clip_box(pred_bboxes, ori_image_shape)
if len(keep_idx[0]) == 0:
                    logger.warning(
                        'Frame {} has no detected object left after clip_box.'.
                        format(frame_id))
empty_detections = True
if empty_detections:
timer.toc()
# if visualize, use original image instead
online_ids, online_tlwhs, online_scores = None, None, None
save_vis_results(data, frame_id, online_ids, online_tlwhs,
online_scores, timer.average_time, show_image,
save_dir, self.cfg.num_classes, self.ids2names)
frame_id += 1
# thus will not inference reid model
continue
pred_cls_ids = pred_cls_ids[keep_idx[0]]
pred_scores = pred_scores[keep_idx[0]]
pred_dets = np.concatenate(
(pred_cls_ids, pred_scores, pred_xyxys), axis=1)
if use_reid:
crops = get_crops(
pred_xyxys,
ori_image,
w=tracker.input_size[0],
h=tracker.input_size[1])
crops = paddle.to_tensor(crops)
data.update({'crops': crops})
pred_embs = self.model(data)['embeddings'].numpy()
else:
pred_embs = None
if isinstance(tracker, DeepSORTTracker):
online_tlwhs, online_scores, online_ids = [], [], []
tracker.predict()
online_targets = tracker.update(pred_dets, pred_embs)
for t in online_targets:
if not t.is_confirmed() or t.time_since_update > 1:
continue
tlwh = t.to_tlwh()
tscore = t.score
tid = t.track_id
if tscore < draw_threshold: continue
if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue
if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[
3] > tracker.vertical_ratio:
continue
online_tlwhs.append(tlwh)
online_scores.append(tscore)
online_ids.append(tid)
timer.toc()
# save results
results[0].append(
(frame_id + 1, online_tlwhs, online_scores, online_ids))
save_vis_results(data, frame_id, online_ids, online_tlwhs,
online_scores, timer.average_time, show_image,
save_dir, self.cfg.num_classes, self.ids2names)
elif isinstance(tracker, JDETracker):
# trick hyperparams only used for MOTChallenge (MOT17, MOT20) Test-set
tracker.track_buffer, tracker.conf_thres = get_trick_hyperparams(
seq_name, tracker.track_buffer, tracker.conf_thres)
online_targets_dict = tracker.update(pred_dets_old, pred_embs)
online_tlwhs = defaultdict(list)
online_scores = defaultdict(list)
online_ids = defaultdict(list)
for cls_id in range(self.cfg.num_classes):
online_targets = online_targets_dict[cls_id]
for t in online_targets:
tlwh = t.tlwh
tid = t.track_id
tscore = t.score
if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue
if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[
3] > tracker.vertical_ratio:
continue
online_tlwhs[cls_id].append(tlwh)
online_ids[cls_id].append(tid)
online_scores[cls_id].append(tscore)
# save results
results[cls_id].append(
(frame_id + 1, online_tlwhs[cls_id],
online_scores[cls_id], online_ids[cls_id]))
timer.toc()
save_vis_results(data, frame_id, online_ids, online_tlwhs,
online_scores, timer.average_time, show_image,
save_dir, self.cfg.num_classes, self.ids2names)
elif isinstance(tracker, OCSORTTracker):
# OC_SORT Tracker
online_targets = tracker.update(pred_dets_old, pred_embs)
online_tlwhs = []
online_ids = []
online_scores = []
for t in online_targets:
tlwh = [t[0], t[1], t[2] - t[0], t[3] - t[1]]
tscore = float(t[4])
tid = int(t[5])
if tlwh[2] * tlwh[3] > 0:
online_tlwhs.append(tlwh)
online_ids.append(tid)
online_scores.append(tscore)
timer.toc()
# save results
results[0].append(
(frame_id + 1, online_tlwhs, online_scores, online_ids))
save_vis_results(data, frame_id, online_ids, online_tlwhs,
online_scores, timer.average_time, show_image,
save_dir, self.cfg.num_classes, self.ids2names)
elif isinstance(tracker, BOTSORTTracker):
# BOTSORT Tracker
online_targets = tracker.update(
pred_dets_old, img=ori_image.numpy())
online_tlwhs = []
online_ids = []
online_scores = []
for t in online_targets:
tlwh = t.tlwh
tid = t.track_id
tscore = t.score
if tlwh[2] * tlwh[3] > 0:
online_tlwhs.append(tlwh)
online_ids.append(tid)
online_scores.append(tscore)
timer.toc()
# save results
results[0].append(
(frame_id + 1, online_tlwhs, online_scores, online_ids))
save_vis_results(data, frame_id, online_ids, online_tlwhs,
online_scores, timer.average_time, show_image,
save_dir, self.cfg.num_classes, self.ids2names)
else:
raise ValueError(tracker)
frame_id += 1
return results, frame_id, timer.average_time, timer.calls
def mot_evaluate(self,
data_root,
seqs,
output_dir,
data_type='mot',
model_type='JDE',
save_images=False,
save_videos=False,
show_image=False,
scaled=False,
det_results_dir=''):
if not os.path.exists(output_dir): os.makedirs(output_dir)
result_root = os.path.join(output_dir, 'mot_results')
if not os.path.exists(result_root): os.makedirs(result_root)
assert data_type in MOT_DATA_TYPE, \
"data_type should be 'mot', 'mcmot' or 'kitti'"
        assert model_type in MOT_ARCH, \
            "model_type should be 'JDE', 'DeepSORT', 'FairMOT', 'ByteTrack' or 'CenterTrack'"
# run tracking
n_frame = 0
timer_avgs, timer_calls = [], []
for seq in seqs:
infer_dir = os.path.join(data_root, seq)
if not os.path.exists(infer_dir) or not os.path.isdir(infer_dir):
logger.warning("Seq {} error, {} has no images.".format(
seq, infer_dir))
continue
if os.path.exists(os.path.join(infer_dir, 'img1')):
infer_dir = os.path.join(infer_dir, 'img1')
frame_rate = 30
seqinfo = os.path.join(data_root, seq, 'seqinfo.ini')
if os.path.exists(seqinfo):
meta_info = open(seqinfo).read()
frame_rate = int(meta_info[meta_info.find('frameRate') + 10:
meta_info.find('\nseqLength')])
save_dir = os.path.join(output_dir, 'mot_outputs',
seq) if save_images or save_videos else None
logger.info('Evaluate seq: {}'.format(seq))
self.dataset.set_images(self.get_infer_images(infer_dir))
dataloader = create('EvalMOTReader')(self.dataset, 0)
result_filename = os.path.join(result_root, '{}.txt'.format(seq))
with paddle.no_grad():
if model_type in MOT_ARCH_JDE:
results, nf, ta, tc = self._eval_seq_jde(
dataloader,
save_dir=save_dir,
show_image=show_image,
frame_rate=frame_rate)
elif model_type in MOT_ARCH_SDE:
results, nf, ta, tc = self._eval_seq_sde(
dataloader,
save_dir=save_dir,
show_image=show_image,
frame_rate=frame_rate,
seq_name=seq,
scaled=scaled,
det_file=os.path.join(det_results_dir,
'{}.txt'.format(seq)))
elif model_type == 'CenterTrack':
results, nf, ta, tc = self._eval_seq_centertrack(
dataloader,
save_dir=save_dir,
show_image=show_image,
frame_rate=frame_rate)
else:
raise ValueError(model_type)
write_mot_results(result_filename, results, data_type,
self.cfg.num_classes)
n_frame += nf
timer_avgs.append(ta)
timer_calls.append(tc)
if save_videos:
output_video_path = os.path.join(save_dir, '..',
'{}_vis.mp4'.format(seq))
cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg {}'.format(
save_dir, output_video_path)
os.system(cmd_str)
logger.info('Save video in {}.'.format(output_video_path))
# update metrics
for metric in self._metrics:
metric.update(data_root, seq, data_type, result_root,
result_filename)
timer_avgs = np.asarray(timer_avgs)
timer_calls = np.asarray(timer_calls)
all_time = np.dot(timer_avgs, timer_calls)
avg_time = all_time / np.sum(timer_calls)
logger.info('Time elapsed: {:.2f} seconds, FPS: {:.2f}'.format(
all_time, 1.0 / avg_time))
# accumulate metric to log out
for metric in self._metrics:
metric.accumulate()
metric.log()
# reset metric states for metric may performed multiple times
self._reset_metrics()
def get_infer_images(self, infer_dir):
        assert os.path.isdir(infer_dir), \
            "infer_dir {} is not a directory".format(infer_dir)
        images = set()
exts = ['jpg', 'jpeg', 'png', 'bmp']
exts += [ext.upper() for ext in exts]
for ext in exts:
images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))
images = list(images)
images.sort()
assert len(images) > 0, "no image found in {}".format(infer_dir)
logger.info("Found {} inference images in total.".format(len(images)))
return images
def mot_predict_seq(self,
video_file,
frame_rate,
image_dir,
output_dir,
data_type='mot',
model_type='JDE',
save_images=False,
save_videos=True,
show_image=False,
scaled=False,
det_results_dir='',
draw_threshold=0.5):
assert video_file is not None or image_dir is not None, \
"--video_file or --image_dir should be set."
assert video_file is None or os.path.isfile(video_file), \
"{} is not a file".format(video_file)
assert image_dir is None or os.path.isdir(image_dir), \
"{} is not a directory".format(image_dir)
if not os.path.exists(output_dir): os.makedirs(output_dir)
result_root = os.path.join(output_dir, 'mot_results')
if not os.path.exists(result_root): os.makedirs(result_root)
assert data_type in MOT_DATA_TYPE, \
"data_type should be 'mot', 'mcmot' or 'kitti'"
        assert model_type in MOT_ARCH, \
            "model_type should be 'JDE', 'DeepSORT', 'FairMOT', 'ByteTrack' or 'CenterTrack'"
# run tracking
if video_file:
seq = video_file.split('/')[-1].split('.')[0]
self.dataset.set_video(video_file, frame_rate)
logger.info('Starting tracking video {}'.format(video_file))
elif image_dir:
seq = image_dir.split('/')[-1].split('.')[0]
if os.path.exists(os.path.join(image_dir, 'img1')):
image_dir = os.path.join(image_dir, 'img1')
images = [
'{}/{}'.format(image_dir, x) for x in os.listdir(image_dir)
]
images.sort()
self.dataset.set_images(images)
logger.info('Starting tracking folder {}, found {} images'.format(
image_dir, len(images)))
else:
raise ValueError('--video_file or --image_dir should be set.')
save_dir = os.path.join(output_dir, 'mot_outputs',
seq) if save_images or save_videos else None
dataloader = create('TestMOTReader')(self.dataset, 0)
result_filename = os.path.join(result_root, '{}.txt'.format(seq))
if frame_rate == -1:
frame_rate = self.dataset.frame_rate
with paddle.no_grad():
if model_type in MOT_ARCH_JDE:
results, nf, ta, tc = self._eval_seq_jde(
dataloader,
save_dir=save_dir,
show_image=show_image,
frame_rate=frame_rate,
draw_threshold=draw_threshold)
elif model_type in MOT_ARCH_SDE:
results, nf, ta, tc = self._eval_seq_sde(
dataloader,
save_dir=save_dir,
show_image=show_image,
frame_rate=frame_rate,
seq_name=seq,
scaled=scaled,
det_file=os.path.join(det_results_dir,
'{}.txt'.format(seq)),
draw_threshold=draw_threshold)
elif model_type == 'CenterTrack':
results, nf, ta, tc = self._eval_seq_centertrack(
dataloader,
save_dir=save_dir,
show_image=show_image,
frame_rate=frame_rate)
else:
raise ValueError(model_type)
if save_videos:
output_video_path = os.path.join(save_dir, '..',
'{}_vis.mp4'.format(seq))
cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg {}'.format(
save_dir, output_video_path)
os.system(cmd_str)
logger.info('Save video in {}'.format(output_video_path))
write_mot_results(result_filename, results, data_type,
self.cfg.num_classes)
def get_trick_hyperparams(video_name, ori_buffer, ori_thresh):
if video_name[:3] != 'MOT':
# only used for MOTChallenge (MOT17, MOT20) Test-set
return ori_buffer, ori_thresh
video_name = video_name[:8]
if 'MOT17-05' in video_name:
track_buffer = 14
elif 'MOT17-13' in video_name:
track_buffer = 25
else:
track_buffer = ori_buffer
if 'MOT17-01' in video_name:
track_thresh = 0.65
elif 'MOT17-06' in video_name:
track_thresh = 0.65
elif 'MOT17-12' in video_name:
track_thresh = 0.7
elif 'MOT17-14' in video_name:
track_thresh = 0.67
else:
track_thresh = ori_thresh
    if 'MOT20-06' in video_name or 'MOT20-08' in video_name:
        track_thresh = 0.3
    return track_buffer, track_thresh
================================================
FILE: ppdet/engine/trainer.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import copy
import time
import yaml
from tqdm import tqdm
import numpy as np
import typing
from PIL import Image, ImageOps, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import paddle
import paddle.nn as nn
import paddle.distributed as dist
from paddle.distributed import fleet
from paddle.static import InputSpec
from ppdet.optimizer import ModelEMA
from ppdet.core.workspace import create
from ppdet.utils.checkpoint import load_weight, load_pretrain_weight, convert_to_dict
from ppdet.utils.visualizer import visualize_results, save_result
from ppdet.metrics import get_infer_results, KeyPointTopDownCOCOEval, KeyPointTopDownCOCOWholeBadyHandEval, KeyPointTopDownMPIIEval, Pose3DEval
from ppdet.metrics import Metric, COCOMetric, LVISMetric, VOCMetric, WiderFaceMetric, RBoxMetric, JDEDetMetric, SNIPERCOCOMetric, CULaneMetric
from ppdet.data.source.sniper_coco import SniperCOCODataSet
from ppdet.data.source.category import get_categories
import ppdet.utils.stats as stats
from ppdet.utils.fuse_utils import fuse_conv_bn
from ppdet.utils import profiler
from ppdet.modeling.post_process import multiclass_nms
from ppdet.modeling.lane_utils import imshow_lanes
from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, WiferFaceEval, VisualDLWriter, SniperProposalsGenerator, WandbCallback, SemiCheckpointer, SemiLogPrinter
from .export_utils import _dump_infer_config, _prune_input_spec, apply_to_static
from .naive_sync_bn import convert_syncbn
from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients
from ppdet.utils.logger import setup_logger
logger = setup_logger('ppdet.engine')
__all__ = ['Trainer']
MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']
class Trainer(object):
def __init__(self, cfg, mode='train'):
self.cfg = cfg.copy()
assert mode.lower() in ['train', 'eval', 'test'], \
"mode should be 'train', 'eval' or 'test'"
self.mode = mode.lower()
self.optimizer = None
self.is_loaded_weights = False
self.use_amp = self.cfg.get('amp', False)
self.amp_level = self.cfg.get('amp_level', 'O1')
self.custom_white_list = self.cfg.get('custom_white_list', None)
self.custom_black_list = self.cfg.get('custom_black_list', None)
self.use_master_grad = self.cfg.get('master_grad', False)
self.uniform_output_enabled = self.cfg.get('uniform_output_enabled', False)
if ('slim' in cfg and cfg['slim_type'] == 'PTQ') or self.uniform_output_enabled:
self.cfg['TestDataset'] = create('TestDataset')()
log_ranks = cfg.get('log_ranks', '0')
if isinstance(log_ranks, str):
self.log_ranks = [int(i) for i in log_ranks.split(',')]
elif isinstance(log_ranks, int):
self.log_ranks = [log_ranks]
train_results_path = os.path.abspath(os.path.join(self.cfg.save_dir, "train_result.json"))
if self.uniform_output_enabled:
if os.path.exists(train_results_path) and self.mode == 'train':
                try:
                    os.remove(train_results_path)
                except OSError:
                    pass
if not os.path.exists(self.cfg.save_dir):
os.mkdir(self.cfg.save_dir)
with open(os.path.join(self.cfg.save_dir, "config.yaml"), "w") as f:
config_dict = convert_to_dict(self.cfg)
config_dict = {k: v for k, v in config_dict.items() if v != {}}
yaml.dump(config_dict, f)
# build data loader
capital_mode = self.mode.capitalize()
if cfg.architecture in MOT_ARCH and self.mode in [
'eval', 'test'
] and cfg.metric not in ['COCO', 'VOC']:
self.dataset = self.cfg['{}MOTDataset'.format(
capital_mode)] = create('{}MOTDataset'.format(capital_mode))()
else:
self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create(
'{}Dataset'.format(capital_mode))()
if cfg.architecture == 'DeepSORT' and self.mode == 'train':
            logger.error('DeepSORT does not need training on a MOT dataset.')
sys.exit(1)
if cfg.architecture == 'FairMOT' and self.mode == 'eval':
images = self.parse_mot_images(cfg)
self.dataset.set_images(images)
if self.mode == 'train':
self.loader = create('{}Reader'.format(capital_mode))(
self.dataset, cfg.worker_num)
if cfg.architecture == 'JDE' and self.mode == 'train':
self.cfg['JDEEmbeddingHead'][
'num_identities'] = self.dataset.num_identities_dict[0]
# JDE only support single class MOT now.
if cfg.architecture == 'FairMOT' and self.mode == 'train':
self.cfg['FairMOTEmbeddingHead'][
'num_identities_dict'] = self.dataset.num_identities_dict
# FairMOT support single class and multi-class MOT now.
# build model
if 'model' not in self.cfg:
self.model = create(cfg.architecture)
else:
self.model = self.cfg.model
self.is_loaded_weights = True
if cfg.architecture == 'YOLOX':
for k, m in self.model.named_sublayers():
if isinstance(m, nn.BatchNorm2D):
m._epsilon = 1e-3 # for amp(fp16)
m._momentum = 0.97 # 0.03 in pytorch
# reset norm param attr for setting them in optimizer
if 'reset_norm_param_attr' in cfg and cfg['reset_norm_param_attr']:
self.model = self.reset_norm_param_attr(
self.model, weight_attr=None, bias_attr=None)
# normalize params for deploy
if 'slim' in cfg and cfg['slim_type'] == 'OFA':
self.model.model.load_meanstd(cfg['TestReader'][
'sample_transforms'])
elif 'slim' in cfg and cfg['slim_type'] == 'Distill':
self.model.student_model.load_meanstd(cfg['TestReader'][
'sample_transforms'])
elif 'slim' in cfg and cfg[
'slim_type'] == 'DistillPrune' and self.mode == 'train':
self.model.student_model.load_meanstd(cfg['TestReader'][
'sample_transforms'])
else:
self.model.load_meanstd(cfg['TestReader']['sample_transforms'])
# EvalDataset build with BatchSampler to evaluate in single device
# TODO: multi-device evaluate
if self.mode == 'eval':
if cfg.architecture == 'FairMOT':
self.loader = create('EvalMOTReader')(self.dataset, 0)
elif cfg.architecture == "METRO_Body":
reader_name = '{}Reader'.format(self.mode.capitalize())
self.loader = create(reader_name)(self.dataset, cfg.worker_num)
else:
self._eval_batch_sampler = paddle.io.BatchSampler(
self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
reader_name = '{}Reader'.format(self.mode.capitalize())
# If metric is VOC, need to be set collate_batch=False.
if cfg.metric == 'VOC':
self.cfg[reader_name]['collate_batch'] = False
self.loader = create(reader_name)(self.dataset, cfg.worker_num,
self._eval_batch_sampler)
# TestDataset build after user set images, skip loader creation here
# get Params
print_params = self.cfg.get('print_params', False)
if print_params:
params = sum([
p.numel() for n, p in self.model.named_parameters()
if all([x not in n for x in ['_mean', '_variance', 'aux_']])
]) # exclude BatchNorm running status
logger.info('Model Params : {} M.'.format((params / 1e6).numpy()[
0]))
# build optimizer in train mode
if self.mode == 'train':
steps_per_epoch = len(self.loader)
if steps_per_epoch < 1:
                logger.warning(
                    "Dataset has fewer samples than batch_size, please set a smaller batch_size in TrainReader."
                )
self.lr = create('LearningRate')(steps_per_epoch)
self.optimizer = create('OptimizerBuilder')(self.lr, self.model)
# Unstructured pruner is only enabled in the train mode.
if self.cfg.get('unstructured_prune'):
self.pruner = create('UnstructuredPruner')(self.model,
steps_per_epoch)
if self.use_amp and self.amp_level == 'O2':
paddle_version = paddle.__version__[:3]
# paddle version >= 2.5.0 or develop
if paddle_version in ["2.5", "0.0"]:
self.model, self.optimizer = paddle.amp.decorate(
models=self.model,
optimizers=self.optimizer,
level=self.amp_level,
master_grad=self.use_master_grad)
else:
self.model, self.optimizer = paddle.amp.decorate(
models=self.model,
optimizers=self.optimizer,
level=self.amp_level)
# support sync_bn for npu/xpu
if paddle.get_device()[:3] == 'npu' or paddle.get_device()[:3] == 'xpu':
use_npu = ('use_npu' in cfg and cfg['use_npu'])
use_xpu = ('use_xpu' in cfg and cfg['use_xpu'])
use_mlu = ('use_mlu' in cfg and cfg['use_mlu'])
norm_type = ('norm_type' in cfg and cfg['norm_type'])
if norm_type == 'sync_bn' and (use_npu or use_xpu or use_mlu) and dist.get_world_size() > 1:
convert_syncbn(self.model)
self.use_ema = ('use_ema' in cfg and cfg['use_ema'])
if self.use_ema:
ema_decay = self.cfg.get('ema_decay', 0.9998)
ema_decay_type = self.cfg.get('ema_decay_type', 'threshold')
cycle_epoch = self.cfg.get('cycle_epoch', -1)
ema_black_list = self.cfg.get('ema_black_list', None)
ema_filter_no_grad = self.cfg.get('ema_filter_no_grad', False)
self.ema = ModelEMA(
self.model,
decay=ema_decay,
ema_decay_type=ema_decay_type,
cycle_epoch=cycle_epoch,
ema_black_list=ema_black_list,
ema_filter_no_grad=ema_filter_no_grad)
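# Config sketch for enabling EMA (assumed YAML keys matching the cfg.get()
# defaults above; values are illustrative):
#   use_ema: true
#   ema_decay: 0.9998
#   ema_decay_type: threshold
#   cycle_epoch: -1
#   ema_filter_no_grad: false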
self._nranks = dist.get_world_size()
self._local_rank = dist.get_rank()
self.status = {}
self.start_epoch = 0
self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch
# initial default callbacks
self._init_callbacks()
# initial default metrics
self._init_metrics()
self._reset_metrics()
def _init_callbacks(self):
if self.mode == 'train':
if self.cfg.get('ssod_method',
False) and self.cfg['ssod_method'] == 'Semi_RTDETR':
self._callbacks = [SemiLogPrinter(self), SemiCheckpointer(self)]
else:
self._callbacks = [LogPrinter(self), Checkpointer(self)]
if self.cfg.get('use_vdl', False):
self._callbacks.append(VisualDLWriter(self))
if self.cfg.get('save_proposals', False):
self._callbacks.append(SniperProposalsGenerator(self))
if self.cfg.get('use_wandb', False) or 'wandb' in self.cfg:
self._callbacks.append(WandbCallback(self))
self._compose_callback = ComposeCallback(self._callbacks)
elif self.mode == 'eval':
self._callbacks = [LogPrinter(self)]
# if self.cfg.metric == 'WiderFace':
# self._callbacks.append(WiferFaceEval(self))
self._compose_callback = ComposeCallback(self._callbacks)
elif self.mode == 'test' and self.cfg.get('use_vdl', False):
self._callbacks = [VisualDLWriter(self)]
self._compose_callback = ComposeCallback(self._callbacks)
else:
self._callbacks = []
self._compose_callback = None
def _init_metrics(self, validate=False):
if self.mode == 'test' or (self.mode == 'train' and not validate):
self._metrics = []
return
classwise = self.cfg['classwise'] if 'classwise' in self.cfg else False
if self.cfg.metric == 'COCO' or self.cfg.metric == "SNIPERCOCO" or self.cfg.metric == 'LVIS':
# TODO: bias should be unified
bias = 1 if self.cfg.get('bias', False) else 0
output_eval = self.cfg['output_eval'] \
if 'output_eval' in self.cfg else None
save_prediction_only = self.cfg.get('save_prediction_only', False)
# pass clsid2catid info to the metric instance to avoid loading the
# annotation file multiple times
clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} \
if self.mode == 'eval' else None
save_threshold = self.cfg.get('save_threshold', 0)
# when validating during training, the annotation file should be taken
# from EvalDataset instead of self.dataset (the TrainDataset)
if self.mode == 'train' and validate:
eval_dataset = self.cfg['EvalDataset']
eval_dataset.check_or_download_dataset()
anno_file = eval_dataset.get_anno()
dataset = eval_dataset
else:
dataset = self.dataset
anno_file = dataset.get_anno()
IouType = self.cfg['IouType'] if 'IouType' in self.cfg else 'bbox'
if self.cfg.metric == "COCO":
self._metrics = [
COCOMetric(
anno_file=anno_file,
clsid2catid=clsid2catid,
classwise=classwise,
output_eval=output_eval,
bias=bias,
IouType=IouType,
save_prediction_only=save_prediction_only,
save_threshold=save_threshold)
]
elif self.cfg.metric == "LVIS":
self._metrics = [
LVISMetric(
anno_file=anno_file,
clsid2catid=clsid2catid,
classwise=classwise,
output_eval=output_eval,
bias=bias,
IouType=IouType,
save_prediction_only=save_prediction_only)
]
elif self.cfg.metric == "SNIPERCOCO": # sniper
self._metrics = [
SNIPERCOCOMetric(
anno_file=anno_file,
dataset=dataset,
clsid2catid=clsid2catid,
classwise=classwise,
output_eval=output_eval,
bias=bias,
IouType=IouType,
save_prediction_only=save_prediction_only)
]
elif self.cfg.metric == 'RBOX':
# TODO: bias should be unified
bias = self.cfg['bias'] if 'bias' in self.cfg else 0
output_eval = self.cfg['output_eval'] \
if 'output_eval' in self.cfg else None
save_prediction_only = self.cfg.get('save_prediction_only', False)
imid2path = self.cfg.get('imid2path', None)
# when validating during training, the annotation file should be taken
# from EvalDataset instead of self.dataset (the TrainDataset)
anno_file = self.dataset.get_anno()
if self.mode == 'train' and validate:
eval_dataset = self.cfg['EvalDataset']
eval_dataset.check_or_download_dataset()
anno_file = eval_dataset.get_anno()
self._metrics = [
RBoxMetric(
anno_file=anno_file,
classwise=classwise,
output_eval=output_eval,
bias=bias,
save_prediction_only=save_prediction_only,
imid2path=imid2path)
]
elif self.cfg.metric == 'VOC':
output_eval = self.cfg['output_eval'] \
if 'output_eval' in self.cfg else None
save_prediction_only = self.cfg.get('save_prediction_only', False)
self._metrics = [
VOCMetric(
label_list=self.dataset.get_label_list(),
class_num=self.cfg.num_classes,
map_type=self.cfg.map_type,
classwise=classwise,
output_eval=output_eval,
save_prediction_only=save_prediction_only)
]
elif self.cfg.metric == 'WiderFace':
self._metrics = [
WiderFaceMetric()
]
elif self.cfg.metric == 'KeyPointTopDownCOCOEval':
eval_dataset = self.cfg['EvalDataset']
eval_dataset.check_or_download_dataset()
anno_file = eval_dataset.get_anno()
save_prediction_only = self.cfg.get('save_prediction_only', False)
self._metrics = [
KeyPointTopDownCOCOEval(
anno_file,
len(eval_dataset),
self.cfg.num_joints,
self.cfg.save_dir,
save_prediction_only=save_prediction_only)
]
elif self.cfg.metric == 'KeyPointTopDownCOCOWholeBadyHandEval':
eval_dataset = self.cfg['EvalDataset']
eval_dataset.check_or_download_dataset()
anno_file = eval_dataset.get_anno()
save_prediction_only = self.cfg.get('save_prediction_only', False)
self._metrics = [
KeyPointTopDownCOCOWholeBadyHandEval(
anno_file,
len(eval_dataset),
self.cfg.num_joints,
self.cfg.save_dir,
save_prediction_only=save_prediction_only)
]
elif self.cfg.metric == 'KeyPointTopDownMPIIEval':
eval_dataset = self.cfg['EvalDataset']
eval_dataset.check_or_download_dataset()
anno_file = eval_dataset.get_anno()
save_prediction_only = self.cfg.get('save_prediction_only', False)
self._metrics = [
KeyPointTopDownMPIIEval(
anno_file,
len(eval_dataset),
self.cfg.num_joints,
self.cfg.save_dir,
save_prediction_only=save_prediction_only)
]
elif self.cfg.metric == 'Pose3DEval':
save_prediction_only = self.cfg.get('save_prediction_only', False)
self._metrics = [
Pose3DEval(
self.cfg.save_dir,
save_prediction_only=save_prediction_only)
]
elif self.cfg.metric == 'MOTDet':
self._metrics = [JDEDetMetric(), ]
elif self.cfg.metric == 'CULaneMetric':
output_eval = self.cfg.get('output_eval', None)
self._metrics = [
CULaneMetric(
cfg=self.cfg,
output_eval=output_eval,
split=self.dataset.split,
dataset_dir=self.cfg.dataset_dir)
]
else:
logger.warning("Metric not support for metric type {}".format(
self.cfg.metric))
self._metrics = []
def _reset_metrics(self):
for metric in self._metrics:
metric.reset()
def register_callbacks(self, callbacks):
callbacks = [c for c in list(callbacks) if c is not None]
for c in callbacks:
assert isinstance(c, Callback), \
"callbacks should be instances of a subclass of Callback"
self._callbacks.extend(callbacks)
self._compose_callback = ComposeCallback(self._callbacks)
def register_metrics(self, metrics):
metrics = [m for m in list(metrics) if m is not None]
for m in metrics:
assert isinstance(m, Metric), \
"metrics should be instances of a subclass of Metric"
self._metrics.extend(metrics)
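# Usage sketch (hypothetical metric instance; any Metric subclass works):
#   trainer.register_metrics([COCOMetric(anno_file='path/to/instances_val.json')])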
def load_weights(self, weights, ARSL_eval=False):
if self.is_loaded_weights:
return
self.start_epoch = 0
load_pretrain_weight(self.model, weights, ARSL_eval)
logger.debug("Load weights {} to start training".format(weights))
def load_weights_sde(self, det_weights, reid_weights):
if self.model.detector:
load_weight(self.model.detector, det_weights)
if self.model.reid:
load_weight(self.model.reid, reid_weights)
else:
load_weight(self.model.reid, reid_weights)
def resume_weights(self, weights):
# support Distill resume weights
if hasattr(self.model, 'student_model'):
self.start_epoch = load_weight(self.model.student_model, weights,
self.optimizer)
else:
self.start_epoch = load_weight(self.model, weights, self.optimizer,
self.ema if self.use_ema else None)
logger.debug("Resume weights of epoch {}".format(self.start_epoch))
def train(self, validate=False):
assert self.mode == 'train', "Model not in 'train' mode"
Init_mark = False
if validate:
self.cfg['EvalDataset'] = self.cfg.EvalDataset = create(
"EvalDataset")()
model = self.model
if self.cfg.get('to_static', False):
model = apply_to_static(self.cfg, model)
sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and
self.cfg.use_gpu and self._nranks > 1)
if sync_bn:
model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)
# enable auto mixed precision mode
if self.use_amp:
scaler = paddle.amp.GradScaler(
enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu,
init_loss_scaling=self.cfg.get('init_loss_scaling', 1024))
# get distributed model
if self.cfg.get('fleet', False):
model = fleet.distributed_model(model)
self.optimizer = fleet.distributed_optimizer(self.optimizer)
elif self._nranks > 1:
find_unused_parameters = self.cfg[
'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
model = paddle.DataParallel(
model, find_unused_parameters=find_unused_parameters)
self.status.update({
'epoch_id': self.start_epoch,
'step_id': 0,
'steps_per_epoch': len(self.loader)
})
self.status['batch_time'] = stats.SmoothedValue(
self.cfg.log_iter, fmt='{avg:.4f}')
self.status['data_time'] = stats.SmoothedValue(
self.cfg.log_iter, fmt='{avg:.4f}')
self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)
if self.cfg.get('print_flops', False):
flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
self.dataset, self.cfg.worker_num)
self._flops(flops_loader)
profiler_options = self.cfg.get('profiler_options', None)
self._compose_callback.on_train_begin(self.status)
use_fused_allreduce_gradients = self.cfg[
'use_fused_allreduce_gradients'] if 'use_fused_allreduce_gradients' in self.cfg else False
for epoch_id in range(self.start_epoch, self.cfg.epoch):
self.status['mode'] = 'train'
self.status['epoch_id'] = epoch_id
self._compose_callback.on_epoch_begin(self.status)
self.loader.dataset.set_epoch(epoch_id)
model.train()
iter_tic = time.time()
for step_id, data in enumerate(self.loader):
def deep_pin(blob, blocking):
if isinstance(blob, paddle.Tensor):
return blob.cuda(blocking=blocking)
elif isinstance(blob, dict):
return {k: deep_pin(v, blocking) for k, v in blob.items()}
elif isinstance(blob, (list, tuple)):
return type(blob)([deep_pin(x, blocking) for x in blob])
else:
return blob
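# deep_pin recursively moves tensors nested in dicts/lists/tuples onto the
# GPU (optionally non-blocking); it is only exercised by the commented-out
# call below.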
# if paddle.base.core.is_compiled_with_cuda():
# data = deep_pin(data, False)
self.status['data_time'].update(time.time() - iter_tic)
self.status['step_id'] = step_id
profiler.add_profiler_step(profiler_options)
self._compose_callback.on_step_begin(self.status)
data['epoch_id'] = epoch_id
if self.cfg.get('to_static',
False) and 'image_file' in data.keys():
data.pop('image_file')
if self.use_amp:
if isinstance(
model, paddle.
DataParallel) and use_fused_allreduce_gradients:
with model.no_sync():
with paddle.amp.auto_cast(
enable=self.cfg.use_gpu or
self.cfg.use_npu or self.cfg.use_mlu,
custom_white_list=self.custom_white_list,
custom_black_list=self.custom_black_list,
level=self.amp_level):
# model forward
outputs = model(data)
loss = outputs['loss']
# model backward
scaled_loss = scaler.scale(loss)
scaled_loss.backward()
fused_allreduce_gradients(
list(model.parameters()), None)
else:
with paddle.amp.auto_cast(
enable=self.cfg.use_gpu or self.cfg.use_npu or
self.cfg.use_mlu,
custom_white_list=self.custom_white_list,
custom_black_list=self.custom_black_list,
level=self.amp_level):
# model forward
outputs = model(data)
loss = outputs['loss']
# model backward
scaled_loss = scaler.scale(loss)
scaled_loss.backward()
# in dygraph mode, optimizer.minimize is equal to optimizer.step
scaler.minimize(self.optimizer, scaled_loss)
else:
if isinstance(
model, paddle.
DataParallel) and use_fused_allreduce_gradients:
with model.no_sync():
# model forward
outputs = model(data)
loss = outputs['loss']
# model backward
loss.backward()
fused_allreduce_gradients(
list(model.parameters()), None)
else:
# model forward
outputs = model(data)
loss = outputs['loss']
# model backward
loss.backward()
self.optimizer.step()
curr_lr = self.optimizer.get_lr()
self.lr.step()
if self.cfg.get('unstructured_prune'):
self.pruner.step()
self.optimizer.clear_grad()
self.status['learning_rate'] = curr_lr
if self._nranks < 2 or self._local_rank in self.log_ranks:
self.status['training_staus'].update(outputs)
self.status['batch_time'].update(time.time() - iter_tic)
self._compose_callback.on_step_end(self.status)
if self.use_ema:
self.ema.update()
iter_tic = time.time()
if self.cfg.get('unstructured_prune'):
self.pruner.update_params()
is_snapshot = (self._nranks < 2 or (self._local_rank == 0 or self.cfg.metric == "Pose3DEval")) \
and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1)
if is_snapshot and self.use_ema:
# apply ema weight on model
weight = copy.deepcopy(self.model.state_dict())
self.model.set_dict(self.ema.apply())
self.status['weight'] = weight
self._compose_callback.on_epoch_end(self.status)
if validate and is_snapshot:
if not hasattr(self, '_eval_loader'):
# build evaluation dataset and loader
self._eval_dataset = self.cfg.EvalDataset
self._eval_batch_sampler = \
paddle.io.BatchSampler(
self._eval_dataset,
batch_size=self.cfg.EvalReader['batch_size'])
# If metric is VOC, collate_batch needs to be set to False.
if self.cfg.metric == 'VOC':
self.cfg['EvalReader']['collate_batch'] = False
if self.cfg.metric == "Pose3DEval":
self._eval_loader = create('EvalReader')(
self._eval_dataset, self.cfg.worker_num)
else:
self._eval_loader = create('EvalReader')(
self._eval_dataset,
self.cfg.worker_num,
batch_sampler=self._eval_batch_sampler)
# if validation during training is enabled, metrics should be re-initialized
# Init_mark ensures this block only executes once
if validate and not Init_mark:
Init_mark = True
self._init_metrics(validate=validate)
self._reset_metrics()
with paddle.no_grad():
self.status['save_best_model'] = True
self._eval_with_loader(self._eval_loader)
if is_snapshot and self.use_ema:
# reset original weight
self.model.set_dict(weight)
self.status.pop('weight')
self._compose_callback.on_train_end(self.status)
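# Typical training entry-point sketch (names assumed, mirroring the tools
# scripts):
#   trainer = Trainer(cfg, mode='train')
#   trainer.load_weights(cfg.pretrain_weights)
#   trainer.train(validate=True)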
def _eval_with_loader(self, loader):
sample_num = 0
tic = time.time()
self._compose_callback.on_epoch_begin(self.status)
self.status['mode'] = 'eval'
self.model.eval()
if self.cfg.get('print_flops', False):
flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
self.dataset, self.cfg.worker_num, self._eval_batch_sampler)
self._flops(flops_loader)
for step_id, data in enumerate(loader):
self.status['step_id'] = step_id
self._compose_callback.on_step_begin(self.status)
# forward
if self.use_amp:
with paddle.amp.auto_cast(
enable=self.cfg.use_gpu or self.cfg.use_npu or
self.cfg.use_mlu,
custom_white_list=self.custom_white_list,
custom_black_list=self.custom_black_list,
level=self.amp_level):
outs = self.model(data)
else:
outs = self.model(data)
# update metrics
for metric in self._metrics:
metric.update(data, outs)
# multi-scale inputs: all inputs have same im_id
if isinstance(data, typing.Sequence):
sample_num += data[0]['im_id'].numpy().shape[0]
else:
sample_num += data['im_id'].numpy().shape[0]
self._compose_callback.on_step_end(self.status)
self.status['sample_num'] = sample_num
self.status['cost_time'] = time.time() - tic
# accumulate metric to log out
for metric in self._metrics:
metric.accumulate()
metric.log()
self._compose_callback.on_epoch_end(self.status)
# reset metric states since metrics may be evaluated multiple times
self._reset_metrics()
def evaluate(self):
# get distributed model
if self.cfg.get('fleet', False):
self.model = fleet.distributed_model(self.model)
self.optimizer = fleet.distributed_optimizer(self.optimizer)
elif self._nranks > 1:
find_unused_parameters = self.cfg[
'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
self.model = paddle.DataParallel(
self.model, find_unused_parameters=find_unused_parameters)
with paddle.no_grad():
self._eval_with_loader(self.loader)
def _eval_with_loader_slice(self,
loader,
slice_size=[640, 640],
overlap_ratio=[0.25, 0.25],
combine_method='nms',
match_threshold=0.6,
match_metric='iou'):
sample_num = 0
tic = time.time()
self._compose_callback.on_epoch_begin(self.status)
self.status['mode'] = 'eval'
self.model.eval()
if self.cfg.get('print_flops', False):
flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
self.dataset, self.cfg.worker_num, self._eval_batch_sampler)
self._flops(flops_loader)
merged_bboxs = []
for step_id, data in enumerate(loader):
self.status['step_id'] = step_id
self._compose_callback.on_step_begin(self.status)
# forward
if self.use_amp:
with paddle.amp.auto_cast(
enable=self.cfg.use_gpu or self.cfg.use_npu or
self.cfg.use_mlu,
custom_white_list=self.custom_white_list,
custom_black_list=self.custom_black_list,
level=self.amp_level):
outs = self.model(data)
else:
outs = self.model(data)
shift_amount = data['st_pix']
outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount
outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount
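# assumed bbox row layout: [class_id, score, x1, y1, x2, y2]; columns 2:6
# are the coordinates shifted from patch space back to full-image space by
# the patch offset st_pix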
merged_bboxs.append(outs['bbox'])
if data['is_last'] > 0:
# merge matching predictions
merged_results = {'bbox': []}
if combine_method == 'nms':
final_boxes = multiclass_nms(
np.concatenate(merged_bboxs), self.cfg.num_classes,
match_threshold, match_metric)
merged_results['bbox'] = np.concatenate(final_boxes)
elif combine_method == 'concat':
merged_results['bbox'] = np.concatenate(merged_bboxs)
else:
raise ValueError(
"Only 'nms' and 'concat' are supported for fusing detection results."
)
merged_results['im_id'] = np.array([[0]])
merged_results['bbox_num'] = np.array(
[len(merged_results['bbox'])])
merged_bboxs = []
data['im_id'] = data['ori_im_id']
# update metrics
for metric in self._metrics:
metric.update(data, merged_results)
# multi-scale inputs: all inputs have same im_id
if isinstance(data, typing.Sequence):
sample_num += data[0]['im_id'].numpy().shape[0]
else:
sample_num += data['im_id'].numpy().shape[0]
self._compose_callback.on_step_end(self.status)
self.status['sample_num'] = sample_num
self.status['cost_time'] = time.time() - tic
# accumulate metric to log out
for metric in self._metrics:
metric.accumulate()
metric.log()
self._compose_callback.on_epoch_end(self.status)
# reset metric states since metrics may be evaluated multiple times
self._reset_metrics()
def evaluate_slice(self,
slice_size=[640, 640],
overlap_ratio=[0.25, 0.25],
combine_method='nms',
match_threshold=0.6,
match_metric='iou'):
with paddle.no_grad():
self._eval_with_loader_slice(self.loader, slice_size, overlap_ratio,
combine_method, match_threshold,
match_metric)
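# Sliced-evaluation sketch (patch-based inference on large images; weights
# path assumed):
#   trainer = Trainer(cfg, mode='eval')
#   trainer.load_weights(cfg.weights)
#   trainer.evaluate_slice(slice_size=[640, 640], overlap_ratio=[0.25, 0.25])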
def slice_predict(self,
images,
slice_size=[640, 640],
overlap_ratio=[0.25, 0.25],
combine_method='nms',
match_threshold=0.6,
match_metric='iou',
draw_threshold=0.5,
output_dir='output',
save_results=False,
visualize=True):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
self.dataset.set_slice_images(images, slice_size, overlap_ratio)
loader = create('TestReader')(self.dataset, 0)
imid2path = self.dataset.get_imid2path()
def setup_metrics_for_loader():
# mem
metrics = copy.deepcopy(self._metrics)
mode = self.mode
save_prediction_only = self.cfg[
'save_prediction_only'] if 'save_prediction_only' in self.cfg else None
output_eval = self.cfg[
'output_eval'] if 'output_eval' in self.cfg else None
# modify
self.mode = '_test'
self.cfg['save_prediction_only'] = True
self.cfg['output_eval'] = output_dir
self.cfg['imid2path'] = imid2path
self._init_metrics()
# restore
self.mode = mode
self.cfg.pop('save_prediction_only')
if save_prediction_only is not None:
self.cfg['save_prediction_only'] = save_prediction_only
self.cfg.pop('output_eval')
if output_eval is not None:
self.cfg['output_eval'] = output_eval
self.cfg.pop('imid2path')
_metrics = copy.deepcopy(self._metrics)
self._metrics = metrics
return _metrics
if save_results:
metrics = setup_metrics_for_loader()
else:
metrics = []
anno_file = self.dataset.get_anno()
clsid2catid, catid2name = get_categories(
self.cfg.metric, anno_file=anno_file)
# Run Infer
self.status['mode'] = 'test'
self.model.eval()
if self.cfg.get('print_flops', False):
flops_loader = create('TestReader')(self.dataset, 0)
self._flops(flops_loader)
results = [] # all images
merged_bboxs = [] # single image
for step_id, data in enumerate(tqdm(loader)):
self.status['step_id'] = step_id
# forward
outs = self.model(data)
outs['bbox'] = outs['bbox'].numpy() # only in test mode
shift_amount = data['st_pix']
outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount.numpy()
outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount.numpy()
merged_bboxs.append(outs['bbox'])
if data['is_last'] > 0:
# merge matching predictions
merged_results = {'bbox': []}
if combine_method == 'nms':
final_boxes = multiclass_nms(
np.concatenate(merged_bboxs), self.cfg.num_classes,
match_threshold, match_metric)
merged_results['bbox'] = np.concatenate(final_boxes)
elif combine_method == 'concat':
merged_results['bbox'] = np.concatenate(merged_bboxs)
else:
raise ValueError(
"Only 'nms' and 'concat' are supported for fusing detection results."
)
merged_results['im_id'] = np.array([[0]])
merged_results['bbox_num'] = np.array(
[len(merged_results['bbox'])])
merged_bboxs = []
data['im_id'] = data['ori_im_id']
for _m in metrics:
_m.update(data, merged_results)
for key in ['im_shape', 'scale_factor', 'im_id']:
if isinstance(data, typing.Sequence):
merged_results[key] = data[0][key]
else:
merged_results[key] = data[key]
for key, value in merged_results.items():
if hasattr(value, 'numpy'):
merged_results[key] = value.numpy()
results.append(merged_results)
for _m in metrics:
_m.accumulate()
_m.reset()
if visualize:
for outs in results:
batch_res = get_infer_results(outs, clsid2catid)
bbox_num = outs['bbox_num']
start = 0
for i, im_id in enumerate(outs['im_id']):
image_path = imid2path[int(im_id)]
image = Image.open(image_path).convert('RGB')
image = ImageOps.exif_transpose(image)
self.status['original_image'] = np.array(image.copy())
end = start + bbox_num[i]
bbox_res = batch_res['bbox'][start:end] \
if 'bbox' in batch_res else None
mask_res = batch_res['mask'][start:end] \
if 'mask' in batch_res else None
segm_res = batch_res['segm'][start:end] \
if 'segm' in batch_res else None
keypoint_res = batch_res['keypoint'][start:end] \
if 'keypoint' in batch_res else None
pose3d_res = batch_res['pose3d'][start:end] \
if 'pose3d' in batch_res else None
image = visualize_results(
image, bbox_res, mask_res, segm_res, keypoint_res,
pose3d_res, int(im_id), catid2name, draw_threshold)
self.status['result_image'] = np.array(image.copy())
if self._compose_callback:
self._compose_callback.on_step_end(self.status)
# save image with detection
save_name = self._get_save_image_name(output_dir,
image_path)
logger.info("Detection bbox results save in {}".format(
save_name))
image.save(save_name, quality=95)
start = end
def predict(self,
images,
draw_threshold=0.5,
output_dir='output',
save_results=False,
visualize=True,
save_threshold=0,
do_eval=False):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
if do_eval:
save_threshold = 0.0
self.dataset.set_images(images, do_eval=do_eval)
loader = create('TestReader')(self.dataset, 0)
imid2path = self.dataset.get_imid2path()
def setup_metrics_for_loader():
# mem
metrics = copy.deepcopy(self._metrics)
mode = self.mode
save_prediction_only = self.cfg[
'save_prediction_only'] if 'save_prediction_only' in self.cfg else None
output_eval = self.cfg[
'output_eval'] if 'output_eval' in self.cfg else None
# modify
self.mode = '_test'
self.cfg['save_prediction_only'] = True
self.cfg['output_eval'] = output_dir
self.cfg['imid2path'] = imid2path
self.cfg['save_threshold'] = save_threshold
self._init_metrics()
# restore
self.mode = mode
self.cfg.pop('save_prediction_only')
if save_prediction_only is not None:
self.cfg['save_prediction_only'] = save_prediction_only
self.cfg.pop('output_eval')
if output_eval is not None:
self.cfg['output_eval'] = output_eval
self.cfg.pop('imid2path')
_metrics = copy.deepcopy(self._metrics)
self._metrics = metrics
return _metrics
if save_results:
metrics = setup_metrics_for_loader()
else:
metrics = []
anno_file = self.dataset.get_anno()
clsid2catid, catid2name = get_categories(
self.cfg.metric, anno_file=anno_file)
# Run Infer
self.status['mode'] = 'test'
self.model.eval()
if self.cfg.get('print_flops', False):
flops_loader = create('TestReader')(self.dataset, 0)
self._flops(flops_loader)
results = []
for step_id, data in enumerate(tqdm(loader)):
self.status['step_id'] = step_id
# forward
if hasattr(self.model, 'modelTeacher'):
outs = self.model.modelTeacher(data)
else:
outs = self.model(data)
for _m in metrics:
_m.update(data, outs)
for key in ['im_shape', 'scale_factor', 'im_id']:
if isinstance(data, typing.Sequence):
outs[key] = data[0][key]
else:
outs[key] = data[key]
for key, value in outs.items():
if hasattr(value, 'numpy'):
outs[key] = value.numpy()
results.append(outs)
# sniper
if type(self.dataset) == SniperCOCODataSet:
results = self.dataset.anno_cropper.aggregate_chips_detections(
results)
for _m in metrics:
_m.accumulate()
_m.reset()
if visualize:
for outs in results:
batch_res = get_infer_results(outs, clsid2catid)
bbox_num = outs['bbox_num']
start = 0
for i, im_id in enumerate(outs['im_id']):
image_path = imid2path[int(im_id)]
image = Image.open(image_path).convert('RGB')
image = ImageOps.exif_transpose(image)
self.status['original_image'] = np.array(image.copy())
end = start + bbox_num[i]
bbox_res = batch_res['bbox'][start:end] \
if 'bbox' in batch_res else None
mask_res = batch_res['mask'][start:end] \
if 'mask' in batch_res else None
segm_res = batch_res['segm'][start:end] \
if 'segm' in batch_res else None
keypoint_res = batch_res['keypoint'][start:end] \
if 'keypoint' in batch_res else None
pose3d_res = batch_res['pose3d'][start:end] \
if 'pose3d' in batch_res else None
image = visualize_results(
image, bbox_res, mask_res, segm_res, keypoint_res,
pose3d_res, int(im_id), catid2name, draw_threshold)
self.status['result_image'] = np.array(image.copy())
if self._compose_callback:
self._compose_callback.on_step_end(self.status)
# save image with detection
save_name = self._get_save_image_name(output_dir,
image_path)
logger.info("Detection bbox results save in {}".format(
save_name))
image.save(save_name, quality=95)
start = end
return results
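# Inference sketch (the image path is illustrative):
#   trainer = Trainer(cfg, mode='test')
#   trainer.load_weights(cfg.weights)
#   results = trainer.predict(['demo/test.jpg'], draw_threshold=0.5)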
def _get_save_image_name(self, output_dir, image_path):
"""
Get save image name from source image path.
"""
image_name = os.path.split(image_path)[-1]
name, ext = os.path.splitext(image_name)
return os.path.join(output_dir, "{}".format(name)) + ext
def _get_infer_cfg_and_input_spec(self,
save_dir,
prune_input=True,
kl_quant=False,
yaml_name=None,
model=None):
if yaml_name is None:
yaml_name = 'infer_cfg.yml'
if model is None:
model = self.model
image_shape = None
im_shape = [None, 2]
scale_factor = [None, 2]
if self.cfg.architecture in MOT_ARCH:
test_reader_name = 'TestMOTReader'
else:
test_reader_name = 'TestReader'
if 'inputs_def' in self.cfg[test_reader_name]:
inputs_def = self.cfg[test_reader_name]['inputs_def']
image_shape = inputs_def.get('image_shape', None)
# set image_shape=[None, 3, -1, -1] as default
if image_shape is None:
image_shape = [None, 3, -1, -1]
if len(image_shape) == 3:
image_shape = [None] + image_shape
else:
im_shape = [image_shape[0], 2]
scale_factor = [image_shape[0], 2]
if hasattr(model, 'deploy'):
model.deploy = True
if 'slim' not in self.cfg:
for layer in model.sublayers():
if hasattr(layer, 'convert_to_deploy'):
layer.convert_to_deploy()
if hasattr(self.cfg, 'export') and 'fuse_conv_bn' in self.cfg[
'export'] and self.cfg['export']['fuse_conv_bn']:
model = fuse_conv_bn(model)
export_post_process = self.cfg['export'].get(
'post_process', False) if hasattr(self.cfg, 'export') else True
export_nms = self.cfg['export'].get('nms', False) if hasattr(
self.cfg, 'export') else True
export_benchmark = self.cfg['export'].get(
'benchmark', False) if hasattr(self.cfg, 'export') else False
if hasattr(model, 'fuse_norm'):
model.fuse_norm = self.cfg['TestReader'].get('fuse_normalize',
False)
if hasattr(model, 'export_post_process'):
model.export_post_process = export_post_process if not export_benchmark else False
if hasattr(model, 'export_nms'):
model.export_nms = export_nms if not export_benchmark else False
if export_post_process and not export_benchmark:
image_shape = [None] + image_shape[1:]
# Save infer cfg
_dump_infer_config(self.cfg,
os.path.join(save_dir, yaml_name), image_shape,
model)
input_spec = [{
"image": InputSpec(
shape=image_shape, name='image'),
"im_shape": InputSpec(
shape=im_shape, name='im_shape'),
"scale_factor": InputSpec(
shape=scale_factor, name='scale_factor')
}]
if self.cfg.architecture == 'DeepSORT':
input_spec[0].update({
"crops": InputSpec(
shape=[None, 3, 192, 64], name='crops')
})
if self.cfg.architecture == 'CLRNet':
input_spec[0].update({
"full_img_path": str,
"img_name": str,
})
if prune_input:
static_model = paddle.jit.to_static(
model, input_spec=input_spec, full_graph=True)
# NOTE: dy2st does not prune the program, but jit.save prunes the program's
# input spec; prune the input spec here and save with the pruned spec
pruned_input_spec = _prune_input_spec(
input_spec, static_model.forward.main_program,
static_model.forward.outputs)
else:
static_model = None
pruned_input_spec = input_spec
# TODO: hard-coded; remove once pruning input_spec is supported.
if self.cfg.architecture == 'PicoDet' and not export_post_process:
pruned_input_spec = [{
"image": InputSpec(
shape=image_shape, name='image')
}]
if kl_quant:
if self.cfg.architecture == 'PicoDet' or 'ppyoloe' in self.cfg.weights:
pruned_input_spec = [{
"image": InputSpec(
shape=image_shape, name='image'),
"scale_factor": InputSpec(
shape=scale_factor, name='scale_factor')
}]
elif 'tinypose' in self.cfg.weights:
pruned_input_spec = [{
"image": InputSpec(
shape=image_shape, name='image')
}]
return static_model, pruned_input_spec
def export(self, output_dir='output_inference', for_fd=False):
if hasattr(self.model, 'aux_neck'):
self.model.__delattr__('aux_neck')
if hasattr(self.model, 'aux_head'):
self.model.__delattr__('aux_head')
self.model.eval()
model = copy.deepcopy(self.model)
model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0]
if for_fd:
save_dir = output_dir
save_name = 'inference'
yaml_name = 'inference.yml'
else:
save_dir = os.path.join(output_dir, model_name)
save_name = 'model'
yaml_name = None
if not os.path.exists(save_dir):
os.makedirs(save_dir)
static_model, pruned_input_spec = self._get_infer_cfg_and_input_spec(
save_dir, yaml_name=yaml_name, model=model)
# dy2st and save model
if 'slim' not in self.cfg or 'QAT' not in self.cfg['slim_type']:
paddle.jit.save(
static_model,
os.path.join(save_dir, save_name),
input_spec=pruned_input_spec)
else:
self.cfg.slim.save_quantized_model(
self.model,
os.path.join(save_dir, save_name),
input_spec=pruned_input_spec)
logger.info("Export model and saved in {}".format(save_dir))
def post_quant(self, output_dir='output_inference'):
model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0]
save_dir = os.path.join(output_dir, model_name)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
for idx, data in enumerate(self.loader):
self.model(data)
if idx == int(self.cfg.get('quant_batch_num', 10)):
break
# TODO: support prune input_spec
kl_quant = hasattr(self.cfg.slim, 'ptq')
_, pruned_input_spec = self._get_infer_cfg_and_input_spec(
save_dir, prune_input=False, kl_quant=kl_quant)
self.cfg.slim.save_quantized_model(
self.model,
os.path.join(save_dir, 'model'),
input_spec=pruned_input_spec)
logger.info("Export Post-Quant model and saved in {}".format(save_dir))
def _flops(self, loader):
if hasattr(self.model, 'aux_neck'):
self.model.__delattr__('aux_neck')
if hasattr(self.model, 'aux_head'):
self.model.__delattr__('aux_head')
self.model.eval()
try:
import paddleslim
except Exception as e:
logger.warning(
'Unable to calculate FLOPs; please install paddleslim, e.g. `pip install paddleslim`'
)
return
from paddleslim.analysis import dygraph_flops as flops
input_data = None
for data in loader:
input_data = data
break
input_spec = [{
"image": input_data['image'][0].unsqueeze(0),
"im_shape": input_data['im_shape'][0].unsqueeze(0),
"scale_factor": input_data['scale_factor'][0].unsqueeze(0)
}]
total_flops = flops(self.model, input_spec) / (1000**3)
logger.info("Model FLOPs : {:.6f}G. (image shape is {})".format(
total_flops, input_data['image'][0].unsqueeze(0).shape))
def parse_mot_images(self, cfg):
import glob
# for quant
dataset_dir = cfg['EvalMOTDataset'].dataset_dir
data_root = cfg['EvalMOTDataset'].data_root
data_root = '{}/{}'.format(dataset_dir, data_root)
seqs = os.listdir(data_root)
seqs.sort()
all_images = []
for seq in seqs:
infer_dir = os.path.join(data_root, seq)
assert infer_dir is None or os.path.isdir(infer_dir), \
"{} is not a directory".format(infer_dir)
images = set()
exts = ['jpg', 'jpeg', 'png', 'bmp']
exts += [ext.upper() for ext in exts]
for ext in exts:
images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))
images = list(images)
images.sort()
assert len(images) > 0, "no image found in {}".format(infer_dir)
all_images.extend(images)
logger.info("Found {} inference images in total.".format(
len(images)))
return all_images
def predict_culane(self,
images,
output_dir='output',
save_results=False,
visualize=True):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
self.dataset.set_images(images)
loader = create('TestReader')(self.dataset, 0)
imid2path = self.dataset.get_imid2path()
def setup_metrics_for_loader():
# mem
metrics = copy.deepcopy(self._metrics)
mode = self.mode
save_prediction_only = self.cfg[
'save_prediction_only'] if 'save_prediction_only' in self.cfg else None
output_eval = self.cfg[
'output_eval'] if 'output_eval' in self.cfg else None
# modify
self.mode = '_test'
self.cfg['save_prediction_only'] = True
self.cfg['output_eval'] = output_dir
self.cfg['imid2path'] = imid2path
self._init_metrics()
# restore
self.mode = mode
self.cfg.pop('save_prediction_only')
if save_prediction_only is not None:
self.cfg['save_prediction_only'] = save_prediction_only
self.cfg.pop('output_eval')
if output_eval is not None:
self.cfg['output_eval'] = output_eval
self.cfg.pop('imid2path')
_metrics = copy.deepcopy(self._metrics)
self._metrics = metrics
return _metrics
if save_results:
metrics = setup_metrics_for_loader()
else:
metrics = []
# Run Infer
self.status['mode'] = 'test'
self.model.eval()
if self.cfg.get('print_flops', False):
flops_loader = create('TestReader')(self.dataset, 0)
self._flops(flops_loader)
results = []
for step_id, data in enumerate(tqdm(loader)):
self.status['step_id'] = step_id
# forward
outs = self.model(data)
for _m in metrics:
_m.update(data, outs)
for key in ['im_shape', 'scale_factor', 'im_id']:
if isinstance(data, typing.Sequence):
outs[key] = data[0][key]
else:
outs[key] = data[key]
for key, value in outs.items():
if hasattr(value, 'numpy'):
outs[key] = value.numpy()
results.append(outs)
for _m in metrics:
_m.accumulate()
_m.reset()
if visualize:
import cv2
for outs in results:
for i in range(len(outs['img_path'])):
lanes = outs['lanes'][i]
img_path = outs['img_path'][i]
img = cv2.imread(img_path)
out_file = os.path.join(output_dir,
os.path.basename(img_path))
lanes = [
lane.to_array(
sample_y_range=[
self.cfg['sample_y']['start'],
self.cfg['sample_y']['end'],
self.cfg['sample_y']['step']
],
img_w=self.cfg.ori_img_w,
img_h=self.cfg.ori_img_h) for lane in lanes
]
imshow_lanes(img, lanes, out_file=out_file)
return results
def reset_norm_param_attr(self, layer, **kwargs):
if isinstance(layer, (nn.BatchNorm2D, nn.LayerNorm, nn.GroupNorm)):
src_state_dict = layer.state_dict()
if isinstance(layer, nn.BatchNorm2D):
layer = nn.BatchNorm2D(
num_features=layer._num_features,
momentum=layer._momentum,
epsilon=layer._epsilon,
**kwargs)
elif isinstance(layer, nn.LayerNorm):
layer = nn.LayerNorm(
normalized_shape=layer._normalized_shape,
epsilon=layer._epsilon,
**kwargs)
else:
layer = nn.GroupNorm(
num_groups=layer._num_groups,
num_channels=layer._num_channels,
epsilon=layer._epsilon,
**kwargs)
layer.set_state_dict(src_state_dict)
else:
for name, sublayer in layer.named_children():
new_sublayer = self.reset_norm_param_attr(sublayer, **kwargs)
if new_sublayer is not sublayer:
setattr(layer, name, new_sublayer)
return layer
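# reset_norm_param_attr rebuilds each norm layer (BatchNorm2D / LayerNorm /
# GroupNorm) with the supplied weight_attr/bias_attr kwargs while restoring
# the original state dict, so norm parameter attributes can be reset before
# the optimizer consumes them.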
================================================
FILE: ppdet/engine/trainer_cot.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ppdet.core.workspace import create
from ppdet.utils.logger import setup_logger
logger = setup_logger('ppdet.engine')
from . import Trainer
__all__ = ['TrainerCot']
class TrainerCot(Trainer):
"""
Trainer for label co-tuning.
Calculates the relationship between base_classes and novel_classes.
"""
def __init__(self, cfg, mode='train'):
super(TrainerCot, self).__init__(cfg, mode)
self.cotuning_init()
def cotuning_init(self):
num_classes_novel = self.cfg['num_classes']
self.load_weights(self.cfg.pretrain_weights)
self.model.eval()
relationship = self.model.relationship_learning(self.loader, num_classes_novel)
self.model.init_cot_head(relationship)
self.optimizer = create('OptimizerBuilder')(self.lr, self.model)
================================================
FILE: ppdet/engine/trainer_ssod.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import copy
import time
import typing
import numpy as np
import paddle
import paddle.nn as nn
import paddle.distributed as dist
from paddle.distributed import fleet
from ppdet.optimizer import ModelEMA, SimpleModelEMA
from ppdet.core.workspace import create
from ppdet.utils.checkpoint import load_weight, load_pretrain_weight, save_model
import ppdet.utils.stats as stats
from ppdet.utils import profiler
from ppdet.modeling.ssod.utils import align_weak_strong_shape
from .trainer import Trainer
from ppdet.utils.logger import setup_logger
from paddle.static import InputSpec
from ppdet.engine.export_utils import _dump_infer_config, _prune_input_spec
MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']
logger = setup_logger('ppdet.engine')
__all__ = ['Trainer_DenseTeacher', 'Trainer_ARSL', 'Trainer_Semi_RTDETR']
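# Trainer_DenseTeacher trains a student against an EMA teacher (teacher-student
# SSOD); Trainer_ARSL runs a burn-in phase before semi-supervised training with
# an explicit teacher/student ensemble; Trainer_Semi_RTDETR (defined later in
# this file) applies the semi-supervised recipe to RT-DETR.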
class Trainer_DenseTeacher(Trainer):
def __init__(self, cfg, mode='train'):
self.cfg = cfg
assert mode.lower() in ['train', 'eval', 'test'], \
"mode should be 'train', 'eval' or 'test'"
self.mode = mode.lower()
self.optimizer = None
self.is_loaded_weights = False
self.use_amp = self.cfg.get('amp', False)
self.amp_level = self.cfg.get('amp_level', 'O1')
self.custom_white_list = self.cfg.get('custom_white_list', None)
self.custom_black_list = self.cfg.get('custom_black_list', None)
# build data loader
capital_mode = self.mode.capitalize()
self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create(
'{}Dataset'.format(capital_mode))()
if self.mode == 'train':
self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create(
'UnsupTrainDataset')
self.loader = create('SemiTrainReader')(
self.dataset, self.dataset_unlabel, cfg.worker_num)
# build model
if 'model' not in self.cfg:
self.model = create(cfg.architecture)
else:
self.model = self.cfg.model
self.is_loaded_weights = True
# EvalDataset is built with a BatchSampler to evaluate on a single device
# TODO: multi-device evaluate
if self.mode == 'eval':
self._eval_batch_sampler = paddle.io.BatchSampler(
self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
# If metric is VOC, collate_batch needs to be set to False.
if cfg.metric == 'VOC':
cfg['EvalReader']['collate_batch'] = False
self.loader = create('EvalReader')(self.dataset, cfg.worker_num,
self._eval_batch_sampler)
# TestDataset is built after the user sets images, so skip loader creation here
# build optimizer in train mode
if self.mode == 'train':
steps_per_epoch = len(self.loader)
if steps_per_epoch < 1:
logger.warning(
"Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader."
)
self.lr = create('LearningRate')(steps_per_epoch)
self.optimizer = create('OptimizerBuilder')(self.lr, self.model)
# Unstructured pruner is only enabled in the train mode.
if self.cfg.get('unstructured_prune'):
self.pruner = create('UnstructuredPruner')(self.model,
steps_per_epoch)
if self.use_amp and self.amp_level == 'O2':
self.model, self.optimizer = paddle.amp.decorate(
models=self.model,
optimizers=self.optimizer,
level=self.amp_level)
self.use_ema = ('use_ema' in cfg and cfg['use_ema'])
if self.use_ema:
ema_decay = self.cfg.get('ema_decay', 0.9998)
ema_decay_type = self.cfg.get('ema_decay_type', 'threshold')
cycle_epoch = self.cfg.get('cycle_epoch', -1)
ema_black_list = self.cfg.get('ema_black_list', None)
self.ema = ModelEMA(
self.model,
decay=ema_decay,
ema_decay_type=ema_decay_type,
cycle_epoch=cycle_epoch,
ema_black_list=ema_black_list)
self.ema_start_iters = self.cfg.get('ema_start_iters', 0)
# simple_ema for SSOD
self.use_simple_ema = ('use_simple_ema' in cfg and
cfg['use_simple_ema'])
if self.use_simple_ema:
self.use_ema = True
ema_decay = self.cfg.get('ema_decay', 0.9996)
self.ema = SimpleModelEMA(self.model, decay=ema_decay)
self.ema_start_iters = self.cfg.get('ema_start_iters', 0)
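# SimpleModelEMA keeps a plain exponential moving average of the student
# weights; in SSOD this EMA copy serves as the teacher (see self.ema.model
# in train()).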
self._nranks = dist.get_world_size()
self._local_rank = dist.get_rank()
self.status = {}
self.start_epoch = 0
self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch
# initial default callbacks
self._init_callbacks()
# initial default metrics
self._init_metrics()
self._reset_metrics()
def load_weights(self, weights):
if self.is_loaded_weights:
return
self.start_epoch = 0
load_pretrain_weight(self.model, weights)
load_pretrain_weight(self.ema.model, weights)
logger.info("Load weights {} to start training for teacher and student".
format(weights))
def resume_weights(self, weights, exchange=True):
# support Distill resume weights
if hasattr(self.model, 'student_model'):
self.start_epoch = load_weight(self.model.student_model, weights,
self.optimizer, exchange)
else:
self.start_epoch = load_weight(self.model, weights, self.optimizer,
self.ema
if self.use_ema else None, exchange)
logger.debug("Resume weights of epoch {}".format(self.start_epoch))
def train(self, validate=False):
self.semi_start_iters = self.cfg.get('semi_start_iters', 5000)
Init_mark = False
if validate:
self.cfg['EvalDataset'] = self.cfg.EvalDataset = create(
"EvalDataset")()
sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and
self.cfg.use_gpu and self._nranks > 1)
if sync_bn:
self.model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(
self.model)
if self.cfg.get('fleet', False):
self.model = fleet.distributed_model(self.model)
self.optimizer = fleet.distributed_optimizer(self.optimizer)
elif self._nranks > 1:
find_unused_parameters = self.cfg[
'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
self.model = paddle.DataParallel(
self.model, find_unused_parameters=find_unused_parameters)
self.ema.model = paddle.DataParallel(
self.ema.model, find_unused_parameters=find_unused_parameters)
self.status.update({
'epoch_id': self.start_epoch,
'step_id': 0,
'steps_per_epoch': len(self.loader),
'exchange_save_model': True,
})
# Note: exchange_save_model
# in DenseTeacher SSOD the teacher model usually scores higher, so teacher and student weights are exchanged when saving pdparams
self.status['batch_time'] = stats.SmoothedValue(
self.cfg.log_iter, fmt='{avg:.4f}')
self.status['data_time'] = stats.SmoothedValue(
self.cfg.log_iter, fmt='{avg:.4f}')
self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)
profiler_options = self.cfg.get('profiler_options', None)
self._compose_callback.on_train_begin(self.status)
train_cfg = self.cfg.DenseTeacher['train_cfg']
concat_sup_data = train_cfg.get('concat_sup_data', True)
for param in self.ema.model.parameters():
param.stop_gradient = True
for epoch_id in range(self.start_epoch, self.cfg.epoch):
self.status['mode'] = 'train'
self.status['epoch_id'] = epoch_id
self._compose_callback.on_epoch_begin(self.status)
self.loader.dataset_label.set_epoch(epoch_id)
self.loader.dataset_unlabel.set_epoch(epoch_id)
iter_tic = time.time()
loss_dict = {
'loss': paddle.to_tensor([0]),
'loss_sup_sum': paddle.to_tensor([0]),
'loss_unsup_sum': paddle.to_tensor([0]),
'fg_sum': paddle.to_tensor([0]),
}
if self._nranks > 1:
for k in self.model._layers.get_loss_keys():
loss_dict.update({k: paddle.to_tensor([0.])})
for k in self.model._layers.get_loss_keys():
loss_dict.update({'distill_' + k: paddle.to_tensor([0.])})
else:
for k in self.model.get_loss_keys():
loss_dict.update({k: paddle.to_tensor([0.])})
for k in self.model.get_loss_keys():
loss_dict.update({'distill_' + k: paddle.to_tensor([0.])})
# Note: `for step_id, data in enumerate(self.loader)` is avoided here due to an enumerate bug; iterate by index and call next() instead
for step_id in range(len(self.loader)):
data = next(self.loader)
self.model.train()
self.ema.model.eval()
data_sup_w, data_sup_s, data_unsup_w, data_unsup_s = data
self.status['data_time'].update(time.time() - iter_tic)
self.status['step_id'] = step_id
profiler.add_profiler_step(profiler_options)
self._compose_callback.on_step_begin(self.status)
if data_sup_w['image'].shape != data_sup_s['image'].shape:
data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w,
data_sup_s)
data_sup_w['epoch_id'] = epoch_id
data_sup_s['epoch_id'] = epoch_id
if concat_sup_data:
for k, v in data_sup_s.items():
if k in ['epoch_id']:
continue
data_sup_s[k] = paddle.concat([v, data_sup_w[k]])
loss_dict_sup = self.model(data_sup_s)
else:
loss_dict_sup_w = self.model(data_sup_w)
loss_dict_sup = self.model(data_sup_s)
for k, v in loss_dict_sup_w.items():
loss_dict_sup[k] = (loss_dict_sup[k] + v) * 0.5
losses_sup = loss_dict_sup['loss'] * train_cfg['sup_weight']
losses_sup.backward()
losses = losses_sup.detach()
loss_dict.update(loss_dict_sup)
loss_dict.update({'loss_sup_sum': loss_dict['loss']})
curr_iter = len(self.loader) * epoch_id + step_id
st_iter = self.semi_start_iters
if curr_iter == st_iter:
logger.info("***" * 30)
logger.info('Semi starting ...')
logger.info("***" * 30)
if curr_iter > st_iter:
unsup_weight = train_cfg['unsup_weight']
if train_cfg['suppress'] == 'linear':
tar_iter = st_iter * 2
if curr_iter <= tar_iter:
unsup_weight *= (curr_iter - st_iter) / st_iter
elif train_cfg['suppress'] == 'exp':
tar_iter = st_iter + 2000
if curr_iter <= tar_iter:
scale = np.exp((curr_iter - tar_iter) / 1000)
unsup_weight *= scale
elif train_cfg['suppress'] == 'step':
tar_iter = st_iter * 2
if curr_iter <= tar_iter:
unsup_weight *= 0.25
else:
raise ValueError("Unsupported suppress type: {}".format(train_cfg['suppress']))
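# Ramp-up sketch: with semi_start_iters=5000 and suppress='linear', the
# effective unsupervised weight is unsup_weight * (curr_iter - 5000) / 5000
# until iter 10000, after which the full unsup_weight applies.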
if data_unsup_w['image'].shape != data_unsup_s[
'image'].shape:
data_unsup_w, data_unsup_s = align_weak_strong_shape(
data_unsup_w, data_unsup_s)
data_unsup_w['epoch_id'] = epoch_id
data_unsup_s['epoch_id'] = epoch_id
data_unsup_s['get_data'] = True
student_preds = self.model(data_unsup_s)
with paddle.no_grad():
data_unsup_w['is_teacher'] = True
teacher_preds = self.ema.model(data_unsup_w)
train_cfg['curr_iter'] = curr_iter
train_cfg['st_iter'] = st_iter
if self._nranks > 1:
loss_dict_unsup = self.model._layers.get_ssod_loss(
student_preds, teacher_preds, train_cfg)
else:
loss_dict_unsup = self.model.get_ssod_loss(
student_preds, teacher_preds, train_cfg)
fg_num = loss_dict_unsup["fg_sum"]
del loss_dict_unsup["fg_sum"]
distill_weights = train_cfg['loss_weight']
loss_dict_unsup = {
k: v * distill_weights[k]
for k, v in loss_dict_unsup.items()
}
losses_unsup = sum([
metrics_value
for metrics_value in loss_dict_unsup.values()
]) * unsup_weight
losses_unsup.backward()
loss_dict.update(loss_dict_unsup)
loss_dict.update({'loss_unsup_sum': losses_unsup})
losses += losses_unsup.detach()
loss_dict.update({"fg_sum": fg_num})
loss_dict['loss'] = losses
self.optimizer.step()
curr_lr = self.optimizer.get_lr()
self.lr.step()
self.optimizer.clear_grad()
self.status['learning_rate'] = curr_lr
if self._nranks < 2 or self._local_rank == 0:
self.status['training_staus'].update(loss_dict)
self.status['batch_time'].update(time.time() - iter_tic)
self._compose_callback.on_step_end(self.status)
# Note: ema_start_iters
if self.use_ema and curr_iter == self.ema_start_iters:
logger.info("***" * 30)
logger.info('EMA starting ...')
logger.info("***" * 30)
self.ema.update(self.model, decay=0)
elif self.use_ema and curr_iter > self.ema_start_iters:
self.ema.update(self.model)
iter_tic = time.time()
is_snapshot = (self._nranks < 2 or self._local_rank == 0) \
and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1)
if is_snapshot and self.use_ema:
# apply ema weight on model
weight = copy.deepcopy(self.ema.model.state_dict())
for k, v in weight.items():
if paddle.is_floating_point(v):
weight[k].stop_gradient = True
self.status['weight'] = weight
self._compose_callback.on_epoch_end(self.status)
if validate and is_snapshot:
if not hasattr(self, '_eval_loader'):
# build evaluation dataset and loader
self._eval_dataset = self.cfg.EvalDataset
self._eval_batch_sampler = \
paddle.io.BatchSampler(
self._eval_dataset,
batch_size=self.cfg.EvalReader['batch_size'])
# If metric is VOC, collate_batch needs to be set to False.
if self.cfg.metric == 'VOC':
self.cfg['EvalReader']['collate_batch'] = False
self._eval_loader = create('EvalReader')(
self._eval_dataset,
self.cfg.worker_num,
batch_sampler=self._eval_batch_sampler)
# if validation during training is enabled, metrics should be re-initialized
# Init_mark ensures this block only executes once
if validate and not Init_mark:
Init_mark = True
self._init_metrics(validate=validate)
self._reset_metrics()
with paddle.no_grad():
self.status['save_best_model'] = True
self._eval_with_loader(self._eval_loader)
if is_snapshot and self.use_ema:
self.status.pop('weight')
self._compose_callback.on_train_end(self.status)
def evaluate(self):
# get distributed model
if self.cfg.get('fleet', False):
self.model = fleet.distributed_model(self.model)
self.optimizer = fleet.distributed_optimizer(self.optimizer)
elif self._nranks > 1:
find_unused_parameters = self.cfg[
'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
self.model = paddle.DataParallel(
self.model, find_unused_parameters=find_unused_parameters)
with paddle.no_grad():
self._eval_with_loader(self.loader)
def _eval_with_loader(self, loader):
sample_num = 0
tic = time.time()
self._compose_callback.on_epoch_begin(self.status)
self.status['mode'] = 'eval'
test_cfg = self.cfg.DenseTeacher['test_cfg']
if test_cfg['inference_on'] == 'teacher':
logger.info("***** teacher model evaluating *****")
eval_model = self.ema.model
else:
logger.info("***** student model evaluating *****")
eval_model = self.model
eval_model.eval()
if self.cfg.get('print_flops', False):
flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
self.dataset, self.cfg.worker_num, self._eval_batch_sampler)
self._flops(flops_loader)
for step_id, data in enumerate(loader):
self.status['step_id'] = step_id
self._compose_callback.on_step_begin(self.status)
# forward
if self.use_amp:
with paddle.amp.auto_cast(
enable=self.cfg.use_gpu or self.cfg.use_mlu,
custom_white_list=self.custom_white_list,
custom_black_list=self.custom_black_list,
level=self.amp_level):
outs = eval_model(data)
else:
outs = eval_model(data)
# update metrics
for metric in self._metrics:
metric.update(data, outs)
# multi-scale inputs: all inputs have same im_id
if isinstance(data, typing.Sequence):
sample_num += data[0]['im_id'].numpy().shape[0]
else:
sample_num += data['im_id'].numpy().shape[0]
self._compose_callback.on_step_end(self.status)
self.status['sample_num'] = sample_num
self.status['cost_time'] = time.time() - tic
# accumulate metric to log out
for metric in self._metrics:
metric.accumulate()
metric.log()
self._compose_callback.on_epoch_end(self.status)
self._reset_metrics()
class Trainer_ARSL(Trainer):
def __init__(self, cfg, mode='train'):
self.cfg = cfg
assert mode.lower() in ['train', 'eval', 'test'], \
"mode should be 'train', 'eval' or 'test'"
self.mode = mode.lower()
self.optimizer = None
self.is_loaded_weights = False
capital_mode = self.mode.capitalize()
self.use_ema = False
self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create(
'{}Dataset'.format(capital_mode))()
if self.mode == 'train':
self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create(
'UnsupTrainDataset')
self.loader = create('SemiTrainReader')(
self.dataset, self.dataset_unlabel, cfg.worker_num)
# build model
if 'model' not in self.cfg:
self.student_model = create(cfg.architecture)
self.teacher_model = create(cfg.architecture)
self.model = EnsembleTSModel(self.teacher_model, self.student_model)
else:
self.model = self.cfg.model
self.is_loaded_weights = True
# save path for burn-in model
self.base_path = cfg.get('weights')
self.base_path = os.path.dirname(self.base_path)
# EvalDataset is built with a BatchSampler to evaluate on a single device
# TODO: multi-device evaluate
if self.mode == 'eval':
self._eval_batch_sampler = paddle.io.BatchSampler(
self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
self.loader = create('{}Reader'.format(self.mode.capitalize()))(
self.dataset, cfg.worker_num, self._eval_batch_sampler)
# TestDataset is built after the user sets images, so skip loader creation here
self.start_epoch = 0
self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch
self.epoch_iter = self.cfg.epoch_iter # set fixed iter in each epoch to control checkpoint
# build optimizer in train mode
if self.mode == 'train':
steps_per_epoch = self.epoch_iter
self.lr = create('LearningRate')(steps_per_epoch)
self.optimizer = create('OptimizerBuilder')(self.lr,
self.model.modelStudent)
self._nranks = dist.get_world_size()
self._local_rank = dist.get_rank()
self.status = {}
# initial default callbacks
self._init_callbacks()
# initial default metrics
self._init_metrics()
self._reset_metrics()
self.iter = 0
def resume_weights(self, weights):
# support Distill resume weights
if hasattr(self.model, 'student_model'):
self.start_epoch = load_weight(self.model.student_model, weights,
self.optimizer)
else:
self.start_epoch = load_weight(self.model, weights, self.optimizer)
logger.debug("Resume weights of epoch {}".format(self.start_epoch))
def train(self, validate=False):
assert self.mode == 'train', "Model not in 'train' mode"
Init_mark = False
# if validation during training is enabled, metrics should be re-initialized
if validate:
self._init_metrics(validate=validate)
self._reset_metrics()
if self.cfg.get('fleet', False):
self.model.modelStudent = fleet.distributed_model(
self.model.modelStudent)
self.optimizer = fleet.distributed_optimizer(self.optimizer)
elif self._nranks > 1:
find_unused_parameters = self.cfg[
'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
self.model.modelStudent = paddle.DataParallel(
self.model.modelStudent,
find_unused_parameters=find_unused_parameters)
# set fixed iter in each epoch to control checkpoint
self.status.update({
'epoch_id': self.start_epoch,
'step_id': 0,
'steps_per_epoch': self.epoch_iter
})
        logger.info('Length of DataLoader: {}'.format(len(self.loader)))
self.status['batch_time'] = stats.SmoothedValue(
self.cfg.log_iter, fmt='{avg:.4f}')
self.status['data_time'] = stats.SmoothedValue(
self.cfg.log_iter, fmt='{avg:.4f}')
self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)
self._compose_callback.on_train_begin(self.status)
epoch_id = self.start_epoch
self.iter = self.start_epoch * self.epoch_iter
# use iter rather than epoch to control training schedule
while self.iter < self.cfg.max_iter:
# epoch loop
self.status['mode'] = 'train'
self.status['epoch_id'] = epoch_id
self._compose_callback.on_epoch_begin(self.status)
self.loader.dataset_label.set_epoch(epoch_id)
self.loader.dataset_unlabel.set_epoch(epoch_id)
paddle.device.cuda.empty_cache() # clear GPU memory
# set model status
self.model.modelStudent.train()
self.model.modelTeacher.eval()
iter_tic = time.time()
            # iter loop in each epoch
for step_id in range(self.epoch_iter):
data = next(self.loader)
self.status['data_time'].update(time.time() - iter_tic)
self.status['step_id'] = step_id
# profiler.add_profiler_step(profiler_options)
self._compose_callback.on_step_begin(self.status)
# model forward and calculate loss
loss_dict = self.run_step_full_semisup(data)
if (step_id + 1) % self.cfg.optimize_rate == 0:
self.optimizer.step()
self.optimizer.clear_grad()
curr_lr = self.optimizer.get_lr()
self.lr.step()
# update log status
self.status['learning_rate'] = curr_lr
if self._nranks < 2 or self._local_rank == 0:
self.status['training_staus'].update(loss_dict)
self.status['batch_time'].update(time.time() - iter_tic)
self._compose_callback.on_step_end(self.status)
self.iter += 1
iter_tic = time.time()
self._compose_callback.on_epoch_end(self.status)
if validate and (self._nranks < 2 or self._local_rank == 0) \
and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 \
or epoch_id == self.end_epoch - 1):
if not hasattr(self, '_eval_loader'):
# build evaluation dataset and loader
self._eval_dataset = self.cfg.EvalDataset
self._eval_batch_sampler = \
paddle.io.BatchSampler(
self._eval_dataset,
batch_size=self.cfg.EvalReader['batch_size'])
self._eval_loader = create('EvalReader')(
self._eval_dataset,
self.cfg.worker_num,
batch_sampler=self._eval_batch_sampler)
                if validate and not Init_mark:
Init_mark = True
self._init_metrics(validate=validate)
self._reset_metrics()
with paddle.no_grad():
self.status['save_best_model'] = True
                    # before the burn-in stage ends, eval the student; afterwards, eval the teacher
if self.iter <= self.cfg.SEMISUPNET['BURN_UP_STEP']:
print("start eval student model")
self._eval_with_loader(
self._eval_loader, mode="student")
else:
print("start eval teacher model")
self._eval_with_loader(
self._eval_loader, mode="teacher")
epoch_id += 1
self._compose_callback.on_train_end(self.status)
def merge_data(self, data1, data2):
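        # Merge two sample dicts into one batch: Tensor fields are
        # concatenated along the batch axis, list fields are extended.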
data = copy.deepcopy(data1)
for k, v in data1.items():
if type(v) is paddle.Tensor:
data[k] = paddle.concat(x=[data[k], data2[k]], axis=0)
elif type(v) is list:
data[k].extend(data2[k])
return data
def run_step_full_semisup(self, data):
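        # One training step: supervised loss on the merged labeled views;
        # after burn-in, a warmed-up pseudo-label loss computed from teacher
        # predictions is added. Losses are divided by optimize_rate because
        # gradients are accumulated and applied every optimize_rate iters.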
label_data_k, label_data_q, unlabel_data_k, unlabel_data_q = data
data_merge = self.merge_data(label_data_k, label_data_q)
loss_sup_dict = self.model.modelStudent(data_merge, branch="supervised")
loss_dict = {}
for key in loss_sup_dict.keys():
if key[:4] == "loss":
loss_dict[key] = loss_sup_dict[key] * 1
losses_sup = paddle.add_n(list(loss_dict.values()))
# norm loss when using gradient accumulation
losses_sup = losses_sup / self.cfg.optimize_rate
losses_sup.backward()
for key in loss_sup_dict.keys():
loss_dict[key + "_pseudo"] = paddle.to_tensor([0])
loss_dict["loss_tot"] = losses_sup
"""
semi-supervised training after burn-in stage
"""
if self.iter >= self.cfg.SEMISUPNET['BURN_UP_STEP']:
            # initialize the teacher model with the burn-in weights
if self.iter == self.cfg.SEMISUPNET['BURN_UP_STEP']:
print(
'Starting semi-supervised learning and load the teacher model.'
)
self._update_teacher_model(keep_rate=0.00)
# save burn-in model
if dist.get_world_size() < 2 or dist.get_rank() == 0:
print('saving burn-in model.')
save_name = 'burnIn'
epoch_id = self.iter // self.epoch_iter
save_model(self.model, self.optimizer, self.base_path,
save_name, epoch_id)
# Update teacher model with EMA
elif (self.iter + 1) % self.cfg.optimize_rate == 0:
self._update_teacher_model(
keep_rate=self.cfg.SEMISUPNET['EMA_KEEP_RATE'])
            # linearly warm up the pseudo-label loss weight after burn-in
pseudo_weight = self.cfg.SEMISUPNET['UNSUP_LOSS_WEIGHT']
pseudo_warmup_iter = self.cfg.SEMISUPNET['PSEUDO_WARM_UP_STEPS']
temp = self.iter - self.cfg.SEMISUPNET['BURN_UP_STEP']
if temp <= pseudo_warmup_iter:
pseudo_weight *= (temp / pseudo_warmup_iter)
# get teacher predictions on weak-augmented unlabeled data
with paddle.no_grad():
teacher_pred = self.model.modelTeacher(
unlabel_data_k, branch='semi_supervised')
# calculate unsupervised loss on strong-augmented unlabeled data
loss_unsup_dict = self.model.modelStudent(
unlabel_data_q,
branch="semi_supervised",
teacher_prediction=teacher_pred, )
for key in loss_unsup_dict.keys():
if key[-6:] == "pseudo":
loss_unsup_dict[key] = loss_unsup_dict[key] * pseudo_weight
losses_unsup = paddle.add_n(list(loss_unsup_dict.values()))
# norm loss when using gradient accumulation
losses_unsup = losses_unsup / self.cfg.optimize_rate
losses_unsup.backward()
loss_dict.update(loss_unsup_dict)
loss_dict["loss_tot"] += losses_unsup
return loss_dict
def export(self, output_dir='output_inference'):
self.model.eval()
model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0]
save_dir = os.path.join(output_dir, model_name)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
image_shape = None
if self.cfg.architecture in MOT_ARCH:
test_reader_name = 'TestMOTReader'
else:
test_reader_name = 'TestReader'
if 'inputs_def' in self.cfg[test_reader_name]:
inputs_def = self.cfg[test_reader_name]['inputs_def']
image_shape = inputs_def.get('image_shape', None)
# set image_shape=[3, -1, -1] as default
if image_shape is None:
image_shape = [3, -1, -1]
self.model.modelTeacher.eval()
if hasattr(self.model.modelTeacher, 'deploy'):
self.model.modelTeacher.deploy = True
# Save infer cfg
_dump_infer_config(self.cfg,
os.path.join(save_dir, 'infer_cfg.yml'), image_shape,
self.model.modelTeacher)
input_spec = [{
"image": InputSpec(
shape=[None] + image_shape, name='image'),
"im_shape": InputSpec(
shape=[None, 2], name='im_shape'),
"scale_factor": InputSpec(
shape=[None, 2], name='scale_factor')
}]
if self.cfg.architecture == 'DeepSORT':
input_spec[0].update({
"crops": InputSpec(
shape=[None, 3, 192, 64], name='crops')
})
static_model = paddle.jit.to_static(
self.model.modelTeacher, input_spec=input_spec)
        # NOTE: dy2st does not prune the program, but jit.save prunes it by the
        # input spec; prune the input spec here and save with the pruned spec
pruned_input_spec = _prune_input_spec(input_spec,
static_model.forward.main_program,
static_model.forward.outputs)
# dy2st and save model
if 'slim' not in self.cfg or self.cfg['slim_type'] != 'QAT':
paddle.jit.save(
static_model,
os.path.join(save_dir, 'model'),
input_spec=pruned_input_spec)
else:
self.cfg.slim.save_quantized_model(
self.model.modelTeacher,
os.path.join(save_dir, 'model'),
input_spec=pruned_input_spec)
logger.info("Export model and saved in {}".format(save_dir))
def _eval_with_loader(self, loader, mode="teacher"):
sample_num = 0
tic = time.time()
self._compose_callback.on_epoch_begin(self.status)
self.status['mode'] = 'eval'
# self.model.eval()
self.model.modelTeacher.eval()
self.model.modelStudent.eval()
for step_id, data in enumerate(loader):
self.status['step_id'] = step_id
self._compose_callback.on_step_begin(self.status)
if mode == "teacher":
outs = self.model.modelTeacher(data)
else:
outs = self.model.modelStudent(data)
# update metrics
for metric in self._metrics:
metric.update(data, outs)
sample_num += data['im_id'].numpy().shape[0]
self._compose_callback.on_step_end(self.status)
self.status['sample_num'] = sample_num
self.status['cost_time'] = time.time() - tic
# accumulate metric to log out
for metric in self._metrics:
metric.accumulate()
metric.log()
self._compose_callback.on_epoch_end(self.status)
        # reset metric states since metrics may be evaluated multiple times
self._reset_metrics()
def evaluate(self):
with paddle.no_grad():
self._eval_with_loader(self.loader)
@paddle.no_grad()
def _update_teacher_model(self, keep_rate=0.996):
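        # EMA update: teacher = keep_rate * teacher + (1 - keep_rate) * student.
        # keep_rate=0.0 copies the student weights outright (used once when
        # burn-in finishes).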
student_model_dict = copy.deepcopy(self.model.modelStudent.state_dict())
new_teacher_dict = dict()
for key, value in self.model.modelTeacher.state_dict().items():
if key in student_model_dict.keys():
v = student_model_dict[key] * (1 - keep_rate
) + value * keep_rate
v.stop_gradient = True
new_teacher_dict[key] = v
else:
raise Exception("{} is not found in student model".format(key))
self.model.modelTeacher.set_dict(new_teacher_dict)
class EnsembleTSModel(nn.Layer):
def __init__(self, modelTeacher, modelStudent):
super(EnsembleTSModel, self).__init__()
self.modelTeacher = modelTeacher
self.modelStudent = modelStudent
class Trainer_Semi_RTDETR(Trainer):
def __init__(self, cfg, mode='train'):
self.cfg = cfg
assert mode.lower() in ['train', 'eval', 'test'], \
"mode should be 'train', 'eval' or 'test'"
self.mode = mode.lower()
self.optimizer = None
self.is_loaded_weights = False
self.use_amp = self.cfg.get('amp', False)
self.amp_level = self.cfg.get('amp_level', 'O1')
self.custom_white_list = self.cfg.get('custom_white_list', None)
self.custom_black_list = self.cfg.get('custom_black_list', None)
# build data loader
capital_mode = self.mode.capitalize()
self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create(
'{}Dataset'.format(capital_mode))()
if self.mode == 'train':
self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create(
'UnsupTrainDataset')
self.loader = create('SemiTrainReader')(
self.dataset, self.dataset_unlabel, cfg.worker_num)
# build model
if 'model' not in self.cfg:
self.model = create(cfg.SSOD)
else:
self.model = self.cfg.model
self.is_loaded_weights = True
        # EvalDataset is built with a BatchSampler to evaluate on a single device
        # TODO: multi-device evaluate
if self.mode == 'eval':
self._eval_batch_sampler = paddle.io.BatchSampler(
self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
            # If the metric is VOC, collate_batch must be set to False.
if cfg.metric == 'VOC':
cfg['EvalReader']['collate_batch'] = False
self.loader = create('EvalReader')(self.dataset, cfg.worker_num,
self._eval_batch_sampler)
        # TestDataset is built after the user sets images; skip loader creation here
# build optimizer in train mode
if self.mode == 'train':
steps_per_epoch = len(self.loader)
if steps_per_epoch < 1:
logger.warning(
"Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader."
)
self.lr = create('LearningRate')(steps_per_epoch)
self.optimizer = create('OptimizerBuilder')(self.lr, self.model)
# Unstructured pruner is only enabled in the train mode.
if self.cfg.get('unstructured_prune'):
self.pruner = create('UnstructuredPruner')(self.model,
steps_per_epoch)
if self.use_amp and self.amp_level == 'O2':
self.model, self.optimizer = paddle.amp.decorate(
models=self.model,
optimizers=self.optimizer,
level=self.amp_level)
self._nranks = dist.get_world_size()
self._local_rank = dist.get_rank()
self.status = {}
self.start_epoch = 0
self.start_iter = 0
self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch
# initial default callbacks
self._init_callbacks()
# initial default metrics
self._init_metrics()
self._reset_metrics()
def load_semi_weights(self, t_weights, s_weights):
if self.is_loaded_weights:
return
self.start_epoch = 0
load_pretrain_weight(self.model.teacher, t_weights)
load_pretrain_weight(self.model.student, s_weights)
logger.info("Load teacher weights {} to start training".format(
t_weights))
logger.info("Load student weights {} to start training".format(
s_weights))
def resume_weights(self, weights, exchange=True):
# support Distill resume weights
if hasattr(self.model, 'student_model'):
self.start_epoch = load_weight(self.model.student_model, weights,
self.optimizer, exchange)
else:
self.start_iter, self.start_epoch = load_weight(
self.model, weights, self.optimizer, self.ema
if self.use_ema else None, exchange)
logger.debug("Resume weights of epoch {}".format(self.start_epoch))
logger.debug("Resume weights of iter {}".format(self.start_iter))
def train(self, validate=False):
assert self.mode == 'train', "Model not in 'train' mode"
Init_mark = False
if validate:
self.cfg.EvalDataset = create("EvalDataset")()
model = self.model
sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and
self.cfg.use_gpu and self._nranks > 1)
if sync_bn:
# self.model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(
# self.model)
model.teacher = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(
model.teacher)
model.student = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(
self.model.student)
if self.cfg.get('fleet', False):
# model = fleet.distributed_model(model)
model = fleet.distributed_model(model)
self.optimizer = fleet.distributed_optimizer(self.optimizer)
elif self._nranks > 1:
            find_unused_parameters = self.cfg.get('find_unused_parameters',
                                                  False)
model = paddle.DataParallel(
model, find_unused_parameters=find_unused_parameters)
if self.cfg.get('amp', False):
scaler = amp.GradScaler(
enable=self.cfg.use_gpu or self.cfg.use_npu,
init_loss_scaling=1024)
self.status.update({
'epoch_id': self.start_epoch,
'iter_id': self.start_iter,
# 'step_id': self.start_step,
'steps_per_epoch': len(self.loader),
})
self.status['batch_time'] = stats.SmoothedValue(
self.cfg.log_iter, fmt='{avg:.4f}')
self.status['data_time'] = stats.SmoothedValue(
self.cfg.log_iter, fmt='{avg:.4f}')
self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)
if self.cfg.get('print_flops', False):
flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
self.dataset, self.cfg.worker_num)
self._flops(flops_loader)
profiler_options = self.cfg.get('profiler_options', None)
self._compose_callback.on_train_begin(self.status)
iter_id = self.start_iter
self.status['iter_id'] = iter_id
self.status['eval_interval'] = self.cfg.eval_interval
self.status['save_interval'] = self.cfg.save_interval
for epoch_id in range(self.start_epoch, self.cfg.epoch):
self.status['mode'] = 'train'
self.status['epoch_id'] = epoch_id
self._compose_callback.on_epoch_begin(self.status)
self.loader.dataset_label.set_epoch(epoch_id)
self.loader.dataset_unlabel.set_epoch(epoch_id)
iter_tic = time.time()
if self._nranks > 1:
# print(model)
model._layers.teacher.eval()
model._layers.student.train()
else:
model.teacher.eval()
model.student.train()
iter_tic = time.time()
for step_id in range(len(self.loader)):
data = next(self.loader)
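                # each batch provides four views: weakly / strongly augmented
                # labeled data and weakly / strongly augmented unlabeled data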
data_sup_w, data_sup_s, data_unsup_w, data_unsup_s = data
data_sup_w['epoch_id'] = epoch_id
data_sup_s['epoch_id'] = epoch_id
data_unsup_w['epoch_id'] = epoch_id
data_unsup_s['epoch_id'] = epoch_id
data = [data_sup_w, data_sup_s, data_unsup_w, data_unsup_s]
iter_id += 1
self.status['data_time'].update(time.time() - iter_tic)
self.status['step_id'] = step_id
self.status['iter_id'] = iter_id
data.append(iter_id)
profiler.add_profiler_step(profiler_options)
self._compose_callback.on_step_begin(self.status)
if self.cfg.get('amp', False):
with amp.auto_cast(enable=self.cfg.use_gpu):
# model forward
if self._nranks > 1:
outputs = model._layers(data)
else:
outputs = model(data)
loss = outputs['loss']
scaled_loss = scaler.scale(loss)
scaled_loss.backward()
scaler.minimize(self.optimizer, scaled_loss)
else:
outputs = model(data)
loss = outputs['loss']
# model backward
loss.backward()
self.optimizer.step()
curr_lr = self.optimizer.get_lr()
self.lr.step()
if self.cfg.get('unstructured_prune'):
self.pruner.step()
self.optimizer.clear_grad()
# print(outputs)
# outputs=reduce_dict(outputs)
# if self.model.debug:
# check_gradient(model)
# self.check_gradient()
self.status['learning_rate'] = curr_lr
if self._nranks < 2 or self._local_rank == 0:
self.status['training_staus'].update(outputs)
self.status['batch_time'].update(time.time() - iter_tic)
if validate and (self._nranks < 2 or self._local_rank == 0) and \
((iter_id + 1) % self.cfg.eval_interval == 0):
if not hasattr(self, '_eval_loader'):
# build evaluation dataset and loader
self._eval_dataset = self.cfg.EvalDataset
self._eval_batch_sampler = \
paddle.io.BatchSampler(
self._eval_dataset,
batch_size=self.cfg.EvalReader['batch_size'])
                        # If the metric is VOC, collate_batch must be set to False.
if self.cfg.metric == 'VOC':
self.cfg['EvalReader']['collate_batch'] = False
self._eval_loader = create('EvalReader')(
self._eval_dataset,
self.cfg.worker_num,
batch_sampler=self._eval_batch_sampler)
# if validation in training is enabled, metrics should be re-init
# Init_mark makes sure this code will only execute once
                    if validate and not Init_mark:
Init_mark = True
self._init_metrics(validate=validate)
self._reset_metrics()
with paddle.no_grad():
self.status['save_best_model'] = True
self._eval_with_loader(self._eval_loader)
model._layers.student.train()
self._compose_callback.on_step_end(self.status)
iter_tic = time.time()
if self.cfg.get('unstructured_prune'):
self.pruner.update_params()
self._compose_callback.on_epoch_end(self.status)
self._compose_callback.on_train_end(self.status)
def _eval_with_loader(self, loader):
sample_num = 0
tic = time.time()
self._compose_callback.on_epoch_begin(self.status)
self.status['mode'] = 'eval'
self.model.eval()
if self.cfg.get('print_flops', False):
flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
self.dataset, self.cfg.worker_num, self._eval_batch_sampler)
self._flops(flops_loader)
print("*****teacher evaluate*****")
for step_id, data in enumerate(loader):
self.status['step_id'] = step_id
self._compose_callback.on_step_begin(self.status)
# forward
outs = self.model.teacher(data)
# update metrics
for metric in self._metrics:
metric.update(data, outs)
# multi-scale inputs: all inputs have same im_id
if isinstance(data, typing.Sequence):
sample_num += data[0]['im_id'].numpy().shape[0]
else:
sample_num += data['im_id'].numpy().shape[0]
self._compose_callback.on_step_end(self.status)
self.status['sample_num'] = sample_num
self.status['cost_time'] = time.time() - tic
# accumulate metric to log out
for metric in self._metrics:
metric.accumulate()
metric.log()
self._compose_callback.on_epoch_end(self.status)
        # reset metric states since metrics may be evaluated multiple times
self._reset_metrics()
print("*****student evaluate*****")
for step_id, data in enumerate(loader):
self.status['step_id'] = step_id
self._compose_callback.on_step_begin(self.status)
# forward
outs = self.model.student(data)
# update metrics
for metric in self._metrics:
metric.update(data, outs)
# multi-scale inputs: all inputs have same im_id
if isinstance(data, typing.Sequence):
sample_num += data[0]['im_id'].numpy().shape[0]
else:
sample_num += data['im_id'].numpy().shape[0]
self._compose_callback.on_step_end(self.status)
self.status['sample_num'] = sample_num
self.status['cost_time'] = time.time() - tic
# accumulate metric to log out
for metric in self._metrics:
metric.accumulate()
metric.log()
        # reset metric states since metrics may be evaluated multiple times
self._reset_metrics()
self.status['mode'] = 'train'
def evaluate(self):
with paddle.no_grad():
self._eval_with_loader(self.loader)
================================================
FILE: ppdet/ext_op/README.md
================================================
# Compiling Custom OPs
The rotated-box IoU OPs are implemented following the [custom external operator](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html) guide.
## 1. Requirements
- Paddle >= 2.0.1
- gcc 8.2
## 2. Installation
```
python setup.py install
```
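If your local Paddle was compiled with CUDA, `setup.py` also builds the `.cu` kernels and defines `PADDLE_WITH_CUDA`, so the ops can run on GPU tensors; otherwise only the CPU kernels are compiled.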
Once compiled, the ops are ready to use. Below is a usage example of `rbox_iou`:
```
# import the custom op
from ext_op import rbox_iou

import numpy as np
import paddle

paddle.set_device('gpu:0')
paddle.disable_static()

rbox1 = np.random.rand(13000, 5)
rbox2 = np.random.rand(7, 5)
pd_rbox1 = paddle.to_tensor(rbox1)
pd_rbox2 = paddle.to_tensor(rbox2)

iou = rbox_iou(pd_rbox1, pd_rbox2)
print('iou', iou)
```
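`matched_rbox_iou` is called the same way but computes one IoU per box pair (the i-th box of each input), so both inputs must share the same first dimension. A minimal sketch, assuming the extension was installed as above:
```
from ext_op import matched_rbox_iou
import numpy as np
import paddle

paddle.set_device('gpu:0')
# both inputs must contain the same number of boxes
rbox1 = paddle.to_tensor(np.random.rand(1000, 5))
rbox2 = paddle.to_tensor(np.random.rand(1000, 5))
iou = matched_rbox_iou(rbox1, rbox2)  # shape [1000], one IoU per pair
```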
## 3. Unit Tests
Run the unit tests to verify the correctness of the custom ops, for example:
```
python unittest/test_matched_rbox_iou.py
```
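The pairwise `rbox_iou` op has an analogous test:
```
python unittest/test_rbox_iou.py
```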
================================================
FILE: ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cc
================================================
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
#include "../rbox_iou/rbox_iou_utils.h"
#include "paddle/extension.h"
template <typename T>
void matched_rbox_iou_cpu_kernel(const int rbox_num, const T *rbox1_data_ptr,
const T *rbox2_data_ptr, T *output_data_ptr) {
int i;
for (i = 0; i < rbox_num; i++) {
    output_data_ptr[i] =
        rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + i * 5);
}
}
#define CHECK_INPUT_CPU(x) \
PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
std::vector<paddle::Tensor>
MatchedRboxIouCPUForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2) {
CHECK_INPUT_CPU(rbox1);
CHECK_INPUT_CPU(rbox2);
PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must be same dim");
auto rbox_num = rbox1.shape()[0];
auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::CPUPlace());
PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "matched_rbox_iou_cpu_kernel", ([&] {
matched_rbox_iou_cpu_kernel(
rbox_num, rbox1.data(),
rbox2.data(), output.data());
}));
return {output};
}
#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor>
MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2);
#endif
#define CHECK_INPUT_SAME(x1, x2)                                               \
  PD_CHECK(x1.place() == x2.place(), "inputs must be on the same place.")
std::vector<paddle::Tensor> MatchedRboxIouForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2) {
CHECK_INPUT_SAME(rbox1, rbox2);
if (rbox1.is_cpu()) {
return MatchedRboxIouCPUForward(rbox1, rbox2);
#ifdef PADDLE_WITH_CUDA
} else if (rbox1.is_gpu()) {
return MatchedRboxIouCUDAForward(rbox1, rbox2);
#endif
}
}
std::vector<std::vector<int64_t>>
MatchedRboxIouInferShape(std::vector<int64_t> rbox1_shape,
                         std::vector<int64_t> rbox2_shape) {
return {{rbox1_shape[0]}};
}
std::vector<paddle::DataType> MatchedRboxIouInferDtype(paddle::DataType t1,
paddle::DataType t2) {
return {t1};
}
PD_BUILD_OP(matched_rbox_iou)
.Inputs({"RBOX1", "RBOX2"})
.Outputs({"Output"})
.SetKernelFn(PD_KERNEL(MatchedRboxIouForward))
.SetInferShapeFn(PD_INFER_SHAPE(MatchedRboxIouInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(MatchedRboxIouInferDtype));
================================================
FILE: ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cu
================================================
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
#include "../rbox_iou/rbox_iou_utils.h"
#include "paddle/extension.h"
template <typename T>
__global__ void
matched_rbox_iou_cuda_kernel(const int rbox_num, const T *rbox1_data_ptr,
const T *rbox2_data_ptr, T *output_data_ptr) {
for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < rbox_num;
tid += blockDim.x * gridDim.x) {
    output_data_ptr[tid] =
        rbox_iou_single<T>(rbox1_data_ptr + tid * 5, rbox2_data_ptr + tid * 5);
}
}
#define CHECK_INPUT_GPU(x) \
PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
std::vector<paddle::Tensor>
MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2) {
CHECK_INPUT_GPU(rbox1);
CHECK_INPUT_GPU(rbox2);
PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must be same dim");
auto rbox_num = rbox1.shape()[0];
auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::GPUPlace());
const int thread_per_block = 512;
const int block_per_grid = CeilDiv(rbox_num, thread_per_block);
  PD_DISPATCH_FLOATING_TYPES(
      rbox1.type(), "matched_rbox_iou_cuda_kernel", ([&] {
        matched_rbox_iou_cuda_kernel<
            data_t><<<block_per_grid, thread_per_block, 0, rbox1.stream()>>>(
            rbox_num, rbox1.data<data_t>(), rbox2.data<data_t>(),
            output.data<data_t>());
      }));
return {output};
}
================================================
FILE: ppdet/ext_op/csrc/nms_rotated/nms_rotated.cc
================================================
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../rbox_iou/rbox_iou_utils.h"
#include "paddle/extension.h"
template <typename T>
void nms_rotated_cpu_kernel(const T *boxes_data, const float threshold,
const int64_t num_boxes, int64_t *num_keep_boxes,
int64_t *output_data) {
int num_masks = CeilDiv(num_boxes, 64);
  std::vector<int64_t> masks(num_masks, 0);
for (int64_t i = 0; i < num_boxes; ++i) {
if (masks[i / 64] & 1ULL << (i % 64))
continue;
T box_1[5];
for (int k = 0; k < 5; ++k) {
box_1[k] = boxes_data[i * 5 + k];
}
for (int64_t j = i + 1; j < num_boxes; ++j) {
if (masks[j / 64] & 1ULL << (j % 64))
continue;
T box_2[5];
for (int k = 0; k < 5; ++k) {
box_2[k] = boxes_data[j * 5 + k];
}
      if (rbox_iou_single<T>(box_1, box_2) > threshold) {
masks[j / 64] |= 1ULL << (j % 64);
}
}
}
int64_t output_data_idx = 0;
for (int64_t i = 0; i < num_boxes; ++i) {
if (masks[i / 64] & 1ULL << (i % 64))
continue;
output_data[output_data_idx++] = i;
}
*num_keep_boxes = output_data_idx;
for (; output_data_idx < num_boxes; ++output_data_idx) {
output_data[output_data_idx] = 0;
}
}
#define CHECK_INPUT_CPU(x) \
PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
std::vector<paddle::Tensor> NMSRotatedCPUForward(const paddle::Tensor &boxes,
const paddle::Tensor &scores,
float threshold) {
CHECK_INPUT_CPU(boxes);
CHECK_INPUT_CPU(scores);
auto num_boxes = boxes.shape()[0];
auto order_t =
std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true));
auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0);
auto keep =
paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace());
int64_t num_keep_boxes = 0;
PD_DISPATCH_FLOATING_TYPES(boxes.type(), "nms_rotated_cpu_kernel", ([&] {
nms_rotated_cpu_kernel(
boxes_sorted.data(), threshold,
num_boxes, &num_keep_boxes,
keep.data());
}));
keep = keep.slice(0, num_keep_boxes);
return {paddle::gather(order_t, keep, /* axis=*/0)};
}
#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor> NMSRotatedCUDAForward(const paddle::Tensor &boxes,
const paddle::Tensor &scores,
float threshold);
#endif
std::vector<paddle::Tensor> NMSRotatedForward(const paddle::Tensor &boxes,
const paddle::Tensor &scores,
float threshold) {
if (boxes.is_cpu()) {
return NMSRotatedCPUForward(boxes, scores, threshold);
#ifdef PADDLE_WITH_CUDA
} else if (boxes.is_gpu()) {
return NMSRotatedCUDAForward(boxes, scores, threshold);
#endif
}
}
std::vector<std::vector<int64_t>>
NMSRotatedInferShape(std::vector<int64_t> boxes_shape,
                     std::vector<int64_t> scores_shape) {
return {{-1}};
}
std::vector<paddle::DataType> NMSRotatedInferDtype(paddle::DataType t1,
paddle::DataType t2) {
return {paddle::DataType::INT64};
}
PD_BUILD_OP(nms_rotated)
.Inputs({"Boxes", "Scores"})
.Outputs({"Output"})
.Attrs({"threshold: float"})
.SetKernelFn(PD_KERNEL(NMSRotatedForward))
.SetInferShapeFn(PD_INFER_SHAPE(NMSRotatedInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(NMSRotatedInferDtype));
================================================
FILE: ppdet/ext_op/csrc/nms_rotated/nms_rotated.cu
================================================
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../rbox_iou/rbox_iou_utils.h"
#include "paddle/extension.h"
static const int64_t threadsPerBlock = sizeof(int64_t) * 8;
template <typename T>
__global__ void
nms_rotated_cuda_kernel(const T *boxes_data, const float threshold,
const int64_t num_boxes, int64_t *masks) {
auto raw_start = blockIdx.y;
auto col_start = blockIdx.x;
if (raw_start > col_start)
return;
const int raw_last_storage =
min(num_boxes - raw_start * threadsPerBlock, threadsPerBlock);
const int col_last_storage =
min(num_boxes - col_start * threadsPerBlock, threadsPerBlock);
if (threadIdx.x < raw_last_storage) {
int64_t mask = 0;
auto current_box_idx = raw_start * threadsPerBlock + threadIdx.x;
const T *current_box = boxes_data + current_box_idx * 5;
for (int i = 0; i < col_last_storage; ++i) {
const T *target_box = boxes_data + (col_start * threadsPerBlock + i) * 5;
      if (rbox_iou_single<T>(current_box, target_box) > threshold) {
mask |= 1ULL << i;
}
}
const int blocks_per_line = CeilDiv(num_boxes, threadsPerBlock);
masks[current_box_idx * blocks_per_line + col_start] = mask;
}
}
#define CHECK_INPUT_GPU(x) \
PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
std::vector<paddle::Tensor> NMSRotatedCUDAForward(const paddle::Tensor &boxes,
const paddle::Tensor &scores,
float threshold) {
CHECK_INPUT_GPU(boxes);
CHECK_INPUT_GPU(scores);
auto num_boxes = boxes.shape()[0];
auto order_t =
std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true));
auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0);
const auto blocks_per_line = CeilDiv(num_boxes, threadsPerBlock);
dim3 block(threadsPerBlock);
dim3 grid(blocks_per_line, blocks_per_line);
auto mask_dev = paddle::empty({num_boxes * blocks_per_line},
paddle::DataType::INT64, paddle::GPUPlace());
  PD_DISPATCH_FLOATING_TYPES(
      boxes.type(), "nms_rotated_cuda_kernel", ([&] {
        nms_rotated_cuda_kernel<data_t><<<grid, block, 0, boxes.stream()>>>(
            boxes_sorted.data<data_t>(), threshold, num_boxes,
            mask_dev.data<int64_t>());
      }));
auto mask_host = mask_dev.copy_to(paddle::CPUPlace(), true);
auto keep_host =
paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace());
  int64_t *keep_host_ptr = keep_host.data<int64_t>();
  int64_t *mask_host_ptr = mask_host.data<int64_t>();
  std::vector<int64_t> remv(blocks_per_line);
int64_t last_box_num = 0;
for (int64_t i = 0; i < num_boxes; ++i) {
auto remv_element_id = i / threadsPerBlock;
auto remv_bit_id = i % threadsPerBlock;
if (!(remv[remv_element_id] & 1ULL << remv_bit_id)) {
keep_host_ptr[last_box_num++] = i;
int64_t *current_mask = mask_host_ptr + i * blocks_per_line;
for (auto j = remv_element_id; j < blocks_per_line; ++j) {
remv[j] |= current_mask[j];
}
}
}
keep_host = keep_host.slice(0, last_box_num);
auto keep_dev = keep_host.copy_to(paddle::GPUPlace(), true);
return {paddle::gather(order_t, keep_dev, /* axis=*/0)};
}
================================================
FILE: ppdet/ext_op/csrc/rbox_iou/rbox_iou.cc
================================================
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
#include "paddle/extension.h"
#include "rbox_iou_utils.h"
template <typename T>
void rbox_iou_cpu_kernel(const int rbox1_num, const int rbox2_num,
const T *rbox1_data_ptr, const T *rbox2_data_ptr,
T *output_data_ptr) {
int i, j;
for (i = 0; i < rbox1_num; i++) {
for (j = 0; j < rbox2_num; j++) {
int offset = i * rbox2_num + j;
      output_data_ptr[offset] =
          rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + j * 5);
}
}
}
#define CHECK_INPUT_CPU(x) \
PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
std::vector<paddle::Tensor> RboxIouCPUForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2) {
CHECK_INPUT_CPU(rbox1);
CHECK_INPUT_CPU(rbox2);
auto rbox1_num = rbox1.shape()[0];
auto rbox2_num = rbox2.shape()[0];
auto output =
paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(), paddle::CPUPlace());
PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "rbox_iou_cpu_kernel", ([&] {
rbox_iou_cpu_kernel(
rbox1_num, rbox2_num, rbox1.data(),
rbox2.data(), output.data());
}));
return {output};
}
#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2);
#endif
#define CHECK_INPUT_SAME(x1, x2)                                               \
  PD_CHECK(x1.place() == x2.place(), "inputs must be on the same place.")
std::vector<paddle::Tensor> RboxIouForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2) {
CHECK_INPUT_SAME(rbox1, rbox2);
if (rbox1.is_cpu()) {
return RboxIouCPUForward(rbox1, rbox2);
#ifdef PADDLE_WITH_CUDA
} else if (rbox1.is_gpu()) {
return RboxIouCUDAForward(rbox1, rbox2);
#endif
}
}
std::vector<std::vector<int64_t>>
RboxIouInferShape(std::vector<int64_t> rbox1_shape,
                  std::vector<int64_t> rbox2_shape) {
return {{rbox1_shape[0], rbox2_shape[0]}};
}
std::vector<paddle::DataType> RboxIouInferDtype(paddle::DataType t1,
paddle::DataType t2) {
return {t1};
}
PD_BUILD_OP(rbox_iou)
.Inputs({"RBox1", "RBox2"})
.Outputs({"Output"})
.SetKernelFn(PD_KERNEL(RboxIouForward))
.SetInferShapeFn(PD_INFER_SHAPE(RboxIouInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(RboxIouInferDtype));
================================================
FILE: ppdet/ext_op/csrc/rbox_iou/rbox_iou.cu
================================================
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
#include "paddle/extension.h"
#include "rbox_iou_utils.h"
// 2D block with 32 * 16 = 512 threads per block
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 16;
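// Each block stages up to 32 rbox1 and 16 rbox2 boxes in shared memory, then
// computes the corresponding 32 x 16 tile of the IoU matrix in parallel.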
template <typename T>
__global__ void rbox_iou_cuda_kernel(const int rbox1_num, const int rbox2_num,
const T *rbox1_data_ptr,
const T *rbox2_data_ptr,
T *output_data_ptr) {
// get row_start and col_start
const int rbox1_block_idx = blockIdx.x * blockDim.x;
const int rbox2_block_idx = blockIdx.y * blockDim.y;
const int rbox1_thread_num = min(rbox1_num - rbox1_block_idx, blockDim.x);
const int rbox2_thread_num = min(rbox2_num - rbox2_block_idx, blockDim.y);
__shared__ T block_boxes1[BLOCK_DIM_X * 5];
__shared__ T block_boxes2[BLOCK_DIM_Y * 5];
// It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y
if (threadIdx.x < rbox1_thread_num && threadIdx.y == 0) {
block_boxes1[threadIdx.x * 5 + 0] =
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 0];
block_boxes1[threadIdx.x * 5 + 1] =
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 1];
block_boxes1[threadIdx.x * 5 + 2] =
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 2];
block_boxes1[threadIdx.x * 5 + 3] =
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 3];
block_boxes1[threadIdx.x * 5 + 4] =
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 4];
}
// threadIdx.x < BLOCK_DIM_Y=rbox2_thread_num, just use same condition as
// above: threadIdx.y == 0
if (threadIdx.x < rbox2_thread_num && threadIdx.y == 0) {
block_boxes2[threadIdx.x * 5 + 0] =
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 0];
block_boxes2[threadIdx.x * 5 + 1] =
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 1];
block_boxes2[threadIdx.x * 5 + 2] =
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 2];
block_boxes2[threadIdx.x * 5 + 3] =
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 3];
block_boxes2[threadIdx.x * 5 + 4] =
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 4];
}
// sync
__syncthreads();
if (threadIdx.x < rbox1_thread_num && threadIdx.y < rbox2_thread_num) {
int offset = (rbox1_block_idx + threadIdx.x) * rbox2_num + rbox2_block_idx +
threadIdx.y;
    output_data_ptr[offset] = rbox_iou_single<T>(
        block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5);
}
}
#define CHECK_INPUT_GPU(x) \
PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2) {
CHECK_INPUT_GPU(rbox1);
CHECK_INPUT_GPU(rbox2);
auto rbox1_num = rbox1.shape()[0];
auto rbox2_num = rbox2.shape()[0];
auto output =
paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(), paddle::GPUPlace());
const int blocks_x = CeilDiv(rbox1_num, BLOCK_DIM_X);
const int blocks_y = CeilDiv(rbox2_num, BLOCK_DIM_Y);
dim3 blocks(blocks_x, blocks_y);
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
  PD_DISPATCH_FLOATING_TYPES(
      rbox1.type(), "rbox_iou_cuda_kernel", ([&] {
        rbox_iou_cuda_kernel<data_t><<<blocks, threads, 0, rbox1.stream()>>>(
            rbox1_num, rbox2_num, rbox1.data<data_t>(), rbox2.data<data_t>(),
            output.data<data_t>());
      }));
return {output};
}
================================================
FILE: ppdet/ext_op/csrc/rbox_iou/rbox_iou_utils.h
================================================
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
#pragma once
#include <cassert>
#include <cmath>
#include <vector>
#ifdef __CUDACC__
// Designates functions callable from the host (CPU) and the device (GPU)
#define HOST_DEVICE __host__ __device__
#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__
#else
#include <algorithm>
#define HOST_DEVICE
#define HOST_DEVICE_INLINE HOST_DEVICE inline
#endif
namespace {
template <typename T> struct RotatedBox { T x_ctr, y_ctr, w, h, a; };
template <typename T> struct Point {
T x, y;
HOST_DEVICE_INLINE Point(const T &px = 0, const T &py = 0) : x(px), y(py) {}
HOST_DEVICE_INLINE Point operator+(const Point &p) const {
return Point(x + p.x, y + p.y);
}
HOST_DEVICE_INLINE Point &operator+=(const Point &p) {
x += p.x;
y += p.y;
return *this;
}
HOST_DEVICE_INLINE Point operator-(const Point &p) const {
return Point(x - p.x, y - p.y);
}
HOST_DEVICE_INLINE Point operator*(const T coeff) const {
return Point(x * coeff, y * coeff);
}
};
template <typename T>
HOST_DEVICE_INLINE T dot_2d(const Point<T> &A, const Point<T> &B) {
return A.x * B.x + A.y * B.y;
}
template <typename T>
HOST_DEVICE_INLINE T cross_2d(const Point<T> &A, const Point<T> &B) {
return A.x * B.y - B.x * A.y;
}
template <typename T>
HOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox<T> &box,
                                             Point<T> (&pts)[4]) {
// M_PI / 180. == 0.01745329251
// double theta = box.a * 0.01745329251;
// MODIFIED
double theta = box.a;
T cosTheta2 = (T)cos(theta) * 0.5f;
T sinTheta2 = (T)sin(theta) * 0.5f;
// y: top --> down; x: left --> right
pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
pts[2].x = 2 * box.x_ctr - pts[0].x;
pts[2].y = 2 * box.y_ctr - pts[0].y;
pts[3].x = 2 * box.x_ctr - pts[1].x;
pts[3].y = 2 * box.y_ctr - pts[1].y;
}
template <typename T>
HOST_DEVICE_INLINE int get_intersection_points(const Point<T> (&pts1)[4],
                                               const Point<T> (&pts2)[4],
                                               Point<T> (&intersections)[24]) {
// Line vector
// A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
  Point<T> vec1[4], vec2[4];
for (int i = 0; i < 4; i++) {
vec1[i] = pts1[(i + 1) % 4] - pts1[i];
vec2[i] = pts2[(i + 1) % 4] - pts2[i];
}
// Line test - test all line combos for intersection
int num = 0; // number of intersections
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
// Solve for 2x2 Ax=b
      T det = cross_2d<T>(vec2[j], vec1[i]);
// This takes care of parallel lines
if (fabs(det) <= 1e-14) {
continue;
}
auto vec12 = pts2[j] - pts1[i];
      T t1 = cross_2d<T>(vec2[j], vec12) / det;
      T t2 = cross_2d<T>(vec1[i], vec12) / det;
if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) {
intersections[num++] = pts1[i] + vec1[i] * t1;
}
}
}
// Check for vertices of rect1 inside rect2
{
const auto &AB = vec2[0];
const auto &DA = vec2[3];
auto ABdotAB = dot_2d(AB, AB);
auto ADdotAD = dot_2d(DA, DA);
for (int i = 0; i < 4; i++) {
// assume ABCD is the rectangle, and P is the point to be judged
// P is inside ABCD iff. P's projection on AB lies within AB
// and P's projection on AD lies within AD
auto AP = pts1[i] - pts2[0];
auto APdotAB = dot_2d(AP, AB);
auto APdotAD = -dot_2d(AP, DA);
if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
(APdotAD <= ADdotAD)) {
intersections[num++] = pts1[i];
}
}
}
// Reverse the check - check for vertices of rect2 inside rect1
{
const auto &AB = vec1[0];
const auto &DA = vec1[3];
auto ABdotAB = dot_2d(AB, AB);
auto ADdotAD = dot_2d(DA, DA);
for (int i = 0; i < 4; i++) {
auto AP = pts2[i] - pts1[0];
auto APdotAB = dot_2d(AP, AB);
auto APdotAD = -dot_2d(AP, DA);
if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
(APdotAD <= ADdotAD)) {
intersections[num++] = pts2[i];
}
}
}
return num;
}
template <typename T>
HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24],
                                          const int &num_in, Point<T> (&q)[24],
                                          bool shift_to_zero = false) {
assert(num_in >= 2);
// Step 1:
// Find point with minimum y
// if more than 1 points have the same minimum y,
// pick the one with the minimum x.
int t = 0;
for (int i = 1; i < num_in; i++) {
if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
t = i;
}
}
auto &start = p[t]; // starting point
// Step 2:
// Subtract starting point from every points (for sorting in the next step)
for (int i = 0; i < num_in; i++) {
q[i] = p[i] - start;
}
// Swap the starting point to position 0
auto tmp = q[0];
q[0] = q[t];
q[t] = tmp;
// Step 3:
// Sort point 1 ~ num_in according to their relative cross-product values
// (essentially sorting according to angles)
// If the angles are the same, sort according to their distance to origin
T dist[24];
for (int i = 0; i < num_in; i++) {
    dist[i] = dot_2d<T>(q[i], q[i]);
}
#ifdef __CUDACC__
// CUDA version
// In the future, we can potentially use thrust
// for sorting here to improve speed (though not guaranteed)
for (int i = 1; i < num_in - 1; i++) {
for (int j = i + 1; j < num_in; j++) {
      T crossProduct = cross_2d<T>(q[i], q[j]);
if ((crossProduct < -1e-6) ||
(fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
auto q_tmp = q[i];
q[i] = q[j];
q[j] = q_tmp;
auto dist_tmp = dist[i];
dist[i] = dist[j];
dist[j] = dist_tmp;
}
}
}
#else
// CPU version
  std::sort(q + 1, q + num_in,
            [](const Point<T> &A, const Point<T> &B) -> bool {
              T temp = cross_2d<T>(A, B);
              if (fabs(temp) < 1e-6) {
                return dot_2d<T>(A, A) < dot_2d<T>(B, B);
} else {
return temp > 0;
}
});
#endif
// Step 4:
// Make sure there are at least 2 points (that don't overlap with each other)
// in the stack
int k; // index of the non-overlapped second point
for (k = 1; k < num_in; k++) {
if (dist[k] > 1e-8) {
break;
}
}
if (k == num_in) {
// We reach the end, which means the convex hull is just one point
q[0] = p[t];
return 1;
}
q[1] = q[k];
int m = 2; // 2 points in the stack
// Step 5:
// Finally we can start the scanning process.
// When a non-convex relationship between the 3 points is found
// (either concave shape or duplicated points),
// we pop the previous point from the stack
// until the 3-point relationship is convex again, or
// until the stack only contains two points
for (int i = k + 1; i < num_in; i++) {
while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {
m--;
}
q[m++] = q[i];
}
// Step 6 (Optional):
// In general sense we need the original coordinates, so we
// need to shift the points back (reverting Step 2)
// But if we're only interested in getting the area/perimeter of the shape
// We can simply return.
if (!shift_to_zero) {
for (int i = 0; i < m; i++) {
q[i] += start;
}
}
return m;
}
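// Shoelace formula: area of the convex polygon q[0..m-1], whose vertices are
// assumed to be in sorted order around the hull.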
template <typename T>
HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int &m) {
if (m <= 2) {
return 0;
}
T area = 0;
for (int i = 1; i < m - 1; i++) {
    area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));
}
return area / 2.0;
}
template <typename T>
HOST_DEVICE_INLINE T rboxes_intersection(const RotatedBox<T> &box1,
                                         const RotatedBox<T> &box2) {
// There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
// from rotated_rect_intersection_pts
  Point<T> intersectPts[24], orderedPts[24];
  Point<T> pts1[4];
  Point<T> pts2[4];
get_rotated_vertices(box1, pts1);
get_rotated_vertices(box2, pts2);
  int num = get_intersection_points<T>(pts1, pts2, intersectPts);
if (num <= 2) {
return 0.0;
}
// Convex Hull to order the intersection points in clockwise order and find
// the contour area.
  int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);
return polygon_area(orderedPts, num_convex);
}
} // namespace
template <typename T>
HOST_DEVICE_INLINE T rbox_iou_single(T const *const box1_raw,
T const *const box2_raw) {
// shift center to the middle point to achieve higher precision in result
  RotatedBox<T> box1, box2;
auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;
auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;
box1.x_ctr = box1_raw[0] - center_shift_x;
box1.y_ctr = box1_raw[1] - center_shift_y;
box1.w = box1_raw[2];
box1.h = box1_raw[3];
box1.a = box1_raw[4];
box2.x_ctr = box2_raw[0] - center_shift_x;
box2.y_ctr = box2_raw[1] - center_shift_y;
box2.w = box2_raw[2];
box2.h = box2_raw[3];
box2.a = box2_raw[4];
if (box1.w < 1e-2 || box1.h < 1e-2 || box2.w < 1e-2 || box2.h < 1e-2) {
return 0.f;
}
const T area1 = box1.w * box1.h;
const T area2 = box2.w * box2.h;
  const T intersection = rboxes_intersection<T>(box1, box2);
const T iou = intersection / (area1 + area2 - intersection);
return iou;
}
/**
Computes ceil(a / b)
*/
HOST_DEVICE inline int CeilDiv(const int a, const int b) {
return (a + b - 1) / b;
}
================================================
FILE: ppdet/ext_op/setup.py
================================================
import os
import glob
import paddle
from paddle.utils.cpp_extension import CppExtension, CUDAExtension, setup
def get_extensions():
root_dir = os.path.dirname(os.path.abspath(__file__))
ext_root_dir = os.path.join(root_dir, 'csrc')
sources = []
for ext_name in os.listdir(ext_root_dir):
ext_dir = os.path.join(ext_root_dir, ext_name)
source = glob.glob(os.path.join(ext_dir, '*.cc'))
kwargs = dict()
if paddle.device.is_compiled_with_cuda():
source += glob.glob(os.path.join(ext_dir, '*.cu'))
if not source:
continue
sources += source
if paddle.device.is_compiled_with_cuda():
extension = CUDAExtension(
sources, extra_compile_args={'cxx': ['-DPADDLE_WITH_CUDA']})
else:
extension = CppExtension(sources)
return extension
if __name__ == "__main__":
setup(name='ext_op', ext_modules=get_extensions())
================================================
FILE: ppdet/ext_op/unittest/test_matched_rbox_iou.py
================================================
import numpy as np
import sys
import time
from shapely.geometry import Polygon
import paddle
import unittest
from ext_op import matched_rbox_iou
def rbox2poly_single(rrect, get_best_begin_point=False):
"""
rrect:[x_ctr,y_ctr,w,h,angle]
to
poly:[x0,y0,x1,y1,x2,y2,x3,y3]
"""
x_ctr, y_ctr, width, height, angle = rrect[:5]
tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2
# rect 2x4
rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]])
R = np.array([[np.cos(angle), -np.sin(angle)],
[np.sin(angle), np.cos(angle)]])
# poly
poly = R.dot(rect)
x0, x1, x2, x3 = poly[0, :4] + x_ctr
y0, y1, y2, y3 = poly[1, :4] + y_ctr
poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float64)
return poly
def intersection(g, p):
"""
Intersection.
"""
g = g[:8].reshape((4, 2))
p = p[:8].reshape((4, 2))
a = g
b = p
use_filter = True
if use_filter:
# step1:
inter_x1 = np.maximum(np.min(a[:, 0]), np.min(b[:, 0]))
inter_x2 = np.minimum(np.max(a[:, 0]), np.max(b[:, 0]))
inter_y1 = np.maximum(np.min(a[:, 1]), np.min(b[:, 1]))
inter_y2 = np.minimum(np.max(a[:, 1]), np.max(b[:, 1]))
if inter_x1 >= inter_x2 or inter_y1 >= inter_y2:
return 0.
x1 = np.minimum(np.min(a[:, 0]), np.min(b[:, 0]))
x2 = np.maximum(np.max(a[:, 0]), np.max(b[:, 0]))
y1 = np.minimum(np.min(a[:, 1]), np.min(b[:, 1]))
y2 = np.maximum(np.max(a[:, 1]), np.max(b[:, 1]))
if x1 >= x2 or y1 >= y2 or (x2 - x1) < 2 or (y2 - y1) < 2:
return 0.
g = Polygon(g)
p = Polygon(p)
if not g.is_valid or not p.is_valid:
return 0
inter = Polygon(g).intersection(Polygon(p)).area
union = g.area + p.area - inter
if union == 0:
return 0
else:
return inter / union
def matched_rbox_overlaps(anchors, gt_bboxes, use_cv2=False):
"""
Args:
        anchors: [M, 5] x_ctr,y_ctr,w,h,angle
        gt_bboxes: [M, 5] x_ctr,y_ctr,w,h,angle
    Returns:
        matched_iou: [M]
"""
assert anchors.shape[1] == 5
assert gt_bboxes.shape[1] == 5
gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes]
anchors_ploy = [rbox2poly_single(e) for e in anchors]
num = len(anchors_ploy)
iou = np.zeros((num, ), dtype=np.float64)
start_time = time.time()
for i in range(num):
try:
iou[i] = intersection(gt_bboxes_ploy[i], anchors_ploy[i])
except Exception as e:
            print('cur gt_bboxes_ploy[i]', gt_bboxes_ploy[i],
                  'anchors_ploy[i]', anchors_ploy[i], e)
return iou
def gen_sample(n):
rbox = np.random.rand(n, 5)
rbox[:, 0:4] = rbox[:, 0:4] * 0.45 + 0.001
rbox[:, 4] = rbox[:, 4] - 0.5
return rbox
class MatchedRBoxIoUTest(unittest.TestCase):
def setUp(self):
self.initTestCase()
self.rbox1 = gen_sample(self.n)
self.rbox2 = gen_sample(self.n)
def initTestCase(self):
self.n = 1000
def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2):
self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg)
def get_places(self):
places = [paddle.CPUPlace()]
if paddle.device.is_compiled_with_cuda():
places.append(paddle.CUDAPlace(0))
return places
def check_output(self, place):
paddle.disable_static()
pd_rbox1 = paddle.to_tensor(self.rbox1, place=place)
pd_rbox2 = paddle.to_tensor(self.rbox2, place=place)
actual_t = matched_rbox_iou(pd_rbox1, pd_rbox2).numpy()
poly_rbox1 = self.rbox1
poly_rbox2 = self.rbox2
poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024
poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024
expect_t = matched_rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False)
self.assertAllClose(
actual_t,
expect_t,
msg="rbox_iou has diff at {} \nExpect {}\nBut got {}".format(
str(place), str(expect_t), str(actual_t)))
def test_output(self):
places = self.get_places()
for place in places:
self.check_output(place)
if __name__ == "__main__":
unittest.main()
================================================
FILE: ppdet/ext_op/unittest/test_rbox_iou.py
================================================
import numpy as np
import sys
import time
from shapely.geometry import Polygon
import paddle
import unittest
from ext_op import rbox_iou
def rbox2poly_single(rrect, get_best_begin_point=False):
"""
rrect:[x_ctr,y_ctr,w,h,angle]
to
poly:[x0,y0,x1,y1,x2,y2,x3,y3]
"""
x_ctr, y_ctr, width, height, angle = rrect[:5]
tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2
# rect 2x4
rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]])
R = np.array([[np.cos(angle), -np.sin(angle)],
[np.sin(angle), np.cos(angle)]])
# poly
poly = R.dot(rect)
x0, x1, x2, x3 = poly[0, :4] + x_ctr
y0, y1, y2, y3 = poly[1, :4] + y_ctr
poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float64)
return poly
def intersection(g, p):
"""
Intersection.
"""
g = g[:8].reshape((4, 2))
p = p[:8].reshape((4, 2))
a = g
b = p
use_filter = True
if use_filter:
# step1:
inter_x1 = np.maximum(np.min(a[:, 0]), np.min(b[:, 0]))
inter_x2 = np.minimum(np.max(a[:, 0]), np.max(b[:, 0]))
inter_y1 = np.maximum(np.min(a[:, 1]), np.min(b[:, 1]))
inter_y2 = np.minimum(np.max(a[:, 1]), np.max(b[:, 1]))
if inter_x1 >= inter_x2 or inter_y1 >= inter_y2:
return 0.
x1 = np.minimum(np.min(a[:, 0]), np.min(b[:, 0]))
x2 = np.maximum(np.max(a[:, 0]), np.max(b[:, 0]))
y1 = np.minimum(np.min(a[:, 1]), np.min(b[:, 1]))
y2 = np.maximum(np.max(a[:, 1]), np.max(b[:, 1]))
if x1 >= x2 or y1 >= y2 or (x2 - x1) < 2 or (y2 - y1) < 2:
return 0.
g = Polygon(g)
p = Polygon(p)
if not g.is_valid or not p.is_valid:
return 0
inter = Polygon(g).intersection(Polygon(p)).area
union = g.area + p.area - inter
if union == 0:
return 0
else:
return inter / union
def rbox_overlaps(anchors, gt_bboxes, use_cv2=False):
"""
Args:
        anchors: [NA, 5] x_ctr,y_ctr,w,h,angle
        gt_bboxes: [M, 5] x_ctr,y_ctr,w,h,angle
Returns:
iou: [NA, M]
"""
assert anchors.shape[1] == 5
assert gt_bboxes.shape[1] == 5
gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes]
anchors_ploy = [rbox2poly_single(e) for e in anchors]
num_gt, num_anchors = len(gt_bboxes_ploy), len(anchors_ploy)
iou = np.zeros((num_anchors, num_gt), dtype=np.float64)
start_time = time.time()
for i in range(num_anchors):
for j in range(num_gt):
try:
iou[i, j] = intersection(anchors_ploy[i], gt_bboxes_ploy[j])
except Exception as e:
print('cur anchors_ploy[i]', anchors_ploy[i],
'gt_bboxes_ploy[j]', gt_bboxes_ploy[j], e)
return iou
def gen_sample(n):
rbox = np.random.rand(n, 5)
rbox[:, 0:4] = rbox[:, 0:4] * 0.45 + 0.001
rbox[:, 4] = rbox[:, 4] - 0.5
return rbox
class RBoxIoUTest(unittest.TestCase):
def setUp(self):
self.initTestCase()
self.rbox1 = gen_sample(self.n)
self.rbox2 = gen_sample(self.m)
def initTestCase(self):
self.n = 13000
self.m = 7
def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2):
self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg)
def get_places(self):
places = [paddle.CPUPlace()]
if paddle.device.is_compiled_with_cuda():
places.append(paddle.CUDAPlace(0))
return places
def check_output(self, place):
paddle.disable_static()
pd_rbox1 = paddle.to_tensor(self.rbox1, place=place)
pd_rbox2 = paddle.to_tensor(self.rbox2, place=place)
actual_t = rbox_iou(pd_rbox1, pd_rbox2).numpy()
poly_rbox1 = self.rbox1
poly_rbox2 = self.rbox2
poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024
poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024
expect_t = rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False)
self.assertAllClose(
actual_t,
expect_t,
msg="rbox_iou has diff at {} \nExpect {}\nBut got {}".format(
str(place), str(expect_t), str(actual_t)))
def test_output(self):
places = self.get_places()
for place in places:
self.check_output(place)
if __name__ == "__main__":
unittest.main()
================================================
FILE: ppdet/metrics/__init__.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import metrics
from . import keypoint_metrics
from .metrics import *
from .keypoint_metrics import *
from .pose3d_metrics import *
__all__ = metrics.__all__ + keypoint_metrics.__all__
from . import mot_metrics
from .mot_metrics import *
__all__ += mot_metrics.__all__
from . import mcmot_metrics
from .mcmot_metrics import *
__all__ += mcmot_metrics.__all__
from . import culane_metrics
from .culane_metrics import *
__all__ += culane_metrics.__all__
================================================
FILE: ppdet/metrics/coco_utils.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import numpy as np
import itertools
from ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res, get_pose3d_res
from ppdet.metrics.map_utils import draw_pr_curve
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
def get_infer_results(outs, catid, bias=0, save_threshold=0):
"""
Get result at the stage of inference.
The output format is dictionary containing bbox or mask result.
For example, bbox result is a list and each element contains
image_id, category_id, bbox and score.
"""
if outs is None or len(outs) == 0:
raise ValueError(
'The number of valid detection result if zero. Please use reasonable model and check input data.'
)
im_id = outs['im_id']
im_file = outs['im_file'] if 'im_file' in outs else None
infer_res = {}
if 'bbox' in outs:
if len(outs['bbox']) > 0 and len(outs['bbox'][0]) > 6:
infer_res['bbox'] = get_det_poly_res(
outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias)
else:
infer_res['bbox'] = get_det_res(
outs['bbox'],
outs['bbox_num'],
im_id,
catid,
bias=bias,
im_file=im_file,
save_threshold=save_threshold)
if 'mask' in outs:
# mask post process
infer_res['mask'] = get_seg_res(outs['mask'], outs['bbox'],
outs['bbox_num'], im_id, catid)
if 'segm' in outs:
infer_res['segm'] = get_solov2_segm_res(outs, im_id, catid)
if 'keypoint' in outs:
infer_res['keypoint'] = get_keypoint_res(outs, im_id)
outs['bbox_num'] = [len(infer_res['keypoint'])]
if 'pose3d' in outs:
infer_res['pose3d'] = get_pose3d_res(outs, im_id)
outs['bbox_num'] = [len(infer_res['pose3d'])]
return infer_res
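# Editor's sketch (hypothetical values): a minimal call to get_infer_results()
# illustrating the expected input layout for plain bbox outputs.
def _example_get_infer_results():
    outs = {
        'im_id': np.array([[42]]),  # one image in the batch
        'bbox': np.array([[0., 0.9, 10., 10., 50., 60.]]),  # label, score, x1, y1, x2, y2
        'bbox_num': np.array([1]),  # one detection for that image
    }
    # label 0 maps to COCO category id 1; yields one COCO-style bbox record
    return get_infer_results(outs, {0: 1})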
def cocoapi_eval(jsonfile,
style,
coco_gt=None,
anno_file=None,
max_dets=(100, 300, 1000),
classwise=False,
sigmas=None,
use_area=True):
"""
Args:
jsonfile (str): Evaluation json file, eg: bbox.json, mask.json.
style (str): COCOeval style, can be `bbox` , `segm` , `proposal`, `keypoints` and `keypoints_crowd`.
        coco_gt (COCO): A COCO ground-truth object loaded via the COCO API,
            an alternative to anno_file, eg: coco_gt = COCO(anno_file)
anno_file (str): COCO annotations file.
max_dets (tuple): COCO evaluation maxDets.
classwise (bool): Whether per-category AP and draw P-R Curve or not.
sigmas (nparray): keypoint labelling sigmas.
use_area (bool): If gt annotations (eg. CrowdPose, AIC)
do not have 'area', please set use_area=False.
"""
    assert coco_gt is not None or anno_file is not None
if style == 'keypoints_crowd':
#please install xtcocotools==1.6
from xtcocotools.coco import COCO
from xtcocotools.cocoeval import COCOeval
else:
from pycocotools.coco import COCO
        try:
            from .fast_cocoeval import FastCOCOeval as COCOeval
        except ImportError:
            from pycocotools.cocoeval import COCOeval
    if coco_gt is None:
        coco_gt = COCO(anno_file)
    logger.info("Start evaluating...")
coco_dt = coco_gt.loadRes(jsonfile)
if style == 'proposal':
coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
coco_eval.params.useCats = 0
coco_eval.params.maxDets = list(max_dets)
elif style == 'keypoints_crowd':
coco_eval = COCOeval(coco_gt, coco_dt, style, sigmas, use_area)
else:
coco_eval = COCOeval(coco_gt, coco_dt, style)
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
if classwise:
# Compute per-category AP and PR curve
try:
from terminaltables import AsciiTable
except Exception as e:
            logger.error(
                'terminaltables not found, please install terminaltables, '
                'for example: `pip install terminaltables`.')
raise e
precisions = coco_eval.eval['precision']
cat_ids = coco_gt.getCatIds()
# precision: (iou, recall, cls, area range, max dets)
assert len(cat_ids) == precisions.shape[2]
results_per_category = []
for idx, catId in enumerate(cat_ids):
# area range index 0: all area ranges
# max dets index -1: typically 100 per image
nm = coco_gt.loadCats(catId)[0]
precision = precisions[:, :, idx, 0, -1]
precision = precision[precision > -1]
if precision.size:
ap = np.mean(precision)
else:
ap = float('nan')
results_per_category.append(
(str(nm["name"]), '{:0.3f}'.format(float(ap))))
pr_array = precisions[0, :, idx, 0, 2]
recall_array = np.arange(0.0, 1.01, 0.01)
draw_pr_curve(
pr_array,
recall_array,
out_dir=style + '_pr_curve',
file_name='{}_precision_recall_curve.jpg'.format(nm["name"]))
num_columns = min(6, len(results_per_category) * 2)
results_flatten = list(itertools.chain(*results_per_category))
headers = ['category', 'AP'] * (num_columns // 2)
results_2d = itertools.zip_longest(
*[results_flatten[i::num_columns] for i in range(num_columns)])
table_data = [headers]
table_data += [result for result in results_2d]
table = AsciiTable(table_data)
logger.info('Per-category of {} AP: \n{}'.format(style, table.table))
logger.info("per-category PR curve has output to {} folder.".format(
style + '_pr_curve'))
# flush coco evaluation result
sys.stdout.flush()
return coco_eval.stats
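# Editor's sketch (hypothetical paths): evaluate a detection json against a
# COCO annotation file and return the 12 COCO summary statistics.
def _example_cocoapi_eval():
    return cocoapi_eval(
        'bbox.json', 'bbox',
        anno_file='annotations/instances_val2017.json')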
def json_eval_results(metric, json_directory, dataset):
"""
    cocoapi eval with already existing proposal.json, bbox.json or mask.json
"""
assert metric == 'COCO'
anno_file = dataset.get_anno()
json_file_list = ['proposal.json', 'bbox.json', 'mask.json']
if json_directory:
assert os.path.exists(
json_directory), "The json directory:{} does not exist".format(
json_directory)
for k, v in enumerate(json_file_list):
json_file_list[k] = os.path.join(str(json_directory), v)
coco_eval_style = ['proposal', 'bbox', 'segm']
for i, v_json in enumerate(json_file_list):
if os.path.exists(v_json):
cocoapi_eval(v_json, coco_eval_style[i], anno_file=anno_file)
else:
logger.info("{} not exists!".format(v_json))
================================================
FILE: ppdet/metrics/culane_metrics.py
================================================
import os
import cv2
import numpy as np
import os.path as osp
from functools import partial
from .metrics import Metric
from scipy.interpolate import splprep, splev
from scipy.optimize import linear_sum_assignment
from shapely.geometry import LineString, Polygon
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
__all__ = [
'draw_lane', 'discrete_cross_iou', 'continuous_cross_iou', 'interp',
'culane_metric', 'load_culane_img_data', 'load_culane_data',
'eval_predictions', "CULaneMetric"
]
LIST_FILE = {
'train': 'list/train_gt.txt',
'val': 'list/val.txt',
'test': 'list/test.txt',
}
CATEGORYS = {
'normal': 'list/test_split/test0_normal.txt',
'crowd': 'list/test_split/test1_crowd.txt',
'hlight': 'list/test_split/test2_hlight.txt',
'shadow': 'list/test_split/test3_shadow.txt',
'noline': 'list/test_split/test4_noline.txt',
'arrow': 'list/test_split/test5_arrow.txt',
'curve': 'list/test_split/test6_curve.txt',
'cross': 'list/test_split/test7_cross.txt',
'night': 'list/test_split/test8_night.txt',
}
def draw_lane(lane, img=None, img_shape=None, width=30):
if img is None:
img = np.zeros(img_shape, dtype=np.uint8)
lane = lane.astype(np.int32)
for p1, p2 in zip(lane[:-1], lane[1:]):
cv2.line(
img, tuple(p1), tuple(p2), color=(255, 255, 255), thickness=width)
return img
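# Editor's sketch (hypothetical values): rasterize a two-point lane as a
# 30 px wide white segment on a blank CULane-sized canvas and count the
# covered pixels, the same rasterization discrete_cross_iou() uses below.
def _example_draw_lane():
    lane = np.array([[100., 200.], [300., 400.]])
    mask = draw_lane(lane, img_shape=(590, 1640, 3), width=30) > 0
    return mask.sum()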
def discrete_cross_iou(xs, ys, width=30, img_shape=(590, 1640, 3)):
xs = [draw_lane(lane, img_shape=img_shape, width=width) > 0 for lane in xs]
ys = [draw_lane(lane, img_shape=img_shape, width=width) > 0 for lane in ys]
ious = np.zeros((len(xs), len(ys)))
for i, x in enumerate(xs):
for j, y in enumerate(ys):
ious[i, j] = (x & y).sum() / (x | y).sum()
return ious
def continuous_cross_iou(xs, ys, width=30, img_shape=(590, 1640, 3)):
h, w, _ = img_shape
image = Polygon([(0, 0), (0, h - 1), (w - 1, h - 1), (w - 1, 0)])
xs = [
LineString(lane).buffer(
distance=width / 2., cap_style=1, join_style=2).intersection(image)
for lane in xs
]
ys = [
LineString(lane).buffer(
distance=width / 2., cap_style=1, join_style=2).intersection(image)
for lane in ys
]
ious = np.zeros((len(xs), len(ys)))
for i, x in enumerate(xs):
for j, y in enumerate(ys):
ious[i, j] = x.intersection(y).area / x.union(y).area
return ious
def interp(points, n=50):
x = [x for x, _ in points]
y = [y for _, y in points]
tck, u = splprep([x, y], s=0, t=n, k=min(3, len(points) - 1))
u = np.linspace(0., 1., num=(len(u) - 1) * n + 1)
return np.array(splev(u, tck)).T
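# Editor's sketch: densify a 3-point polyline with the B-spline above; with
# n=5 the result has (len(points) - 1) * 5 + 1 = 11 (x, y) samples.
def _example_interp():
    points = [(10., 590.), (20., 400.), (35., 270.)]
    return interp(points, n=5)  # shape (11, 2)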
def culane_metric(pred,
anno,
width=30,
iou_thresholds=[0.5],
official=True,
img_shape=(590, 1640, 3)):
_metric = {}
for thr in iou_thresholds:
tp = 0
fp = 0 if len(anno) != 0 else len(pred)
fn = 0 if len(pred) != 0 else len(anno)
        _metric[thr] = [tp, fp, fn]
    # nothing to match when either side is empty; keep the counts above
    if len(pred) == 0 or len(anno) == 0:
        return _metric
    interp_pred = np.array(
        [interp(
            pred_lane, n=5) for pred_lane in pred], dtype=object)
    interp_anno = np.array(
        [interp(
            anno_lane, n=5) for anno_lane in anno], dtype=object)
if official:
ious = discrete_cross_iou(
interp_pred, interp_anno, width=width, img_shape=img_shape)
else:
ious = continuous_cross_iou(
interp_pred, interp_anno, width=width, img_shape=img_shape)
row_ind, col_ind = linear_sum_assignment(1 - ious)
_metric = {}
for thr in iou_thresholds:
tp = int((ious[row_ind, col_ind] > thr).sum())
fp = len(pred) - tp
fn = len(anno) - tp
_metric[thr] = [tp, fp, fn]
return _metric
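# Editor's sketch: with an empty prediction and one annotated lane the metric
# reports a single false negative at each IoU threshold.
def _example_culane_metric():
    anno = [[(10., 590.), (20., 400.), (35., 270.)]]
    return culane_metric([], anno, iou_thresholds=[0.5])  # {0.5: [0, 0, 1]}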
def load_culane_img_data(path):
with open(path, 'r') as data_file:
img_data = data_file.readlines()
img_data = [line.split() for line in img_data]
img_data = [list(map(float, lane)) for lane in img_data]
img_data = [[(lane[i], lane[i + 1]) for i in range(0, len(lane), 2)]
for lane in img_data]
img_data = [lane for lane in img_data if len(lane) >= 2]
return img_data
def load_culane_data(data_dir, file_list_path):
with open(file_list_path, 'r') as file_list:
filepaths = [
os.path.join(data_dir,
line[1 if line[0] == '/' else 0:].rstrip().replace(
'.jpg', '.lines.txt'))
for line in file_list.readlines()
]
data = []
for path in filepaths:
img_data = load_culane_img_data(path)
data.append(img_data)
return data
def eval_predictions(pred_dir,
anno_dir,
list_path,
iou_thresholds=[0.5],
width=30,
official=True,
sequential=False):
logger.info('Calculating metric for List: {}'.format(list_path))
predictions = load_culane_data(pred_dir, list_path)
annotations = load_culane_data(anno_dir, list_path)
img_shape = (590, 1640, 3)
    if sequential:
        # materialize the map; an iterator would be exhausted after the
        # first per-threshold sum below
        results = list(
            map(partial(
                culane_metric,
                width=width,
                official=official,
                iou_thresholds=iou_thresholds,
                img_shape=img_shape),
                predictions,
                annotations))
else:
from multiprocessing import Pool, cpu_count
from itertools import repeat
with Pool(cpu_count()) as p:
results = p.starmap(culane_metric,
zip(predictions, annotations,
repeat(width),
repeat(iou_thresholds),
repeat(official), repeat(img_shape)))
mean_f1, mean_prec, mean_recall, total_tp, total_fp, total_fn = 0, 0, 0, 0, 0, 0
ret = {}
for thr in iou_thresholds:
tp = sum(m[thr][0] for m in results)
fp = sum(m[thr][1] for m in results)
fn = sum(m[thr][2] for m in results)
precision = float(tp) / (tp + fp) if tp != 0 else 0
recall = float(tp) / (tp + fn) if tp != 0 else 0
f1 = 2 * precision * recall / (precision + recall) if tp != 0 else 0
        logger.info('iou thr: {:.2f}, tp: {}, fp: {}, fn: {}, '
                    'precision: {}, recall: {}, f1: {}'.format(
thr, tp, fp, fn, precision, recall, f1))
mean_f1 += f1 / len(iou_thresholds)
mean_prec += precision / len(iou_thresholds)
mean_recall += recall / len(iou_thresholds)
total_tp += tp
total_fp += fp
total_fn += fn
ret[thr] = {
'TP': tp,
'FP': fp,
'FN': fn,
'Precision': precision,
'Recall': recall,
'F1': f1
}
if len(iou_thresholds) > 2:
logger.info(
            'mean result, total_tp: {}, total_fp: {}, total_fn: {}, '
            'precision: {}, recall: {}, f1: {}'.format(
total_tp, total_fp, total_fn, mean_prec, mean_recall, mean_f1))
ret['mean'] = {
'TP': total_tp,
'FP': total_fp,
'FN': total_fn,
'Precision': mean_prec,
'Recall': mean_recall,
'F1': mean_f1
}
return ret
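# Editor's sketch (hypothetical directories): compare predicted *.lines.txt
# files against CULane ground truth at IoU 0.5, single-process.
def _example_eval_predictions():
    return eval_predictions(
        'evaluation/', 'dataset/CULane/',
        'dataset/CULane/list/test.txt',
        iou_thresholds=[0.5], sequential=True)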
class CULaneMetric(Metric):
def __init__(self,
cfg,
output_eval=None,
split="test",
dataset_dir="dataset/CULane/"):
super(CULaneMetric, self).__init__()
self.output_eval = "evaluation" if output_eval is None else output_eval
self.dataset_dir = dataset_dir
self.split = split
self.list_path = osp.join(dataset_dir, LIST_FILE[split])
self.predictions = []
self.img_names = []
self.lanes = []
self.eval_results = {}
self.cfg = cfg
self.reset()
def reset(self):
self.predictions = []
self.img_names = []
self.lanes = []
self.eval_results = {}
def get_prediction_string(self, pred):
ys = np.arange(270, 590, 8) / self.cfg.ori_img_h
out = []
for lane in pred:
xs = lane(ys)
valid_mask = (xs >= 0) & (xs < 1)
xs = xs * self.cfg.ori_img_w
lane_xs = xs[valid_mask]
lane_ys = ys[valid_mask] * self.cfg.ori_img_h
lane_xs, lane_ys = lane_xs[::-1], lane_ys[::-1]
lane_str = ' '.join([
'{:.5f} {:.5f}'.format(x, y) for x, y in zip(lane_xs, lane_ys)
])
if lane_str != '':
out.append(lane_str)
return '\n'.join(out)
def accumulate(self):
loss_lines = [[], [], [], []]
for idx, pred in enumerate(self.predictions):
output_dir = os.path.join(self.output_eval,
os.path.dirname(self.img_names[idx]))
output_filename = os.path.basename(self.img_names[
idx])[:-3] + 'lines.txt'
os.makedirs(output_dir, exist_ok=True)
output = self.get_prediction_string(pred)
# store loss lines
lanes = self.lanes[idx]
if len(lanes) - len(pred) in [1, 2, 3, 4]:
loss_lines[len(lanes) - len(pred) - 1].append(self.img_names[
idx])
with open(os.path.join(output_dir, output_filename),
'w') as out_file:
out_file.write(output)
for i, names in enumerate(loss_lines):
with open(
os.path.join(output_dir, 'loss_{}_lines.txt'.format(i + 1)),
'w') as f:
for name in names:
f.write(name + '\n')
for cate, cate_file in CATEGORYS.items():
result = eval_predictions(
self.output_eval,
self.dataset_dir,
os.path.join(self.dataset_dir, cate_file),
iou_thresholds=[0.5],
official=True)
result = eval_predictions(
self.output_eval,
self.dataset_dir,
self.list_path,
iou_thresholds=np.linspace(0.5, 0.95, 10),
official=True)
self.eval_results['F1@50'] = result[0.5]['F1']
self.eval_results['result'] = result
def update(self, inputs, outputs):
assert len(inputs['img_name']) == len(outputs['lanes'])
self.predictions.extend(outputs['lanes'])
self.img_names.extend(inputs['img_name'])
self.lanes.extend(inputs['lane_line'])
def log(self):
logger.info(self.eval_results)
# abstract method for getting metric results
def get_results(self):
return self.eval_results
================================================
FILE: ppdet/metrics/fast_cocoeval/README.md
================================================
# Building the COCOeval C++ extension
## Installation
```
cd ext
python setup.py install
```
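After a successful build the `cocoeval_ext` module becomes importable, and `FastCOCOeval` is then picked up automatically by `ppdet.metrics.coco_utils.cocoapi_eval`. A minimal smoke test (editor's sketch):
```
import cocoeval_ext  # raises ImportError if the build failed
from ppdet.metrics.fast_cocoeval import FastCOCOeval
print(FastCOCOeval)
```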
================================================
FILE: ppdet/metrics/fast_cocoeval/__init__.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import fast_cocoeval
from .fast_cocoeval import *
================================================
FILE: ppdet/metrics/fast_cocoeval/ext/cocoeval.cc
================================================
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/tree/main/detectron2/layers/csrc/cocoeval/
#include "cocoeval.h"
#include
#include
#include
#include
using namespace pybind11::literals;
// Sort detections from highest score to lowest, such that
// detection_instances[detection_sorted_indices[t]] >=
// detection_instances[detection_sorted_indices[t+1]]. Use stable_sort to match
// original COCO API
void SortInstancesByDetectionScore(
    const std::vector<InstanceAnnotation>& detection_instances,
    std::vector<uint64_t>* detection_sorted_indices) {
detection_sorted_indices->resize(detection_instances.size());
std::iota(
detection_sorted_indices->begin(), detection_sorted_indices->end(), 0);
std::stable_sort(
detection_sorted_indices->begin(),
detection_sorted_indices->end(),
[&detection_instances](size_t j1, size_t j2) {
return detection_instances[j1].score > detection_instances[j2].score;
});
}
// Partition the ground truth objects based on whether or not to ignore them
// based on area
void SortInstancesByIgnore(
    const std::array<double, 2>& area_range,
    const std::vector<InstanceAnnotation>& ground_truth_instances,
    std::vector<uint64_t>* ground_truth_sorted_indices,
    std::vector<bool>* ignores) {
ignores->clear();
ignores->reserve(ground_truth_instances.size());
for (auto o : ground_truth_instances) {
ignores->push_back(
o.ignore || o.area < area_range[0] || o.area > area_range[1]);
}
ground_truth_sorted_indices->resize(ground_truth_instances.size());
std::iota(
ground_truth_sorted_indices->begin(),
ground_truth_sorted_indices->end(),
0);
std::stable_sort(
ground_truth_sorted_indices->begin(),
ground_truth_sorted_indices->end(),
[&ignores](size_t j1, size_t j2) {
return (int)(*ignores)[j1] < (int)(*ignores)[j2];
});
}
// For each IOU threshold, greedily match each detected instance to a ground
// truth instance (if possible) and store the results
void MatchDetectionsToGroundTruth(
    const std::vector<InstanceAnnotation>& detection_instances,
    const std::vector<uint64_t>& detection_sorted_indices,
    const std::vector<InstanceAnnotation>& ground_truth_instances,
    const std::vector<uint64_t>& ground_truth_sorted_indices,
    const std::vector<bool>& ignores,
    const std::vector<std::vector<double>>& ious,
    const std::vector<double>& iou_thresholds,
    const std::array<double, 2>& area_range,
ImageEvaluation* results) {
// Initialize memory to store return data matches and ignore
const int num_iou_thresholds = iou_thresholds.size();
const int num_ground_truth = ground_truth_sorted_indices.size();
const int num_detections = detection_sorted_indices.size();
  std::vector<uint64_t> ground_truth_matches(
      num_iou_thresholds * num_ground_truth, 0);
  std::vector<uint64_t>& detection_matches = results->detection_matches;
  std::vector<bool>& detection_ignores = results->detection_ignores;
  std::vector<bool>& ground_truth_ignores = results->ground_truth_ignores;
detection_matches.resize(num_iou_thresholds * num_detections, 0);
detection_ignores.resize(num_iou_thresholds * num_detections, false);
ground_truth_ignores.resize(num_ground_truth);
for (auto g = 0; g < num_ground_truth; ++g) {
ground_truth_ignores[g] = ignores[ground_truth_sorted_indices[g]];
}
for (auto t = 0; t < num_iou_thresholds; ++t) {
for (auto d = 0; d < num_detections; ++d) {
// information about best match so far (match=-1 -> unmatched)
double best_iou = std::min(iou_thresholds[t], 1 - 1e-10);
int match = -1;
for (auto g = 0; g < num_ground_truth; ++g) {
// if this ground truth instance is already matched and not a
// crowd, it cannot be matched to another detection
if (ground_truth_matches[t * num_ground_truth + g] > 0 &&
!ground_truth_instances[ground_truth_sorted_indices[g]].is_crowd) {
continue;
}
// if detected instance matched to a regular ground truth
// instance, we can break on the first ground truth instance
// tagged as ignore (because they are sorted by the ignore tag)
if (match >= 0 && !ground_truth_ignores[match] &&
ground_truth_ignores[g]) {
break;
}
// if IOU overlap is the best so far, store the match appropriately
if (ious[d][ground_truth_sorted_indices[g]] >= best_iou) {
best_iou = ious[d][ground_truth_sorted_indices[g]];
match = g;
}
}
// if match was made, store id of match for both detection and
// ground truth
if (match >= 0) {
detection_ignores[t * num_detections + d] = ground_truth_ignores[match];
detection_matches[t * num_detections + d] =
ground_truth_instances[ground_truth_sorted_indices[match]].id;
ground_truth_matches[t * num_ground_truth + match] =
detection_instances[detection_sorted_indices[d]].id;
}
// set unmatched detections outside of area range to ignore
const InstanceAnnotation& detection =
detection_instances[detection_sorted_indices[d]];
detection_ignores[t * num_detections + d] =
detection_ignores[t * num_detections + d] ||
(detection_matches[t * num_detections + d] == 0 &&
(detection.area < area_range[0] || detection.area > area_range[1]));
}
}
// store detection score results
results->detection_scores.resize(detection_sorted_indices.size());
for (size_t d = 0; d < detection_sorted_indices.size(); ++d) {
results->detection_scores[d] =
detection_instances[detection_sorted_indices[d]].score;
}
}
std::vector<ImageEvaluation> EvaluateImages(
    const std::vector<std::array<double, 2>>& area_ranges,
    int max_detections,
    const std::vector<double>& iou_thresholds,
    const ImageCategoryInstances<std::vector<double>>& image_category_ious,
    const ImageCategoryInstances<InstanceAnnotation>&
        image_category_ground_truth_instances,
    const ImageCategoryInstances<InstanceAnnotation>&
        image_category_detection_instances) {
const int num_area_ranges = area_ranges.size();
const int num_images = image_category_ground_truth_instances.size();
const int num_categories =
image_category_ious.size() > 0 ? image_category_ious[0].size() : 0;
  std::vector<uint64_t> detection_sorted_indices;
  std::vector<uint64_t> ground_truth_sorted_indices;
  std::vector<bool> ignores;
  std::vector<ImageEvaluation> results_all(
      num_images * num_area_ranges * num_categories);
// Store results for each image, category, and area range combination. Results
// for each IOU threshold are packed into the same ImageEvaluation object
for (auto i = 0; i < num_images; ++i) {
for (auto c = 0; c < num_categories; ++c) {
      const std::vector<InstanceAnnotation>& ground_truth_instances =
          image_category_ground_truth_instances[i][c];
      const std::vector<InstanceAnnotation>& detection_instances =
          image_category_detection_instances[i][c];
SortInstancesByDetectionScore(
detection_instances, &detection_sorted_indices);
if ((int)detection_sorted_indices.size() > max_detections) {
detection_sorted_indices.resize(max_detections);
}
for (size_t a = 0; a < area_ranges.size(); ++a) {
SortInstancesByIgnore(
area_ranges[a],
ground_truth_instances,
&ground_truth_sorted_indices,
&ignores);
MatchDetectionsToGroundTruth(
detection_instances,
detection_sorted_indices,
ground_truth_instances,
ground_truth_sorted_indices,
ignores,
image_category_ious[i][c],
iou_thresholds,
area_ranges[a],
&results_all
[c * num_area_ranges * num_images + a * num_images + i]);
}
}
}
return results_all;
}
// Convert a python list to a vector
template <typename T>
std::vector<T> list_to_vec(const py::list& l) {
  std::vector<T> v(py::len(l));
  for (int i = 0; i < (int)py::len(l); ++i) {
    v[i] = l[i].cast<T>();
}
return v;
}
// Helper function to Accumulate()
// Considers the evaluation results applicable to a particular category, area
// range, and max_detections parameter setting, which begin at
// evaluations[evaluation_index]. Extracts a sorted list of length n of all
// applicable detection instances concatenated across all images in the dataset,
// which are represented by the outputs evaluation_indices, detection_scores,
// image_detection_indices, and detection_sorted_indices--all of which are
// length n. evaluation_indices[i] stores the applicable index into
// evaluations[] for instance i, which has detection score detection_score[i],
// and is the image_detection_indices[i]'th of the list of detections
// for the image containing i. detection_sorted_indices[] defines a sorted
// permutation of the 3 other outputs
int BuildSortedDetectionList(
    const std::vector<ImageEvaluation>& evaluations,
    const int64_t evaluation_index,
    const int64_t num_images,
    const int max_detections,
    std::vector<uint64_t>* evaluation_indices,
    std::vector<double>* detection_scores,
    std::vector<uint64_t>* detection_sorted_indices,
    std::vector<uint64_t>* image_detection_indices) {
assert(evaluations.size() >= evaluation_index + num_images);
// Extract a list of object instances of the applicable category, area
// range, and max detections requirements such that they can be sorted
image_detection_indices->clear();
evaluation_indices->clear();
detection_scores->clear();
image_detection_indices->reserve(num_images * max_detections);
evaluation_indices->reserve(num_images * max_detections);
detection_scores->reserve(num_images * max_detections);
int num_valid_ground_truth = 0;
for (auto i = 0; i < num_images; ++i) {
const ImageEvaluation& evaluation = evaluations[evaluation_index + i];
for (int d = 0;
d < (int)evaluation.detection_scores.size() && d < max_detections;
++d) { // detected instances
evaluation_indices->push_back(evaluation_index + i);
image_detection_indices->push_back(d);
detection_scores->push_back(evaluation.detection_scores[d]);
}
for (auto ground_truth_ignore : evaluation.ground_truth_ignores) {
if (!ground_truth_ignore) {
++num_valid_ground_truth;
}
}
}
// Sort detections by decreasing score, using stable sort to match
// python implementation
detection_sorted_indices->resize(detection_scores->size());
std::iota(
detection_sorted_indices->begin(), detection_sorted_indices->end(), 0);
std::stable_sort(
detection_sorted_indices->begin(),
detection_sorted_indices->end(),
[&detection_scores](size_t j1, size_t j2) {
return (*detection_scores)[j1] > (*detection_scores)[j2];
});
return num_valid_ground_truth;
}
// Helper function to Accumulate()
// Compute a precision recall curve given a sorted list of detected instances
// encoded in evaluations, evaluation_indices, detection_scores,
// detection_sorted_indices, image_detection_indices (see
// BuildSortedDetectionList()). Using vectors precisions and recalls
// and temporary storage, output the results into precisions_out, recalls_out,
// and scores_out, which are large buffers containing many precision/recall curves
// for all possible parameter settings, with precisions_out_index and
// recalls_out_index defining the applicable indices to store results.
void ComputePrecisionRecallCurve(
    const int64_t precisions_out_index,
    const int64_t precisions_out_stride,
    const int64_t recalls_out_index,
    const std::vector<double>& recall_thresholds,
    const int iou_threshold_index,
    const int num_iou_thresholds,
    const int num_valid_ground_truth,
    const std::vector<ImageEvaluation>& evaluations,
    const std::vector<uint64_t>& evaluation_indices,
    const std::vector<double>& detection_scores,
    const std::vector<uint64_t>& detection_sorted_indices,
    const std::vector<uint64_t>& image_detection_indices,
    std::vector<double>* precisions,
    std::vector<double>* recalls,
    std::vector<double>* precisions_out,
    std::vector<double>* scores_out,
    std::vector<double>* recalls_out) {
assert(recalls_out->size() > recalls_out_index);
// Compute precision/recall for each instance in the sorted list of detections
int64_t true_positives_sum = 0, false_positives_sum = 0;
precisions->clear();
recalls->clear();
precisions->reserve(detection_sorted_indices.size());
recalls->reserve(detection_sorted_indices.size());
assert(!evaluations.empty() || detection_sorted_indices.empty());
for (auto detection_sorted_index : detection_sorted_indices) {
const ImageEvaluation& evaluation =
evaluations[evaluation_indices[detection_sorted_index]];
const auto num_detections =
evaluation.detection_matches.size() / num_iou_thresholds;
const auto detection_index = iou_threshold_index * num_detections +
image_detection_indices[detection_sorted_index];
assert(evaluation.detection_matches.size() > detection_index);
assert(evaluation.detection_ignores.size() > detection_index);
const int64_t detection_match =
evaluation.detection_matches[detection_index];
const bool detection_ignores =
evaluation.detection_ignores[detection_index];
const auto true_positive = detection_match > 0 && !detection_ignores;
const auto false_positive = detection_match == 0 && !detection_ignores;
if (true_positive) {
++true_positives_sum;
}
if (false_positive) {
++false_positives_sum;
}
    const double recall =
        static_cast<double>(true_positives_sum) / num_valid_ground_truth;
recalls->push_back(recall);
const int64_t num_valid_detections =
true_positives_sum + false_positives_sum;
    const double precision = num_valid_detections > 0
        ? static_cast<double>(true_positives_sum) / num_valid_detections
        : 0.0;
precisions->push_back(precision);
}
(*recalls_out)[recalls_out_index] = !recalls->empty() ? recalls->back() : 0;
  for (int64_t i = static_cast<int64_t>(precisions->size()) - 1; i > 0; --i) {
if ((*precisions)[i] > (*precisions)[i - 1]) {
(*precisions)[i - 1] = (*precisions)[i];
}
}
// Sample the per instance precision/recall list at each recall threshold
for (size_t r = 0; r < recall_thresholds.size(); ++r) {
// first index in recalls >= recall_thresholds[r]
    std::vector<double>::iterator low = std::lower_bound(
recalls->begin(), recalls->end(), recall_thresholds[r]);
size_t precisions_index = low - recalls->begin();
const auto results_ind = precisions_out_index + r * precisions_out_stride;
assert(results_ind < precisions_out->size());
assert(results_ind < scores_out->size());
if (precisions_index < precisions->size()) {
(*precisions_out)[results_ind] = (*precisions)[precisions_index];
(*scores_out)[results_ind] =
detection_scores[detection_sorted_indices[precisions_index]];
} else {
(*precisions_out)[results_ind] = 0;
(*scores_out)[results_ind] = 0;
}
}
}
py::dict Accumulate(
    const py::object& params,
    const std::vector<ImageEvaluation>& evaluations) {
  const std::vector<double> recall_thresholds =
      list_to_vec<double>(params.attr("recThrs"));
  const std::vector<int> max_detections =
      list_to_vec<int>(params.attr("maxDets"));
const int num_iou_thresholds = py::len(params.attr("iouThrs"));
const int num_recall_thresholds = py::len(params.attr("recThrs"));
const int num_categories = params.attr("useCats").cast() == 1
? py::len(params.attr("catIds"))
: 1;
const int num_area_ranges = py::len(params.attr("areaRng"));
const int num_max_detections = py::len(params.attr("maxDets"));
const int num_images = py::len(params.attr("imgIds"));
  std::vector<double> precisions_out(
num_iou_thresholds * num_recall_thresholds * num_categories *
num_area_ranges * num_max_detections,
-1);
  std::vector<double> recalls_out(
num_iou_thresholds * num_categories * num_area_ranges *
num_max_detections,
-1);
  std::vector<double> scores_out(
num_iou_thresholds * num_recall_thresholds * num_categories *
num_area_ranges * num_max_detections,
-1);
// Consider the list of all detected instances in the entire dataset in one
// large list. evaluation_indices, detection_scores,
// image_detection_indices, and detection_sorted_indices all have the same
// length as this list, such that each entry corresponds to one detected
// instance
  std::vector<uint64_t> evaluation_indices;  // indices into evaluations[]
  std::vector<double> detection_scores;      // detection scores of each instance
  std::vector<uint64_t> detection_sorted_indices;  // sorted indices of all
                                                   // instances in the dataset
  std::vector<uint64_t>
      image_detection_indices;  // indices into the list of detected instances in
                                // the same image as each instance
  std::vector<double> precisions, recalls;
for (auto c = 0; c < num_categories; ++c) {
for (auto a = 0; a < num_area_ranges; ++a) {
for (auto m = 0; m < num_max_detections; ++m) {
        // The COCO PythonAPI assumes evaluations[] (the return value of
        // COCOeval::EvaluateImages()) is one long list storing results for each
// combination of category, area range, and image id, with categories in
// the outermost loop and images in the innermost loop.
const int64_t evaluations_index =
c * num_area_ranges * num_images + a * num_images;
int num_valid_ground_truth = BuildSortedDetectionList(
evaluations,
evaluations_index,
num_images,
max_detections[m],
&evaluation_indices,
&detection_scores,
&detection_sorted_indices,
&image_detection_indices);
if (num_valid_ground_truth == 0) {
continue;
}
for (auto t = 0; t < num_iou_thresholds; ++t) {
          // recalls_out is a flattened vector representing a
// num_iou_thresholds X num_categories X num_area_ranges X
// num_max_detections matrix
const int64_t recalls_out_index =
t * num_categories * num_area_ranges * num_max_detections +
c * num_area_ranges * num_max_detections +
a * num_max_detections + m;
// precisions_out and scores_out are flattened vectors
// representing a num_iou_thresholds X num_recall_thresholds X
// num_categories X num_area_ranges X num_max_detections matrix
const int64_t precisions_out_stride =
num_categories * num_area_ranges * num_max_detections;
const int64_t precisions_out_index = t * num_recall_thresholds *
num_categories * num_area_ranges * num_max_detections +
c * num_area_ranges * num_max_detections +
a * num_max_detections + m;
ComputePrecisionRecallCurve(
precisions_out_index,
precisions_out_stride,
recalls_out_index,
recall_thresholds,
t,
num_iou_thresholds,
num_valid_ground_truth,
evaluations,
evaluation_indices,
detection_scores,
detection_sorted_indices,
image_detection_indices,
&precisions,
&recalls,
&precisions_out,
&scores_out,
&recalls_out);
}
}
}
}
time_t rawtime;
struct tm local_time;
  std::array<char, 200> buffer;
time(&rawtime);
#ifdef _WIN32
localtime_s(&local_time, &rawtime);
#else
localtime_r(&rawtime, &local_time);
#endif
strftime(
      buffer.data(), 200, "%Y-%m-%d %H:%M:%S", &local_time);
return py::dict(
"params"_a = params,
"counts"_a = std::vector(
{num_iou_thresholds,
num_recall_thresholds,
num_categories,
num_area_ranges,
num_max_detections}),
"date"_a = buffer,
"precision"_a = precisions_out,
"recall"_a = recalls_out,
"scores"_a = scores_out);
}
PYBIND11_MODULE(cocoeval_ext, m) {
m.def("COCOevalAccumulate", &Accumulate, "Accumulate");
m.def("COCOevalEvaluateImages", &EvaluateImages, "EvaluateImages");
py::class_(m, "InstanceAnnotation")
.def(py::init());
py::class_(m, "ImageEvaluation")
.def(py::init<>());
}
================================================
FILE: ppdet/metrics/fast_cocoeval/ext/cocoeval.h
================================================
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/tree/main/detectron2/layers/csrc/cocoeval/
#pragma once
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/stl_bind.h>
#include <vector>
namespace py = pybind11;
// Annotation data for a single object instance in an image
struct InstanceAnnotation {
InstanceAnnotation(
uint64_t id,
double score,
double area,
bool is_crowd,
bool ignore)
: id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {}
uint64_t id;
double score = 0.;
double area = 0.;
bool is_crowd = false;
bool ignore = false;
};
// Stores intermediate results for evaluating detection results for a single
// image that has D detected instances and G ground truth instances. This stores
// matches between detected and ground truth instances
struct ImageEvaluation {
// For each of the D detected instances, the id of the matched ground truth
// instance, or 0 if unmatched
  std::vector<uint64_t> detection_matches;
  // The detection score of each of the D detected instances
  std::vector<double> detection_scores;
  // Marks whether or not each of G instances was ignored from evaluation (e.g.,
  // because it's outside area_range)
  std::vector<bool> ground_truth_ignores;
  // Marks whether or not each of D instances was ignored from evaluation (e.g.,
  // because it's outside aRng)
  std::vector<bool> detection_ignores;
};
template <class T>
using ImageCategoryInstances = std::vector<std::vector<std::vector<T>>>;
// C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg(). For each
// combination of image, category, area range settings, and IOU thresholds to
// evaluate, it matches detected instances to ground truth instances and stores
// the results into a vector of ImageEvaluation results, which will be
// interpreted by the COCOeval::Accumulate() function to produce precision-recall
// curves. The parameters of nested vectors have the following semantics:
// image_category_ious[i][c][d][g] is the intersection over union of the d'th
// detected instance and g'th ground truth instance of
// category category_ids[c] in image image_ids[i]
// image_category_ground_truth_instances[i][c] is a vector of ground truth
// instances in image image_ids[i] of category category_ids[c]
// image_category_detection_instances[i][c] is a vector of detected
// instances in image image_ids[i] of category category_ids[c]
std::vector<ImageEvaluation> EvaluateImages(
    const std::vector<std::array<double, 2>>& area_ranges,  // vector of 2-tuples
    int max_detections,
    const std::vector<double>& iou_thresholds,
    const ImageCategoryInstances<std::vector<double>>& image_category_ious,
    const ImageCategoryInstances<InstanceAnnotation>&
        image_category_ground_truth_instances,
    const ImageCategoryInstances<InstanceAnnotation>&
        image_category_detection_instances);
// C++ implementation of COCOeval.accumulate(), which generates precision
// recall curves for each set of category, IOU threshold, detection area range,
// and max number of detections parameters. It is assumed that the parameter
// evaluations is the return value of the function COCOeval::EvaluateImages(),
// which was called with the same parameter settings params
py::dict Accumulate(
    const py::object& params,
    const std::vector<ImageEvaluation>& evaluations);
================================================
FILE: ppdet/metrics/fast_cocoeval/ext/setup.py
================================================
from pybind11.setup_helpers import Pybind11Extension, build_ext
from setuptools import setup
ext_modules = [Pybind11Extension("cocoeval_ext", ["cocoeval.cc"])]
setup(
name="cocoeval_ext",
version="0.0.0",
ext_modules=ext_modules,
cmdclass={"build_ext": build_ext},
)
================================================
FILE: ppdet/metrics/fast_cocoeval/fast_cocoeval.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The code is based on
# https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/fast_eval_api.py
import copy
import time
import numpy as np
from cocoeval_ext import InstanceAnnotation, COCOevalEvaluateImages, COCOevalAccumulate
from pycocotools.cocoeval import COCOeval
__all__ = ['FastCOCOeval']
class FastCOCOeval(COCOeval):
"""
This is a slightly modified version of the original COCO API, where the functions evaluateImg()
    and accumulate() are implemented in C++ to speed up evaluation
"""
def evaluate(self):
"""
Run per image evaluation on given images and store results in self.evalImgs_cpp, a
datastructure that isn't readable from Python but is used by a c++ implementation of
accumulate(). Unlike the original COCO PythonAPI, we don't populate the datastructure
self.evalImgs because this datastructure is a computational bottleneck.
:return: None
"""
tic = time.time()
print('Running per image evaluation...')
p = self.params
# add backward compatibility if useSegm is specified in params
if p.useSegm is not None:
p.iouType = "segm" if p.useSegm == 1 else "bbox"
print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
print('Evaluate annotation type *{}*'.format(p.iouType))
p.imgIds = list(np.unique(p.imgIds))
if p.useCats:
p.catIds = list(np.unique(p.catIds))
p.maxDets = sorted(p.maxDets)
self.params = p
self._prepare() # bottleneck
# loop through images, area range, max detection number
catIds = p.catIds if p.useCats else [-1]
if p.iouType == "segm" or p.iouType == "bbox":
computeIoU = self.computeIoU
elif p.iouType == "keypoints":
computeIoU = self.computeOks
self.ious = {
(imgId, catId): computeIoU(imgId, catId)
for imgId in p.imgIds for catId in catIds
} # bottleneck
maxDet = p.maxDets[-1]
# <<<< Beginning of code differences with original COCO API
def convert_instances_to_cpp(instances, is_det=False):
# Convert annotations for a list of instances in an image to a format that's fast
# to access in C++
instances_cpp = []
for instance in instances:
instance_cpp = InstanceAnnotation(
int(instance["id"]),
instance["score"] if is_det else instance.get("score", 0.0),
instance["area"],
bool(instance.get("iscrowd", 0)),
bool(instance.get("ignore", 0)),
)
instances_cpp.append(instance_cpp)
return instances_cpp
# Convert GT annotations, detections, and IOUs to a format that's fast to access in C++
ground_truth_instances = [
[convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds]
for imgId in p.imgIds
]
detected_instances = [
[convert_instances_to_cpp(self._dts[imgId, catId], is_det=True) for catId in p.catIds]
for imgId in p.imgIds
]
ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds]
if not p.useCats:
# For each image, flatten per-category lists into a single list
ground_truth_instances = [[[o for c in i for o in c]] for i in ground_truth_instances]
detected_instances = [[[o for c in i for o in c]] for i in detected_instances]
# Call C++ implementation of self.evaluateImgs()
self._evalImgs_cpp = COCOevalEvaluateImages(
p.areaRng, maxDet, p.iouThrs, ious, ground_truth_instances, detected_instances
)
self._evalImgs = None
self._paramsEval = copy.deepcopy(self.params)
toc = time.time()
print('DONE (t={:0.2f}s).'.format(toc-tic))
# >>>> End of code differences with original COCO API
def accumulate(self, p=None):
"""
Accumulate per image evaluation results and store the result in self.eval. Does not
support changing parameter settings from those used by self.evaluate()
"""
print('Accumulating evaluation results...')
tic = time.time()
assert hasattr(
self, "_evalImgs_cpp"
), "evaluate() must be called before accmulate() is called."
self.eval = COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp)
# recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections
self.eval["recall"] = np.array(self.eval["recall"]).reshape(
self.eval["counts"][:1] + self.eval["counts"][2:]
)
# precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X
# num_area_ranges X num_max_detections
self.eval["precision"] = np.array(self.eval["precision"]).reshape(self.eval["counts"])
self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"])
toc = time.time()
        print('DONE (t={:0.2f}s).'.format(toc - tic))
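# Editor's sketch (hypothetical file names): FastCOCOeval is a drop-in
# replacement for pycocotools' COCOeval, so the usual evaluate/accumulate/
# summarize sequence applies unchanged.
def _example_fast_cocoeval():
    from pycocotools.coco import COCO
    coco_gt = COCO('annotations/instances_val2017.json')
    coco_dt = coco_gt.loadRes('bbox.json')
    coco_eval = FastCOCOeval(coco_gt, coco_dt, 'bbox')
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    return coco_eval.stats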
================================================
FILE: ppdet/metrics/json_results.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six
import numpy as np
def get_det_res(bboxes,
bbox_nums,
image_id,
label_to_cat_id_map,
bias=0,
im_file=None,
save_threshold=0):
det_res = []
k = 0
for i in range(len(bbox_nums)):
cur_image_id = int(image_id[i][0])
det_nums = bbox_nums[i]
for j in range(det_nums):
dt = bboxes[k]
k = k + 1
num_id, score, xmin, ymin, xmax, ymax = dt.tolist()
if int(num_id) < 0 or score < save_threshold:
continue
category_id = label_to_cat_id_map[int(num_id)]
w = xmax - xmin + bias
h = ymax - ymin + bias
bbox = [xmin, ymin, w, h]
dt_res = {
'image_id': cur_image_id,
'category_id': category_id,
'bbox': bbox,
'score': score
}
if im_file:
dt_res['im_file'] = im_file
det_res.append(dt_res)
return det_res
def get_det_poly_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0):
det_res = []
k = 0
for i in range(len(bbox_nums)):
cur_image_id = int(image_id[i][0])
det_nums = bbox_nums[i]
for j in range(det_nums):
dt = bboxes[k]
k = k + 1
num_id, score, x1, y1, x2, y2, x3, y3, x4, y4 = dt.tolist()
if int(num_id) < 0:
continue
category_id = label_to_cat_id_map[int(num_id)]
rbox = [x1, y1, x2, y2, x3, y3, x4, y4]
dt_res = {
'image_id': cur_image_id,
'category_id': category_id,
'bbox': rbox,
'score': score
}
det_res.append(dt_res)
return det_res
def strip_mask(mask):
row = mask[0, 0, :]
col = mask[0, :, 0]
im_h = len(col) - np.count_nonzero(col == -1)
im_w = len(row) - np.count_nonzero(row == -1)
return mask[:, :im_h, :im_w]
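# Editor's sketch: batched masks are padded with -1 up to the batch's max
# height/width; strip_mask() recovers the valid im_h x im_w region.
def _example_strip_mask():
    m = -np.ones((1, 4, 4))
    m[:, :2, :3] = 1.
    return strip_mask(m).shape  # (1, 2, 3)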
def get_seg_res(masks, bboxes, mask_nums, image_id, label_to_cat_id_map):
import pycocotools.mask as mask_util
seg_res = []
k = 0
for i in range(len(mask_nums)):
cur_image_id = int(image_id[i][0])
det_nums = mask_nums[i]
mask_i = masks[k:k + det_nums]
mask_i = strip_mask(mask_i)
for j in range(det_nums):
mask = mask_i[j].astype(np.uint8)
score = float(bboxes[k][1])
label = int(bboxes[k][0])
k = k + 1
if label == -1:
continue
cat_id = label_to_cat_id_map[label]
rle = mask_util.encode(
np.array(
mask[:, :, None], order="F", dtype="uint8"))[0]
if six.PY3:
if 'counts' in rle:
rle['counts'] = rle['counts'].decode("utf8")
sg_res = {
'image_id': cur_image_id,
'category_id': cat_id,
'segmentation': rle,
'score': score
}
seg_res.append(sg_res)
return seg_res
def get_solov2_segm_res(results, image_id, num_id_to_cat_id_map):
import pycocotools.mask as mask_util
segm_res = []
    # for each batch
    segms = results['segm']
    # check for missing/empty results before touching the array
    if segms is None or len(segms) == 0:
        return None
    segms = segms.astype(np.uint8)
    clsid_labels = results['cate_label']
    clsid_scores = results['cate_score']
    lengths = segms.shape[0]
    im_id = int(image_id[0][0])
# for each sample
for i in range(lengths - 1):
clsid = int(clsid_labels[i])
catid = num_id_to_cat_id_map[clsid]
score = float(clsid_scores[i])
mask = segms[i]
segm = mask_util.encode(np.array(mask[:, :, np.newaxis], order='F'))[0]
segm['counts'] = segm['counts'].decode('utf8')
coco_res = {
'image_id': im_id,
'category_id': catid,
'segmentation': segm,
'score': score
}
segm_res.append(coco_res)
return segm_res
def get_keypoint_res(results, im_id):
anns = []
preds = results['keypoint']
for idx in range(im_id.shape[0]):
image_id = im_id[idx].item()
kpts, scores = preds[idx]
for kpt, score in zip(kpts, scores):
kpt = kpt.flatten()
ann = {
'image_id': image_id,
'category_id': 1, # XXX hard code
'keypoints': kpt.tolist(),
'score': float(score)
}
x = kpt[0::3]
y = kpt[1::3]
            x0, x1 = np.min(x).item(), np.max(x).item()
            y0, y1 = np.min(y).item(), np.max(y).item()
ann['area'] = (x1 - x0) * (y1 - y0)
ann['bbox'] = [x0, y0, x1 - x0, y1 - y0]
anns.append(ann)
return anns
def get_pose3d_res(results, im_id):
anns = []
preds = results['pose3d']
for idx in range(im_id.shape[0]):
image_id = im_id[idx].item()
pose3d = preds[idx]
ann = {
'image_id': image_id,
'category_id': 1, # XXX hard code
'pose3d': pose3d.tolist(),
'score': float(1.)
}
anns.append(ann)
return anns
================================================
FILE: ppdet/metrics/keypoint_metrics.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
from collections import defaultdict, OrderedDict
import numpy as np
import paddle
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from ..modeling.keypoint_utils import oks_nms, keypoint_pck_accuracy, keypoint_auc, keypoint_epe
from scipy.io import loadmat, savemat
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
__all__ = [
'KeyPointTopDownCOCOEval', 'KeyPointTopDownCOCOWholeBadyHandEval',
'KeyPointTopDownMPIIEval'
]
class KeyPointTopDownCOCOEval(object):
"""refer to
https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
Copyright (c) Microsoft, under the MIT License.
"""
def __init__(self,
anno_file,
num_samples,
num_joints,
output_eval,
iou_type='keypoints',
in_vis_thre=0.2,
oks_thre=0.9,
save_prediction_only=False):
super(KeyPointTopDownCOCOEval, self).__init__()
self.coco = COCO(anno_file)
self.num_samples = num_samples
self.num_joints = num_joints
self.iou_type = iou_type
self.in_vis_thre = in_vis_thre
self.oks_thre = oks_thre
self.output_eval = output_eval
self.res_file = os.path.join(output_eval, "keypoints_results.json")
self.save_prediction_only = save_prediction_only
self.reset()
def reset(self):
self.results = {
'all_preds': np.zeros(
(self.num_samples, self.num_joints, 3), dtype=np.float32),
'all_boxes': np.zeros((self.num_samples, 6)),
'image_path': []
}
self.eval_results = {}
self.idx = 0
def update(self, inputs, outputs):
kpts, _ = outputs['keypoint'][0]
num_images = inputs['image'].shape[0]
self.results['all_preds'][self.idx:self.idx + num_images, :, 0:
3] = kpts[:, :, 0:3]
self.results['all_boxes'][self.idx:self.idx + num_images, 0:2] = inputs[
'center'].numpy()[:, 0:2] if isinstance(
inputs['center'], paddle.Tensor) else inputs['center'][:, 0:2]
self.results['all_boxes'][self.idx:self.idx + num_images, 2:4] = inputs[
'scale'].numpy()[:, 0:2] if isinstance(
inputs['scale'], paddle.Tensor) else inputs['scale'][:, 0:2]
self.results['all_boxes'][self.idx:self.idx + num_images, 4] = np.prod(
inputs['scale'].numpy() * 200,
1) if isinstance(inputs['scale'], paddle.Tensor) else np.prod(
inputs['scale'] * 200, 1)
self.results['all_boxes'][
self.idx:self.idx + num_images,
5] = np.squeeze(inputs['score'].numpy()) if isinstance(
inputs['score'], paddle.Tensor) else np.squeeze(inputs['score'])
if isinstance(inputs['im_id'], paddle.Tensor):
self.results['image_path'].extend(inputs['im_id'].numpy())
else:
self.results['image_path'].extend(inputs['im_id'])
self.idx += num_images
def _write_coco_keypoint_results(self, keypoints):
data_pack = [{
'cat_id': 1,
'cls': 'person',
'ann_type': 'keypoints',
'keypoints': keypoints
}]
results = self._coco_keypoint_results_one_category_kernel(data_pack[0])
if not os.path.exists(self.output_eval):
os.makedirs(self.output_eval)
with open(self.res_file, 'w') as f:
json.dump(results, f, sort_keys=True, indent=4)
logger.info(f'The keypoint result is saved to {self.res_file}.')
try:
json.load(open(self.res_file))
except Exception:
content = []
with open(self.res_file, 'r') as f:
for line in f:
content.append(line)
content[-1] = ']'
with open(self.res_file, 'w') as f:
for c in content:
f.write(c)
def _coco_keypoint_results_one_category_kernel(self, data_pack):
cat_id = data_pack['cat_id']
keypoints = data_pack['keypoints']
cat_results = []
for img_kpts in keypoints:
if len(img_kpts) == 0:
continue
_key_points = np.array(
[img_kpts[k]['keypoints'] for k in range(len(img_kpts))])
_key_points = _key_points.reshape(_key_points.shape[0], -1)
result = [{
'image_id': img_kpts[k]['image'],
'category_id': cat_id,
'keypoints': _key_points[k].tolist(),
'score': img_kpts[k]['score'],
'center': list(img_kpts[k]['center']),
'scale': list(img_kpts[k]['scale'])
} for k in range(len(img_kpts))]
cat_results.extend(result)
return cat_results
def get_final_results(self, preds, all_boxes, img_path):
_kpts = []
for idx, kpt in enumerate(preds):
_kpts.append({
'keypoints': kpt,
'center': all_boxes[idx][0:2],
'scale': all_boxes[idx][2:4],
'area': all_boxes[idx][4],
'score': all_boxes[idx][5],
'image': int(img_path[idx])
})
# image x person x (keypoints)
kpts = defaultdict(list)
for kpt in _kpts:
kpts[kpt['image']].append(kpt)
# rescoring and oks nms
num_joints = preds.shape[1]
in_vis_thre = self.in_vis_thre
oks_thre = self.oks_thre
oks_nmsed_kpts = []
for img in kpts.keys():
img_kpts = kpts[img]
for n_p in img_kpts:
box_score = n_p['score']
kpt_score = 0
valid_num = 0
for n_jt in range(0, num_joints):
t_s = n_p['keypoints'][n_jt][2]
if t_s > in_vis_thre:
kpt_score = kpt_score + t_s
valid_num = valid_num + 1
if valid_num != 0:
kpt_score = kpt_score / valid_num
# rescoring
n_p['score'] = kpt_score * box_score
keep = oks_nms([img_kpts[i] for i in range(len(img_kpts))],
oks_thre)
if len(keep) == 0:
oks_nmsed_kpts.append(img_kpts)
else:
oks_nmsed_kpts.append([img_kpts[_keep] for _keep in keep])
self._write_coco_keypoint_results(oks_nmsed_kpts)
def accumulate(self):
self.get_final_results(self.results['all_preds'],
self.results['all_boxes'],
self.results['image_path'])
if self.save_prediction_only:
            logger.info(f'The keypoint result is saved to {self.res_file}; '
                        'the mAP will not be evaluated.')
return
coco_dt = self.coco.loadRes(self.res_file)
coco_eval = COCOeval(self.coco, coco_dt, 'keypoints')
coco_eval.params.useSegm = None
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
keypoint_stats = []
for ind in range(len(coco_eval.stats)):
keypoint_stats.append((coco_eval.stats[ind]))
self.eval_results['keypoint'] = keypoint_stats
def log(self):
if self.save_prediction_only:
return
stats_names = [
'AP', 'Ap .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5',
'AR .75', 'AR (M)', 'AR (L)'
]
num_values = len(stats_names)
print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |')
print('|---' * (num_values + 1) + '|')
print(' '.join([
'| {:.3f}'.format(value) for value in self.eval_results['keypoint']
]) + ' |')
def get_results(self):
return self.eval_results
class KeyPointTopDownCOCOWholeBadyHandEval(object):
def __init__(self,
anno_file,
num_samples,
num_joints,
output_eval,
save_prediction_only=False):
super(KeyPointTopDownCOCOWholeBadyHandEval, self).__init__()
self.coco = COCO(anno_file)
self.num_samples = num_samples
self.num_joints = num_joints
self.output_eval = output_eval
self.res_file = os.path.join(output_eval, "keypoints_results.json")
self.save_prediction_only = save_prediction_only
self.parse_dataset()
self.reset()
def parse_dataset(self):
gt_db = []
num_joints = self.num_joints
coco = self.coco
img_ids = coco.getImgIds()
for img_id in img_ids:
ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False)
objs = coco.loadAnns(ann_ids)
for obj in objs:
                for hand_type in ['left', 'right']:
                    if (obj[f'{hand_type}hand_valid'] and
                            max(obj[f'{hand_type}hand_kpts']) > 0):
                        joints = np.zeros((num_joints, 3), dtype=np.float32)
                        joints_vis = np.zeros((num_joints, 3), dtype=np.float32)
                        keypoints = np.array(obj[f'{hand_type}hand_kpts'])
                        keypoints = keypoints.reshape(-1, 3)
                        joints[:, :2] = keypoints[:, :2]
                        joints_vis[:, :2] = np.minimum(1, keypoints[:, 2:3])
                        gt_db.append({
                            'bbox': obj[f'{hand_type}hand_box'],
                            'gt_joints': joints,
                            'joints_vis': joints_vis,
                        })
self.db = gt_db
def reset(self):
self.results = {
'preds': np.zeros(
(self.num_samples, self.num_joints, 3), dtype=np.float32),
}
self.eval_results = {}
self.idx = 0
def update(self, inputs, outputs):
kpts, _ = outputs['keypoint'][0]
num_images = inputs['image'].shape[0]
self.results['preds'][self.idx:self.idx + num_images, :, 0:
3] = kpts[:, :, 0:3]
self.idx += num_images
def accumulate(self):
self.get_final_results(self.results['preds'])
if self.save_prediction_only:
            logger.info(f'The keypoint result is saved to {self.res_file} '
                        'and the mAP is not evaluated.')
return
self.eval_results = self.evaluate(self.res_file, ('PCK', 'AUC', 'EPE'))
def get_final_results(self, preds):
kpts = []
for idx, kpt in enumerate(preds):
kpts.append({'keypoints': kpt.tolist()})
self._write_keypoint_results(kpts)
def _write_keypoint_results(self, keypoints):
if not os.path.exists(self.output_eval):
os.makedirs(self.output_eval)
with open(self.res_file, 'w') as f:
json.dump(keypoints, f, sort_keys=True, indent=4)
logger.info(f'The keypoint result is saved to {self.res_file}.')
try:
json.load(open(self.res_file))
except Exception:
content = []
with open(self.res_file, 'r') as f:
for line in f:
content.append(line)
content[-1] = ']'
with open(self.res_file, 'w') as f:
for c in content:
f.write(c)
def log(self):
if self.save_prediction_only:
return
for item, value in self.eval_results.items():
print("{} : {}".format(item, value))
def get_results(self):
return self.eval_results
def evaluate(self, res_file, metrics, pck_thr=0.2, auc_nor=30):
"""Keypoint evaluation.
Args:
res_file (str): Json file stored prediction results.
metrics (str | list[str]): Metric to be performed.
Options: 'PCK', 'AUC', 'EPE'.
pck_thr (float): PCK threshold, default as 0.2.
auc_nor (float): AUC normalization factor, default as 30 pixel.
Returns:
List: Evaluation results for evaluation metric.
"""
info_str = []
with open(res_file, 'r') as fin:
preds = json.load(fin)
assert len(preds) == len(self.db)
outputs = []
gts = []
masks = []
threshold_bbox = []
for pred, item in zip(preds, self.db):
outputs.append(np.array(pred['keypoints'])[:, :-1])
gts.append(np.array(item['gt_joints'])[:, :-1])
masks.append((np.array(item['joints_vis'])[:, 0]) > 0)
if 'PCK' in metrics:
bbox = np.array(item['bbox'])
bbox_thr = np.max(bbox[2:])
threshold_bbox.append(np.array([bbox_thr, bbox_thr]))
outputs = np.array(outputs)
gts = np.array(gts)
masks = np.array(masks)
threshold_bbox = np.array(threshold_bbox)
if 'PCK' in metrics:
_, pck, _ = keypoint_pck_accuracy(outputs, gts, masks, pck_thr,
threshold_bbox)
info_str.append(('PCK', pck))
if 'AUC' in metrics:
info_str.append(('AUC', keypoint_auc(outputs, gts, masks, auc_nor)))
if 'EPE' in metrics:
info_str.append(('EPE', keypoint_epe(outputs, gts, masks)))
name_value = OrderedDict(info_str)
return name_value
class KeyPointTopDownMPIIEval(object):
def __init__(self,
anno_file,
num_samples,
num_joints,
output_eval,
oks_thre=0.9,
save_prediction_only=False):
super(KeyPointTopDownMPIIEval, self).__init__()
self.ann_file = anno_file
self.res_file = os.path.join(output_eval, "keypoints_results.json")
self.save_prediction_only = save_prediction_only
self.reset()
def reset(self):
self.results = []
self.eval_results = {}
self.idx = 0
def update(self, inputs, outputs):
kpts, _ = outputs['keypoint'][0]
num_images = inputs['image'].shape[0]
results = {}
results['preds'] = kpts[:, :, 0:3]
results['boxes'] = np.zeros((num_images, 6))
results['boxes'][:, 0:2] = inputs['center'].numpy()[:, 0:2]
results['boxes'][:, 2:4] = inputs['scale'].numpy()[:, 0:2]
results['boxes'][:, 4] = np.prod(inputs['scale'].numpy() * 200, 1)
results['boxes'][:, 5] = np.squeeze(inputs['score'].numpy())
results['image_path'] = inputs['image_file']
self.results.append(results)
def accumulate(self):
self._mpii_keypoint_results_save()
if self.save_prediction_only:
            logger.info(f'The keypoint result is saved to {self.res_file} '
                        'and the mAP is not evaluated.')
return
self.eval_results = self.evaluate(self.results)
def _mpii_keypoint_results_save(self):
results = []
for res in self.results:
            if len(res['preds']) == 0:
                continue
result = [{
'preds': res['preds'][k].tolist(),
'boxes': res['boxes'][k].tolist(),
'image_path': res['image_path'][k],
            } for k in range(len(res['preds']))]
results.extend(result)
with open(self.res_file, 'w') as f:
json.dump(results, f, sort_keys=True, indent=4)
logger.info(f'The keypoint result is saved to {self.res_file}.')
def log(self):
if self.save_prediction_only:
return
for item, value in self.eval_results.items():
print("{} : {}".format(item, value))
def get_results(self):
return self.eval_results
def evaluate(self, outputs, savepath=None):
"""Evaluate PCKh for MPII dataset. refer to
https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
Copyright (c) Microsoft, under the MIT License.
Args:
            outputs(list(preds, boxes)):
                * preds (np.ndarray[N,K,3]): The first two dimensions are
                    coordinates, score is the third dimension of the array.
                * boxes (np.ndarray[N,6]): [center[0], center[1], scale[0],
                    scale[1], area, score]
Returns:
dict: PCKh for each joint
"""
kpts = []
for output in outputs:
preds = output['preds']
batch_size = preds.shape[0]
for i in range(batch_size):
kpts.append({'keypoints': preds[i]})
preds = np.stack([kpt['keypoints'] for kpt in kpts])
# convert 0-based index to 1-based index,
# and get the first two dimensions.
preds = preds[..., :2] + 1.0
if savepath is not None:
pred_file = os.path.join(savepath, 'pred.mat')
savemat(pred_file, mdict={'preds': preds})
SC_BIAS = 0.6
threshold = 0.5
gt_file = os.path.join(
os.path.dirname(self.ann_file), 'mpii_gt_val.mat')
gt_dict = loadmat(gt_file)
dataset_joints = gt_dict['dataset_joints']
jnt_missing = gt_dict['jnt_missing']
pos_gt_src = gt_dict['pos_gt_src']
headboxes_src = gt_dict['headboxes_src']
pos_pred_src = np.transpose(preds, [1, 2, 0])
head = np.where(dataset_joints == 'head')[1][0]
lsho = np.where(dataset_joints == 'lsho')[1][0]
lelb = np.where(dataset_joints == 'lelb')[1][0]
lwri = np.where(dataset_joints == 'lwri')[1][0]
lhip = np.where(dataset_joints == 'lhip')[1][0]
lkne = np.where(dataset_joints == 'lkne')[1][0]
lank = np.where(dataset_joints == 'lank')[1][0]
rsho = np.where(dataset_joints == 'rsho')[1][0]
relb = np.where(dataset_joints == 'relb')[1][0]
rwri = np.where(dataset_joints == 'rwri')[1][0]
rkne = np.where(dataset_joints == 'rkne')[1][0]
rank = np.where(dataset_joints == 'rank')[1][0]
rhip = np.where(dataset_joints == 'rhip')[1][0]
jnt_visible = 1 - jnt_missing
uv_error = pos_pred_src - pos_gt_src
uv_err = np.linalg.norm(uv_error, axis=1)
headsizes = headboxes_src[1, :, :] - headboxes_src[0, :, :]
headsizes = np.linalg.norm(headsizes, axis=0)
headsizes *= SC_BIAS
scale = headsizes * np.ones((len(uv_err), 1), dtype=np.float32)
scaled_uv_err = uv_err / scale
scaled_uv_err = scaled_uv_err * jnt_visible
jnt_count = np.sum(jnt_visible, axis=1)
less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible
PCKh = 100. * np.sum(less_than_threshold, axis=1) / jnt_count
# save
rng = np.arange(0, 0.5 + 0.01, 0.01)
pckAll = np.zeros((len(rng), 16), dtype=np.float32)
for r, threshold in enumerate(rng):
less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible
pckAll[r, :] = 100. * np.sum(less_than_threshold,
axis=1) / jnt_count
PCKh = np.ma.array(PCKh, mask=False)
PCKh.mask[6:8] = True
jnt_count = np.ma.array(jnt_count, mask=False)
jnt_count.mask[6:8] = True
jnt_ratio = jnt_count / np.sum(jnt_count).astype(np.float64)
name_value = [ #noqa
('Head', PCKh[head]),
('Shoulder', 0.5 * (PCKh[lsho] + PCKh[rsho])),
('Elbow', 0.5 * (PCKh[lelb] + PCKh[relb])),
('Wrist', 0.5 * (PCKh[lwri] + PCKh[rwri])),
('Hip', 0.5 * (PCKh[lhip] + PCKh[rhip])),
('Knee', 0.5 * (PCKh[lkne] + PCKh[rkne])),
('Ankle', 0.5 * (PCKh[lank] + PCKh[rank])),
('PCKh', np.sum(PCKh * jnt_ratio)),
('PCKh@0.1', np.sum(pckAll[11, :] * jnt_ratio))
]
name_value = OrderedDict(name_value)
return name_value
def _sort_and_unique_bboxes(self, kpts, key='bbox_id'):
"""sort kpts and remove the repeated ones."""
kpts = sorted(kpts, key=lambda x: x[key])
num = len(kpts)
for i in range(num - 1, 0, -1):
if kpts[i][key] == kpts[i - 1][key]:
del kpts[i]
return kpts
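# Editorial note -- how the PCKh numbers above are computed (a restatement of
# `evaluate()`, not additional functionality): a predicted joint counts as
# correct when its distance to the ground truth, normalized by the scaled
# head-segment length, falls below the threshold:
#
#     PCKh@t = 100 * |{i : ||p_i - g_i|| / (SC_BIAS * head_size) <= t}| / N
#
# with SC_BIAS = 0.6 and t = 0.5 for the headline number; 'PCKh@0.1' is read
# from the 0.00..0.50 threshold sweep (index 11, kept as in the HRNet
# reference implementation). Joints 6:8 (pelvis/thorax in the standard MPII
# joint order) are masked out of the average.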
================================================
FILE: ppdet/metrics/lvis_utils.py
================================================
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import numpy as np
import itertools
from ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res, get_pose3d_res
from ppdet.metrics.map_utils import draw_pr_curve
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
def lvisapi_eval(jsonfile,
style,
lvis_gt=None,
anno_file=None,
max_dets=(100, 300, 1000),
classwise=False,
sigmas=None,
use_area=True):
"""
Args:
jsonfile (str): Evaluation json file, eg: bbox.json, mask.json.
        style (str): Evaluation style, can be `bbox`, `segm`, `proposal`,
            `keypoints` and `keypoints_crowd`.
        lvis_gt (LVIS): A loaded LVIS ground-truth object. If None, it is
            built from anno_file, eg: lvis_gt = LVIS(anno_file).
        anno_file (str): LVIS annotations file.
        max_dets (tuple): LVIS evaluation maxDets.
        classwise (bool): Whether to compute per-category AP and draw the
            P-R curve or not.
sigmas (nparray): keypoint labelling sigmas.
use_area (bool): If gt annotations (eg. CrowdPose, AIC)
do not have 'area', please set use_area=False.
"""
    assert lvis_gt is not None or anno_file is not None
from lvis import LVIS, LVISEval, LVISResults
    if lvis_gt is None:
        lvis_gt = LVIS(anno_file)
logger.info("Start evaluate...")
lvis_dt = LVISResults(lvis_gt, jsonfile)
lvis_eval = LVISEval(lvis_gt, lvis_dt, style)
lvis_eval.evaluate()
lvis_eval.accumulate()
lvis_eval.summarize()
if classwise:
# Compute per-category AP and PR curve
try:
from terminaltables import AsciiTable
except Exception as e:
            logger.error(
                'terminaltables not found, please install terminaltables, '
                'for example: `pip install terminaltables`.')
raise e
        precisions = lvis_eval.eval['precision']
        cat_ids = lvis_gt.get_cat_ids()
# precision: (iou, recall, cls, area range, max dets)
assert len(cat_ids) == precisions.shape[2]
results_per_category = []
for idx, catId in enumerate(cat_ids):
            # area range index 0: all area ranges
            # max dets index -1: the largest maxDets setting
            nm = lvis_gt.load_cats([catId])[0]
precision = precisions[:, :, idx, 0, -1]
precision = precision[precision > -1]
if precision.size:
ap = np.mean(precision)
else:
ap = float('nan')
results_per_category.append(
(str(nm["name"]), '{:0.3f}'.format(float(ap))))
pr_array = precisions[0, :, idx, 0, 2]
recall_array = np.arange(0.0, 1.01, 0.01)
draw_pr_curve(
pr_array,
recall_array,
out_dir=style + '_pr_curve',
file_name='{}_precision_recall_curve.jpg'.format(nm["name"]))
num_columns = min(6, len(results_per_category) * 2)
results_flatten = list(itertools.chain(*results_per_category))
headers = ['category', 'AP'] * (num_columns // 2)
results_2d = itertools.zip_longest(
* [results_flatten[i::num_columns] for i in range(num_columns)])
table_data = [headers]
table_data += [result for result in results_2d]
table = AsciiTable(table_data)
logger.info('Per-category of {} AP: \n{}'.format(style, table.table))
logger.info("per-category PR curve has output to {} folder.".format(
style + '_pr_curve'))
    # flush lvis evaluation result
sys.stdout.flush()
return lvis_eval.get_results()
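# A minimal usage sketch (illustrative only; the paths below are hypothetical
# and the json file must follow the COCO/LVIS result format produced by
# `get_infer_results`):
#
#   >>> stats = lvisapi_eval('output/bbox.json', 'bbox',
#   ...                      anno_file='dataset/lvis/lvis_v1_val.json',
#   ...                      classwise=False)
#
# `stats` is the dict returned by `LVISEval.get_results()`, keyed by metric
# names such as 'AP' and 'AP50'.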
================================================
FILE: ppdet/metrics/map_utils.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
import sys
import numpy as np
import itertools
import paddle
from ppdet.modeling.rbox_utils import poly2rbox_np
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
__all__ = [
'draw_pr_curve',
'bbox_area',
'jaccard_overlap',
'prune_zero_padding',
'DetectionMAP',
'ap_per_class',
'compute_ap',
]
def draw_pr_curve(precision,
recall,
iou=0.5,
out_dir='pr_curve',
file_name='precision_recall_curve.jpg'):
if not os.path.exists(out_dir):
os.makedirs(out_dir)
output_path = os.path.join(out_dir, file_name)
try:
import matplotlib.pyplot as plt
except Exception as e:
        logger.error('Matplotlib not found, please install matplotlib, '
                     'for example: `pip install matplotlib`.')
raise e
plt.cla()
plt.figure('P-R Curve')
plt.title('Precision/Recall Curve(IoU={})'.format(iou))
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.grid(True)
plt.plot(recall, precision)
plt.savefig(output_path)
def bbox_area(bbox, is_bbox_normalized):
"""
Calculate area of a bounding box
"""
norm = 1. - float(is_bbox_normalized)
width = bbox[2] - bbox[0] + norm
height = bbox[3] - bbox[1] + norm
return width * height
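# With `is_bbox_normalized=False` the area uses the inclusive pixel
# convention (+1 on each side), e.g. bbox_area([0, 0, 10, 10], False)
# returns 11 * 11 = 121; with normalized boxes the +1 is dropped.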
def jaccard_overlap(pred, gt, is_bbox_normalized=False):
"""
Calculate jaccard overlap ratio between two bounding box
"""
if pred[0] >= gt[2] or pred[2] <= gt[0] or \
pred[1] >= gt[3] or pred[3] <= gt[1]:
return 0.
inter_xmin = max(pred[0], gt[0])
inter_ymin = max(pred[1], gt[1])
inter_xmax = min(pred[2], gt[2])
inter_ymax = min(pred[3], gt[3])
inter_size = bbox_area([inter_xmin, inter_ymin, inter_xmax, inter_ymax],
is_bbox_normalized)
pred_size = bbox_area(pred, is_bbox_normalized)
gt_size = bbox_area(gt, is_bbox_normalized)
overlap = float(inter_size) / (pred_size + gt_size - inter_size)
return overlap
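# Worked example (toy numbers): two 11x11-pixel boxes that overlap in a
# 6x6-pixel corner region.
#
#   >>> jaccard_overlap([0, 0, 10, 10], [5, 5, 15, 15])
#   ... # inter = 6 * 6 = 36, union = 121 + 121 - 36 = 206 -> ~0.1748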
def calc_rbox_iou(pred, gt_poly):
"""
calc iou between rotated bbox
"""
# calc iou of bounding box for speedup
pred = np.array(pred, np.float32).reshape(-1, 2)
gt_poly = np.array(gt_poly, np.float32).reshape(-1, 2)
pred_rect = [
np.min(pred[:, 0]), np.min(pred[:, 1]), np.max(pred[:, 0]),
np.max(pred[:, 1])
]
gt_rect = [
np.min(gt_poly[:, 0]), np.min(gt_poly[:, 1]), np.max(gt_poly[:, 0]),
np.max(gt_poly[:, 1])
]
iou = jaccard_overlap(pred_rect, gt_rect, False)
if iou <= 0:
return iou
# calc rbox iou
pred_rbox = poly2rbox_np(pred.reshape(-1, 8)).reshape(-1, 5)
gt_rbox = poly2rbox_np(gt_poly.reshape(-1, 8)).reshape(-1, 5)
try:
from ext_op import rbox_iou
except Exception as e:
print("import custom_ops error, try install ext_op " \
"following ppdet/ext_op/README.md", e)
sys.stdout.flush()
sys.exit(-1)
pd_gt_rbox = paddle.to_tensor(gt_rbox, dtype='float32')
pd_pred_rbox = paddle.to_tensor(pred_rbox, dtype='float32')
iou = rbox_iou(pd_gt_rbox, pd_pred_rbox)
iou = iou.numpy()
return iou[0][0]
def prune_zero_padding(gt_box, gt_label, difficult=None):
valid_cnt = 0
for i in range(len(gt_box)):
if (gt_box[i] == 0).all():
break
valid_cnt += 1
return (gt_box[:valid_cnt], gt_label[:valid_cnt], difficult[:valid_cnt]
if difficult is not None else None)
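# Ground-truth arrays are padded with all-zero rows so batches can be
# stacked; this helper cuts the batch at the first fully-zero box, e.g. a
# (50, 4) gt_box array whose first 3 rows are real boxes comes back with
# shape (3, 4).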
class DetectionMAP(object):
"""
Calculate detection mean average precision.
Currently support two types: 11point and integral
Args:
class_num (int): The class number.
overlap_thresh (float): The threshold of overlap
ratio between prediction bounding box and
ground truth bounding box for deciding
true/false positive. Default 0.5.
map_type (str): Calculation method of mean average
precision, currently support '11point' and
'integral'. Default '11point'.
is_bbox_normalized (bool): Whether bounding boxes
is normalized to range[0, 1]. Default False.
evaluate_difficult (bool): Whether to evaluate
difficult bounding boxes. Default False.
catid2name (dict): Mapping between category id and category name.
classwise (bool): Whether per-category AP and draw
P-R Curve or not.
"""
def __init__(self,
class_num,
overlap_thresh=0.5,
map_type='11point',
is_bbox_normalized=False,
evaluate_difficult=False,
catid2name=None,
classwise=False):
self.class_num = class_num
self.overlap_thresh = overlap_thresh
assert map_type in ['11point', 'integral'], \
"map_type currently only support '11point' "\
"and 'integral'"
self.map_type = map_type
self.is_bbox_normalized = is_bbox_normalized
self.evaluate_difficult = evaluate_difficult
self.classwise = classwise
self.classes = []
for cname in catid2name.values():
self.classes.append(cname)
self.reset()
def update(self, bbox, score, label, gt_box, gt_label, difficult=None):
"""
Update metric statics from given prediction and ground
truth infomations.
"""
if difficult is None:
difficult = np.zeros_like(gt_label)
# record class gt count
for gtl, diff in zip(gt_label, difficult):
if self.evaluate_difficult or int(diff) == 0:
self.class_gt_counts[int(np.array(gtl))] += 1
# record class score positive
visited = [False] * len(gt_label)
for b, s, l in zip(bbox, score, label):
pred = b.tolist() if isinstance(b, np.ndarray) else b
max_idx = -1
max_overlap = -1.0
for i, gl in enumerate(gt_label):
if int(gl) == int(l):
if len(gt_box[i]) == 8:
overlap = calc_rbox_iou(pred, gt_box[i])
else:
overlap = jaccard_overlap(pred, gt_box[i],
self.is_bbox_normalized)
if overlap > max_overlap:
max_overlap = overlap
max_idx = i
if max_overlap > self.overlap_thresh:
if self.evaluate_difficult or \
int(np.array(difficult[max_idx])) == 0:
if not visited[max_idx]:
self.class_score_poss[int(l)].append([s, 1.0])
visited[max_idx] = True
else:
self.class_score_poss[int(l)].append([s, 0.0])
else:
self.class_score_poss[int(l)].append([s, 0.0])
def reset(self):
"""
Reset metric statics
"""
self.class_score_poss = [[] for _ in range(self.class_num)]
self.class_gt_counts = [0] * self.class_num
self.mAP = 0.0
def accumulate(self):
"""
Accumulate metric results and calculate mAP
"""
mAP = 0.
valid_cnt = 0
eval_results = []
        # index classes by their true id so classes skipped for having no
        # ground truth do not shift the per-class report
        for cid, (score_pos, count) in enumerate(
                zip(self.class_score_poss, self.class_gt_counts)):
            if count == 0:
                continue
if len(score_pos) == 0:
valid_cnt += 1
continue
accum_tp_list, accum_fp_list = \
self._get_tp_fp_accum(score_pos)
precision = []
recall = []
for ac_tp, ac_fp in zip(accum_tp_list, accum_fp_list):
precision.append(float(ac_tp) / (ac_tp + ac_fp))
recall.append(float(ac_tp) / count)
one_class_ap = 0.0
if self.map_type == '11point':
max_precisions = [0.] * 11
start_idx = len(precision) - 1
for j in range(10, -1, -1):
for i in range(start_idx, -1, -1):
if recall[i] < float(j) / 10.:
start_idx = i
if j > 0:
max_precisions[j - 1] = max_precisions[j]
break
else:
if max_precisions[j] < precision[i]:
max_precisions[j] = precision[i]
one_class_ap = sum(max_precisions) / 11.
mAP += one_class_ap
valid_cnt += 1
elif self.map_type == 'integral':
import math
prev_recall = 0.
for i in range(len(precision)):
recall_gap = math.fabs(recall[i] - prev_recall)
if recall_gap > 1e-6:
one_class_ap += precision[i] * recall_gap
prev_recall = recall[i]
mAP += one_class_ap
valid_cnt += 1
else:
logger.error("Unspported mAP type {}".format(self.map_type))
sys.exit(1)
eval_results.append({
                'class': self.classes[cid],
'ap': one_class_ap,
'precision': precision,
'recall': recall,
})
self.eval_results = eval_results
self.mAP = mAP / float(valid_cnt) if valid_cnt > 0 else mAP
def get_map(self):
"""
Get mAP result
"""
if self.mAP is None:
logger.error("mAP is not calculated.")
if self.classwise:
# Compute per-category AP and PR curve
try:
from terminaltables import AsciiTable
except Exception as e:
                logger.error(
                    'terminaltables not found, please install terminaltables, '
                    'for example: `pip install terminaltables`.')
raise e
results_per_category = []
for eval_result in self.eval_results:
results_per_category.append(
(str(eval_result['class']),
'{:0.3f}'.format(float(eval_result['ap']))))
draw_pr_curve(
eval_result['precision'],
eval_result['recall'],
out_dir='voc_pr_curve',
file_name='{}_precision_recall_curve.jpg'.format(
eval_result['class']))
num_columns = min(6, len(results_per_category) * 2)
results_flatten = list(itertools.chain(*results_per_category))
headers = ['category', 'AP'] * (num_columns // 2)
results_2d = itertools.zip_longest(* [
results_flatten[i::num_columns] for i in range(num_columns)
])
table_data = [headers]
table_data += [result for result in results_2d]
table = AsciiTable(table_data)
logger.info('Per-category of VOC AP: \n{}'.format(table.table))
            logger.info(
                "The per-category PR curves have been saved to the voc_pr_curve folder.")
return self.mAP
def _get_tp_fp_accum(self, score_pos_list):
"""
Calculate accumulating true/false positive results from
[score, pos] records
"""
sorted_list = sorted(score_pos_list, key=lambda s: s[0], reverse=True)
accum_tp = 0
accum_fp = 0
accum_tp_list = []
accum_fp_list = []
for (score, pos) in sorted_list:
accum_tp += int(pos)
accum_tp_list.append(accum_tp)
accum_fp += 1 - int(pos)
accum_fp_list.append(accum_fp)
return accum_tp_list, accum_fp_list
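# A minimal usage sketch for DetectionMAP (illustrative toy data; one perfect
# prediction for class 0 out of two classes yields mAP = 1.0, since classes
# with no ground truth are skipped):
#
#   >>> m = DetectionMAP(class_num=2, catid2name={0: 'cat', 1: 'dog'})
#   >>> m.update(bbox=np.array([[0., 0., 10., 10.]]),
#   ...          score=np.array([0.9]), label=np.array([0]),
#   ...          gt_box=np.array([[0., 0., 10., 10.]]),
#   ...          gt_label=np.array([0]))
#   >>> m.accumulate()
#   >>> m.get_map()
#   1.0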
def ap_per_class(tp, conf, pred_cls, target_cls):
"""
Computes the average precision, given the recall and precision curves.
Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics.
Args:
tp (list): True positives.
conf (list): Objectness value from 0-1.
pred_cls (list): Predicted object classes.
target_cls (list): Target object classes.
"""
tp, conf, pred_cls, target_cls = np.array(tp), np.array(conf), np.array(
pred_cls), np.array(target_cls)
# Sort by objectness
i = np.argsort(-conf)
tp, conf, pred_cls = tp[i], conf[i], pred_cls[i]
# Find unique classes
unique_classes = np.unique(np.concatenate((pred_cls, target_cls), 0))
# Create Precision-Recall curve and compute AP for each class
ap, p, r = [], [], []
for c in unique_classes:
i = pred_cls == c
n_gt = sum(target_cls == c) # Number of ground truth objects
n_p = sum(i) # Number of predicted objects
if (n_p == 0) and (n_gt == 0):
continue
elif (n_p == 0) or (n_gt == 0):
ap.append(0)
r.append(0)
p.append(0)
else:
# Accumulate FPs and TPs
fpc = np.cumsum(1 - tp[i])
tpc = np.cumsum(tp[i])
# Recall
recall_curve = tpc / (n_gt + 1e-16)
r.append(tpc[-1] / (n_gt + 1e-16))
# Precision
precision_curve = tpc / (tpc + fpc)
p.append(tpc[-1] / (tpc[-1] + fpc[-1]))
# AP from recall-precision curve
ap.append(compute_ap(recall_curve, precision_curve))
return np.array(ap), unique_classes.astype('int32'), np.array(r), np.array(
p)
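# Worked example (toy numbers): three predictions for a single class against
# three ground-truth boxes, of which the 1st and 3rd by confidence are true
# positives.
#
#   >>> ap, cls, r, p = ap_per_class(tp=[1, 0, 1], conf=[0.9, 0.8, 0.7],
#   ...                              pred_cls=[0, 0, 0], target_cls=[0, 0, 0])
#   ... # recall curve [1/3, 1/3, 2/3], precision curve [1.0, 0.5, 2/3]
#   ... # -> ap ~= 0.556, final recall 2/3, final precision 2/3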
def compute_ap(recall, precision):
"""
Computes the average precision, given the recall and precision curves.
Code originally from https://github.com/rbgirshick/py-faster-rcnn.
Args:
recall (list): The recall curve.
precision (list): The precision curve.
Returns:
The average precision as computed in py-faster-rcnn.
"""
# correct AP calculation
# first append sentinel values at the end
mrec = np.concatenate(([0.], recall, [1.]))
mpre = np.concatenate(([0.], precision, [0.]))
# compute the precision envelope
for i in range(mpre.size - 1, 0, -1):
mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
# to calculate area under PR curve, look for points
# where X axis (recall) changes value
i = np.where(mrec[1:] != mrec[:-1])[0]
# and sum (\Delta recall) * prec
ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
return ap
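# Worked example: a detector that recovers 2 of 3 ground-truth boxes with
# precision dropping from 1.0 to 0.5 along the curve.
#
#   >>> round(compute_ap([1 / 3., 2 / 3.], [1.0, 0.5]), 4)
#   0.5
#   ... # area = (1/3) * 1.0 + (1/3) * 0.5 + (1/3) * 0.0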
================================================
FILE: ppdet/metrics/mcmot_metrics.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import copy
import sys
import math
from collections import defaultdict
import numpy as np
import pandas as pd
from .metrics import Metric
try:
import motmetrics as mm
from motmetrics.math_util import quiet_divide
metrics = mm.metrics.motchallenge_metrics
mh = mm.metrics.create()
except Exception:
print(
'Warning: Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'
)
pass
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
__all__ = ['MCMOTEvaluator', 'MCMOTMetric']
METRICS_LIST = [
'num_frames', 'num_matches', 'num_switches', 'num_transfer', 'num_ascend',
'num_migrate', 'num_false_positives', 'num_misses', 'num_detections',
'num_objects', 'num_predictions', 'num_unique_objects', 'mostly_tracked',
'partially_tracked', 'mostly_lost', 'num_fragmentations', 'motp', 'mota',
'precision', 'recall', 'idfp', 'idfn', 'idtp', 'idp', 'idr', 'idf1'
]
NAME_MAP = {
'num_frames': 'num_frames',
'num_matches': 'num_matches',
'num_switches': 'IDs',
'num_transfer': 'IDt',
'num_ascend': 'IDa',
'num_migrate': 'IDm',
'num_false_positives': 'FP',
'num_misses': 'FN',
'num_detections': 'num_detections',
'num_objects': 'num_objects',
'num_predictions': 'num_predictions',
'num_unique_objects': 'GT',
'mostly_tracked': 'MT',
'partially_tracked': 'partially_tracked',
'mostly_lost': 'ML',
'num_fragmentations': 'FM',
'motp': 'MOTP',
'mota': 'MOTA',
'precision': 'Prcn',
'recall': 'Rcll',
'idfp': 'idfp',
'idfn': 'idfn',
'idtp': 'idtp',
'idp': 'IDP',
'idr': 'IDR',
'idf1': 'IDF1'
}
def parse_accs_metrics(seq_acc, index_name, verbose=False):
"""
Parse the evaluation indicators of multiple MOTAccumulator
"""
mh = mm.metrics.create()
summary = MCMOTEvaluator.get_summary(seq_acc, index_name, METRICS_LIST)
summary.loc['OVERALL', 'motp'] = (summary['motp'] * summary['num_detections']).sum() / \
summary.loc['OVERALL', 'num_detections']
if verbose:
strsummary = mm.io.render_summary(
summary, formatters=mh.formatters, namemap=NAME_MAP)
print(strsummary)
return summary
def seqs_overall_metrics(summary_df, verbose=False):
"""
Calculate overall metrics for multiple sequences
"""
add_col = [
'num_frames', 'num_matches', 'num_switches', 'num_transfer',
'num_ascend', 'num_migrate', 'num_false_positives', 'num_misses',
'num_detections', 'num_objects', 'num_predictions',
'num_unique_objects', 'mostly_tracked', 'partially_tracked',
'mostly_lost', 'num_fragmentations', 'idfp', 'idfn', 'idtp'
]
calc_col = ['motp', 'mota', 'precision', 'recall', 'idp', 'idr', 'idf1']
calc_df = summary_df.copy()
overall_dic = {}
for col in add_col:
overall_dic[col] = calc_df[col].sum()
for col in calc_col:
overall_dic[col] = getattr(MCMOTMetricOverall, col + '_overall')(
calc_df, overall_dic)
overall_df = pd.DataFrame(overall_dic, index=['overall_calc'])
calc_df = pd.concat([calc_df, overall_df])
if verbose:
mh = mm.metrics.create()
str_calc_df = mm.io.render_summary(
calc_df, formatters=mh.formatters, namemap=NAME_MAP)
print(str_calc_df)
return calc_df
class MCMOTMetricOverall(object):
def motp_overall(summary_df, overall_dic):
motp = quiet_divide((summary_df['motp'] *
summary_df['num_detections']).sum(),
overall_dic['num_detections'])
return motp
def mota_overall(summary_df, overall_dic):
del summary_df
mota = 1. - quiet_divide(
(overall_dic['num_misses'] + overall_dic['num_switches'] +
overall_dic['num_false_positives']), overall_dic['num_objects'])
return mota
def precision_overall(summary_df, overall_dic):
del summary_df
precision = quiet_divide(overall_dic['num_detections'], (
overall_dic['num_false_positives'] + overall_dic['num_detections']))
return precision
def recall_overall(summary_df, overall_dic):
del summary_df
recall = quiet_divide(overall_dic['num_detections'],
overall_dic['num_objects'])
return recall
def idp_overall(summary_df, overall_dic):
del summary_df
idp = quiet_divide(overall_dic['idtp'],
(overall_dic['idtp'] + overall_dic['idfp']))
return idp
def idr_overall(summary_df, overall_dic):
del summary_df
idr = quiet_divide(overall_dic['idtp'],
(overall_dic['idtp'] + overall_dic['idfn']))
return idr
def idf1_overall(summary_df, overall_dic):
del summary_df
idf1 = quiet_divide(2. * overall_dic['idtp'], (
overall_dic['num_objects'] + overall_dic['num_predictions']))
return idf1
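# The overall identity metrics above follow the standard MOT definitions
# (a restatement of the code, not new behavior):
#
#     IDP  = IDTP / (IDTP + IDFP)
#     IDR  = IDTP / (IDTP + IDFN)
#     IDF1 = 2 * IDTP / (num_objects + num_predictions)
#     MOTA = 1 - (FN + IDs + FP) / num_objects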
def read_mcmot_results_union(filename, is_gt, is_ignore):
results_dict = dict()
if os.path.isfile(filename):
all_result = np.loadtxt(filename, delimiter=',')
if all_result.shape[0] == 0 or all_result.shape[1] < 7:
return results_dict
if is_ignore:
return results_dict
if is_gt:
# only for test use
all_result = all_result[all_result[:, 7] != 0]
all_result[:, 7] = all_result[:, 7] - 1
if all_result.shape[0] == 0:
return results_dict
class_unique = np.unique(all_result[:, 7])
last_max_id = 0
result_cls_list = []
for cls in class_unique:
result_cls_split = all_result[all_result[:, 7] == cls]
            result_cls_split[:, 1] = result_cls_split[:, 1] + last_max_id
            # make sure track ids are distinct across categories
            last_max_id = max(np.unique(result_cls_split[:, 1])) + 1
result_cls_list.append(result_cls_split)
results_con = np.concatenate(result_cls_list)
for line in range(len(results_con)):
linelist = results_con[line]
fid = int(linelist[0])
if fid < 1:
continue
results_dict.setdefault(fid, list())
if is_gt:
score = 1
else:
score = float(linelist[6])
tlwh = tuple(map(float, linelist[2:6]))
target_id = int(linelist[1])
cls = int(linelist[7])
results_dict[fid].append((tlwh, target_id, cls, score))
return results_dict
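# Both readers (here and below) assume the MOT-challenge style text format,
# one object per comma-separated line, columns past the 8th ignored here:
#
#     frame_id, track_id, x, y, w, h, score, class_id, ...
#
# `read_mcmot_results_union` additionally offsets track ids per class so the
# ids stay unique when all categories are evaluated as a single set.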
def read_mcmot_results(filename, is_gt, is_ignore):
results_dict = dict()
if os.path.isfile(filename):
with open(filename, 'r') as f:
for line in f.readlines():
linelist = line.strip().split(',')
if len(linelist) < 7:
continue
fid = int(linelist[0])
if fid < 1:
continue
cid = int(linelist[7])
if is_gt:
score = 1
# only for test use
cid -= 1
else:
score = float(linelist[6])
cls_result_dict = results_dict.setdefault(cid, dict())
cls_result_dict.setdefault(fid, list())
tlwh = tuple(map(float, linelist[2:6]))
target_id = int(linelist[1])
cls_result_dict[fid].append((tlwh, target_id, score))
return results_dict
def read_results(filename,
data_type,
is_gt=False,
is_ignore=False,
multi_class=False,
union=False):
if data_type in ['mcmot', 'lab']:
if multi_class:
if union:
# The results are evaluated by union all the categories.
# Track IDs between different categories cannot be duplicate.
read_fun = read_mcmot_results_union
else:
# The results are evaluated separately by category.
read_fun = read_mcmot_results
else:
raise ValueError('multi_class: {}, MCMOT should have cls_id.'.
format(multi_class))
else:
raise ValueError('Unknown data type: {}'.format(data_type))
return read_fun(filename, is_gt, is_ignore)
def unzip_objs(objs):
if len(objs) > 0:
tlwhs, ids, scores = zip(*objs)
else:
tlwhs, ids, scores = [], [], []
tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4)
return tlwhs, ids, scores
def unzip_objs_cls(objs):
if len(objs) > 0:
tlwhs, ids, cls, scores = zip(*objs)
else:
tlwhs, ids, cls, scores = [], [], [], []
tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4)
ids = np.array(ids)
cls = np.array(cls)
scores = np.array(scores)
return tlwhs, ids, cls, scores
class MCMOTEvaluator(object):
def __init__(self, data_root, seq_name, data_type, num_classes):
self.data_root = data_root
self.seq_name = seq_name
self.data_type = data_type
self.num_classes = num_classes
self.load_annotations()
try:
import motmetrics as mm
mm.lap.default_solver = 'lap'
except Exception as e:
raise RuntimeError(
'Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'
)
self.reset_accumulator()
self.class_accs = []
def load_annotations(self):
assert self.data_type == 'mcmot'
self.gt_filename = os.path.join(self.data_root, '../', 'sequences',
'{}.txt'.format(self.seq_name))
if not os.path.exists(self.gt_filename):
            logger.warning(
                "gt_filename '{}' of MCMOTEvaluator does not exist, so the MOTA will be -INF."
                .format(self.gt_filename))
def reset_accumulator(self):
self.acc = mm.MOTAccumulator(auto_id=True)
def eval_frame_dict(self, trk_objs, gt_objs, rtn_events=False, union=False):
if union:
trk_tlwhs, trk_ids, trk_cls = unzip_objs_cls(trk_objs)[:3]
gt_tlwhs, gt_ids, gt_cls = unzip_objs_cls(gt_objs)[:3]
# get distance matrix
iou_distance = mm.distances.iou_matrix(
gt_tlwhs, trk_tlwhs, max_iou=0.5)
# Set the distance between objects of different categories to nan
gt_cls_len = len(gt_cls)
trk_cls_len = len(trk_cls)
# When the number of GT or Trk is 0, iou_distance dimension is (0,0)
if gt_cls_len != 0 and trk_cls_len != 0:
gt_cls = gt_cls.reshape(gt_cls_len, 1)
gt_cls = np.repeat(gt_cls, trk_cls_len, axis=1)
trk_cls = trk_cls.reshape(1, trk_cls_len)
trk_cls = np.repeat(trk_cls, gt_cls_len, axis=0)
iou_distance = np.where(gt_cls == trk_cls, iou_distance, np.nan)
else:
trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2]
gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2]
# get distance matrix
iou_distance = mm.distances.iou_matrix(
gt_tlwhs, trk_tlwhs, max_iou=0.5)
self.acc.update(gt_ids, trk_ids, iou_distance)
if rtn_events and iou_distance.size > 0 and hasattr(self.acc,
'mot_events'):
events = self.acc.mot_events # only supported by https://github.com/longcw/py-motmetrics
else:
events = None
return events
def eval_file(self, result_filename):
# evaluation of each category
gt_frame_dict = read_results(
self.gt_filename,
self.data_type,
is_gt=True,
multi_class=True,
union=False)
result_frame_dict = read_results(
result_filename,
self.data_type,
is_gt=False,
multi_class=True,
union=False)
for cid in range(self.num_classes):
self.reset_accumulator()
cls_result_frame_dict = result_frame_dict.setdefault(cid, dict())
cls_gt_frame_dict = gt_frame_dict.setdefault(cid, dict())
# only labeled frames will be evaluated
frames = sorted(list(set(cls_gt_frame_dict.keys())))
for frame_id in frames:
trk_objs = cls_result_frame_dict.get(frame_id, [])
gt_objs = cls_gt_frame_dict.get(frame_id, [])
self.eval_frame_dict(trk_objs, gt_objs, rtn_events=False)
self.class_accs.append(self.acc)
return self.class_accs
@staticmethod
def get_summary(accs,
names,
metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1',
'precision', 'recall')):
names = copy.deepcopy(names)
if metrics is None:
metrics = mm.metrics.motchallenge_metrics
metrics = copy.deepcopy(metrics)
mh = mm.metrics.create()
summary = mh.compute_many(
accs, metrics=metrics, names=names, generate_overall=True)
return summary
@staticmethod
def save_summary(summary, filename):
import pandas as pd
writer = pd.ExcelWriter(filename)
summary.to_excel(writer)
writer.save()
class MCMOTMetric(Metric):
def __init__(self, num_classes, save_summary=False):
self.num_classes = num_classes
self.save_summary = save_summary
self.MCMOTEvaluator = MCMOTEvaluator
self.result_root = None
self.reset()
self.seqs_overall = defaultdict(list)
def reset(self):
self.accs = []
self.seqs = []
def update(self, data_root, seq, data_type, result_root, result_filename):
evaluator = self.MCMOTEvaluator(data_root, seq, data_type,
self.num_classes)
seq_acc = evaluator.eval_file(result_filename)
self.accs.append(seq_acc)
self.seqs.append(seq)
self.result_root = result_root
cls_index_name = [
'{}_{}'.format(seq, i) for i in range(self.num_classes)
]
summary = parse_accs_metrics(seq_acc, cls_index_name)
summary.rename(
index={'OVERALL': '{}_OVERALL'.format(seq)}, inplace=True)
for row in range(len(summary)):
self.seqs_overall[row].append(summary.iloc[row:row + 1])
def accumulate(self):
self.cls_summary_list = []
for row in range(self.num_classes):
seqs_cls_df = pd.concat(self.seqs_overall[row])
seqs_cls_summary = seqs_overall_metrics(seqs_cls_df)
cls_summary_overall = seqs_cls_summary.iloc[-1:].copy()
cls_summary_overall.rename(
index={'overall_calc': 'overall_calc_{}'.format(row)},
inplace=True)
self.cls_summary_list.append(cls_summary_overall)
def log(self):
seqs_summary = seqs_overall_metrics(
pd.concat(self.seqs_overall[self.num_classes]), verbose=True)
class_summary = seqs_overall_metrics(
pd.concat(self.cls_summary_list), verbose=True)
def get_results(self):
return 1
================================================
FILE: ppdet/metrics/metrics.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import json
import paddle
import numpy as np
import typing
from collections import defaultdict
from pathlib import Path
from .map_utils import prune_zero_padding, DetectionMAP
from .coco_utils import get_infer_results, cocoapi_eval
from .lvis_utils import lvisapi_eval
from .widerface_utils import (face_eval_run, image_eval, img_pr_info,
dataset_pr_info, voc_ap)
from ppdet.data.source.category import get_categories
from ppdet.modeling.rbox_utils import poly2rbox_np
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
__all__ = [
'Metric', 'COCOMetric', 'VOCMetric', 'WiderFaceMetric', 'get_infer_results',
'RBoxMetric', 'SNIPERCOCOMetric', 'LVISMetric'
]
COCO_SIGMAS = np.array([
.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87,
.89, .89
]) / 10.0
CROWD_SIGMAS = np.array(
[.79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, .79,
.79]) / 10.0
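# The sigmas above parameterize object keypoint similarity (OKS), the IoU
# analogue used by the COCO keypoint evaluator invoked below:
#
#     OKS = sum_i [exp(-d_i^2 / (2 * s^2 * k_i^2)) * delta(v_i > 0)]
#           / sum_i delta(v_i > 0)
#
# where d_i is the distance between predicted and labeled joint i, s is the
# object scale (sqrt of the annotated area), k_i the per-joint sigma, and
# v_i the visibility flag.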
class Metric(paddle.metric.Metric):
def name(self):
return self.__class__.__name__
def reset(self):
pass
def accumulate(self):
pass
    # paddle.metric.Metric defines :meth:`update`, :meth:`accumulate`
    # and :meth:`reset`; in ppdet, we also need the following 2 methods:
# abstract method for logging metric results
def log(self):
pass
# abstract method for getting metric results
def get_results(self):
pass
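# A minimal sketch of a custom metric on top of this base class (editorial,
# hypothetical example -- not part of ppdet; it assumes the model's outputs
# dict carries a 'bbox_num' tensor, as the detectors here produce):
#
#   class BoxCountMetric(Metric):
#       def __init__(self):
#           self.reset()
#
#       def reset(self):
#           self.total = 0
#
#       def update(self, inputs, outputs):
#           # bbox_num holds the number of boxes kept for each image
#           self.total += int(outputs['bbox_num'].sum())
#
#       def log(self):
#           logger.info('total predicted boxes: {}'.format(self.total))
#
#       def get_results(self):
#           return {'box_count': self.total}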
class COCOMetric(Metric):
def __init__(self, anno_file, **kwargs):
self.anno_file = anno_file
self.clsid2catid = kwargs.get('clsid2catid', None)
if self.clsid2catid is None:
self.clsid2catid, _ = get_categories('COCO', anno_file)
self.classwise = kwargs.get('classwise', False)
self.output_eval = kwargs.get('output_eval', None)
# TODO: bias should be unified
self.bias = kwargs.get('bias', 0)
self.save_prediction_only = kwargs.get('save_prediction_only', False)
self.iou_type = kwargs.get('IouType', 'bbox')
if not self.save_prediction_only:
assert os.path.isfile(anno_file), \
"anno_file {} not a file".format(anno_file)
if self.output_eval is not None:
Path(self.output_eval).mkdir(exist_ok=True)
self.save_threshold = kwargs.get('save_threshold', 0)
self.reset()
def reset(self):
        # only bbox and mask evaluation supported currently
self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []}
self.eval_results = {}
def update(self, inputs, outputs):
outs = {}
# outputs Tensor -> numpy.ndarray
for k, v in outputs.items():
outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v
# multi-scale inputs: all inputs have same im_id
if isinstance(inputs, typing.Sequence):
im_id = inputs[0]['im_id']
else:
im_id = inputs['im_id']
outs['im_id'] = im_id.numpy() if isinstance(im_id,
paddle.Tensor) else im_id
if 'im_file' in inputs:
outs['im_file'] = inputs['im_file']
infer_results = get_infer_results(
outs,
self.clsid2catid,
bias=self.bias,
save_threshold=self.save_threshold)
self.results['bbox'] += infer_results[
'bbox'] if 'bbox' in infer_results else []
self.results['mask'] += infer_results[
'mask'] if 'mask' in infer_results else []
self.results['segm'] += infer_results[
'segm'] if 'segm' in infer_results else []
self.results['keypoint'] += infer_results[
'keypoint'] if 'keypoint' in infer_results else []
def accumulate(self):
if len(self.results['bbox']) > 0:
output = "bbox.json"
if self.output_eval:
output = os.path.join(self.output_eval, output)
with open(output, 'w') as f:
json.dump(self.results['bbox'], f)
                logger.info('The bbox result is saved to {}.'.format(output))
            if self.save_prediction_only:
                logger.info('The bbox result is saved to {} and the mAP is '
                            'not evaluated.'.format(output))
else:
bbox_stats = cocoapi_eval(
output,
'bbox',
anno_file=self.anno_file,
classwise=self.classwise)
self.eval_results['bbox'] = bbox_stats
sys.stdout.flush()
if len(self.results['mask']) > 0:
output = "mask.json"
if self.output_eval:
output = os.path.join(self.output_eval, output)
with open(output, 'w') as f:
json.dump(self.results['mask'], f)
                logger.info('The mask result is saved to {}.'.format(output))
            if self.save_prediction_only:
                logger.info('The mask result is saved to {} and the mAP is '
                            'not evaluated.'.format(output))
else:
seg_stats = cocoapi_eval(
output,
'segm',
anno_file=self.anno_file,
classwise=self.classwise)
self.eval_results['mask'] = seg_stats
sys.stdout.flush()
if len(self.results['segm']) > 0:
output = "segm.json"
if self.output_eval:
output = os.path.join(self.output_eval, output)
with open(output, 'w') as f:
json.dump(self.results['segm'], f)
                logger.info('The segm result is saved to {}.'.format(output))
            if self.save_prediction_only:
                logger.info('The segm result is saved to {} and the mAP is '
                            'not evaluated.'.format(output))
else:
seg_stats = cocoapi_eval(
output,
'segm',
anno_file=self.anno_file,
classwise=self.classwise)
self.eval_results['mask'] = seg_stats
sys.stdout.flush()
if len(self.results['keypoint']) > 0:
output = "keypoint.json"
if self.output_eval:
output = os.path.join(self.output_eval, output)
with open(output, 'w') as f:
json.dump(self.results['keypoint'], f)
                logger.info('The keypoint result is saved to {}.'.format(output))
            if self.save_prediction_only:
                logger.info('The keypoint result is saved to {} and the mAP is '
                            'not evaluated.'.format(output))
else:
style = 'keypoints'
use_area = True
sigmas = COCO_SIGMAS
if self.iou_type == 'keypoints_crowd':
style = 'keypoints_crowd'
use_area = False
sigmas = CROWD_SIGMAS
keypoint_stats = cocoapi_eval(
output,
style,
anno_file=self.anno_file,
classwise=self.classwise,
sigmas=sigmas,
use_area=use_area)
self.eval_results['keypoint'] = keypoint_stats
sys.stdout.flush()
def log(self):
pass
def get_results(self):
return self.eval_results
class LVISMetric(Metric):
def __init__(self, anno_file, **kwargs):
self.anno_file = anno_file
self.clsid2catid = kwargs.get('clsid2catid', None)
if self.clsid2catid is None:
self.clsid2catid, _ = get_categories('COCO', anno_file)
self.classwise = kwargs.get('classwise', False)
self.output_eval = kwargs.get('output_eval', None)
# TODO: bias should be unified
self.bias = kwargs.get('bias', 0)
self.save_prediction_only = kwargs.get('save_prediction_only', False)
self.iou_type = kwargs.get('IouType', 'bbox')
if not self.save_prediction_only:
assert os.path.isfile(anno_file), \
"anno_file {} not a file".format(anno_file)
if self.output_eval is not None:
Path(self.output_eval).mkdir(exist_ok=True)
self.reset()
def reset(self):
        # only bbox and mask evaluation supported currently
self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []}
self.eval_results = {}
def update(self, inputs, outputs):
outs = {}
# outputs Tensor -> numpy.ndarray
for k, v in outputs.items():
outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v
# multi-scale inputs: all inputs have same im_id
if isinstance(inputs, typing.Sequence):
im_id = inputs[0]['im_id']
else:
im_id = inputs['im_id']
outs['im_id'] = im_id.numpy() if isinstance(im_id, paddle.Tensor) else im_id
infer_results = get_infer_results(
outs, self.clsid2catid, bias=self.bias)
self.results['bbox'] += infer_results[
'bbox'] if 'bbox' in infer_results else []
self.results['mask'] += infer_results[
'mask'] if 'mask' in infer_results else []
self.results['segm'] += infer_results[
'segm'] if 'segm' in infer_results else []
self.results['keypoint'] += infer_results[
'keypoint'] if 'keypoint' in infer_results else []
def accumulate(self):
if len(self.results['bbox']) > 0:
output = "bbox.json"
if self.output_eval:
output = os.path.join(self.output_eval, output)
with open(output, 'w') as f:
json.dump(self.results['bbox'], f)
                logger.info('The bbox result is saved to {}.'.format(output))
            if self.save_prediction_only:
                logger.info('The bbox result is saved to {} and the mAP is '
                            'not evaluated.'.format(output))
else:
bbox_stats = lvisapi_eval(
output,
'bbox',
anno_file=self.anno_file,
classwise=self.classwise
)
self.eval_results['bbox'] = bbox_stats
sys.stdout.flush()
if len(self.results['mask']) > 0:
output = "mask.json"
if self.output_eval:
output = os.path.join(self.output_eval, output)
with open(output, 'w') as f:
json.dump(self.results['mask'], f)
                logger.info('The mask result is saved to {}.'.format(output))
            if self.save_prediction_only:
                logger.info('The mask result is saved to {} and the mAP is '
                            'not evaluated.'.format(output))
else:
seg_stats = cocoapi_eval(
output,
'segm',
anno_file=self.anno_file,
classwise=self.classwise)
self.eval_results['mask'] = seg_stats
sys.stdout.flush()
if len(self.results['segm']) > 0:
output = "segm.json"
if self.output_eval:
output = os.path.join(self.output_eval, output)
with open(output, 'w') as f:
json.dump(self.results['segm'], f)
                logger.info('The segm result is saved to {}.'.format(output))
            if self.save_prediction_only:
                logger.info('The segm result is saved to {} and the mAP is '
                            'not evaluated.'.format(output))
else:
seg_stats = cocoapi_eval(
output,
'segm',
anno_file=self.anno_file,
classwise=self.classwise)
self.eval_results['mask'] = seg_stats
sys.stdout.flush()
if len(self.results['keypoint']) > 0:
output = "keypoint.json"
if self.output_eval:
output = os.path.join(self.output_eval, output)
with open(output, 'w') as f:
json.dump(self.results['keypoint'], f)
                logger.info('The keypoint result is saved to {}.'.format(output))
            if self.save_prediction_only:
                logger.info('The keypoint result is saved to {} and the mAP is '
                            'not evaluated.'.format(output))
else:
style = 'keypoints'
use_area = True
sigmas = COCO_SIGMAS
if self.iou_type == 'keypoints_crowd':
style = 'keypoints_crowd'
use_area = False
sigmas = CROWD_SIGMAS
keypoint_stats = cocoapi_eval(
output,
style,
anno_file=self.anno_file,
classwise=self.classwise,
sigmas=sigmas,
use_area=use_area)
self.eval_results['keypoint'] = keypoint_stats
sys.stdout.flush()
    def log(self):
        if 'bbox' in self.eval_results:
            logger.info(self.eval_results['bbox'])
def get_results(self):
return self.eval_results
class VOCMetric(Metric):
def __init__(self,
label_list,
class_num=20,
overlap_thresh=0.5,
map_type='11point',
is_bbox_normalized=False,
evaluate_difficult=False,
classwise=False,
output_eval=None,
save_prediction_only=False):
assert os.path.isfile(label_list), \
"label_list {} not a file".format(label_list)
self.clsid2catid, self.catid2name = get_categories('VOC', label_list)
self.overlap_thresh = overlap_thresh
self.map_type = map_type
self.evaluate_difficult = evaluate_difficult
self.output_eval = output_eval
self.save_prediction_only = save_prediction_only
self.detection_map = DetectionMAP(
class_num=class_num,
overlap_thresh=overlap_thresh,
map_type=map_type,
is_bbox_normalized=is_bbox_normalized,
evaluate_difficult=evaluate_difficult,
catid2name=self.catid2name,
classwise=classwise)
self.reset()
def reset(self):
self.results = {'bbox': [], 'score': [], 'label': []}
self.detection_map.reset()
def update(self, inputs, outputs):
bbox_np = outputs['bbox'].numpy() if isinstance(
outputs['bbox'], paddle.Tensor) else outputs['bbox']
bboxes = bbox_np[:, 2:]
scores = bbox_np[:, 1]
labels = bbox_np[:, 0]
bbox_lengths = outputs['bbox_num'].numpy() if isinstance(
outputs['bbox_num'], paddle.Tensor) else outputs['bbox_num']
self.results['bbox'].append(bboxes.tolist())
self.results['score'].append(scores.tolist())
self.results['label'].append(labels.tolist())
        if bboxes is None or bboxes.shape == (1, 1):
            return
if self.save_prediction_only:
return
gt_boxes = inputs['gt_bbox']
gt_labels = inputs['gt_class']
difficults = inputs['difficult'] if not self.evaluate_difficult \
else None
if 'scale_factor' in inputs:
scale_factor = inputs['scale_factor'].numpy() if isinstance(
inputs['scale_factor'],
paddle.Tensor) else inputs['scale_factor']
else:
scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32')
bbox_idx = 0
for i in range(len(gt_boxes)):
gt_box = gt_boxes[i].numpy() if isinstance(
gt_boxes[i], paddle.Tensor) else gt_boxes[i]
h, w = scale_factor[i]
gt_box = gt_box / np.array([w, h, w, h])
gt_label = gt_labels[i].numpy() if isinstance(
gt_labels[i], paddle.Tensor) else gt_labels[i]
if difficults is not None:
difficult = difficults[i].numpy() if isinstance(
difficults[i], paddle.Tensor) else difficults[i]
else:
difficult = None
bbox_num = bbox_lengths[i]
bbox = bboxes[bbox_idx:bbox_idx + bbox_num]
score = scores[bbox_idx:bbox_idx + bbox_num]
label = labels[bbox_idx:bbox_idx + bbox_num]
gt_box, gt_label, difficult = prune_zero_padding(gt_box, gt_label,
difficult)
self.detection_map.update(bbox, score, label, gt_box, gt_label,
difficult)
bbox_idx += bbox_num
def accumulate(self):
output = "bbox.json"
if self.output_eval:
output = os.path.join(self.output_eval, output)
with open(output, 'w') as f:
json.dump(self.results, f)
            logger.info('The bbox result is saved to {}.'.format(output))
if self.save_prediction_only:
return
logger.info("Accumulating evaluatation results...")
self.detection_map.accumulate()
def log(self):
map_stat = 100. * self.detection_map.get_map()
logger.info("mAP({:.2f}, {}) = {:.2f}%".format(self.overlap_thresh,
self.map_type, map_stat))
def get_results(self):
return {'bbox': [self.detection_map.get_map()]}
class WiderFaceMetric(Metric):
def __init__(self, iou_thresh=0.5):
self.iou_thresh = iou_thresh
self.reset()
def reset(self):
self.pred_boxes_list = []
self.gt_boxes_list = []
self.aps = []
self.hard_ignore_list = []
self.medium_ignore_list = []
self.easy_ignore_list = []
def update(self, data, outs):
batch_pred_bboxes = outs['bbox']
batch_pred_bboxes_num = outs['bbox_num']
assert len(batch_pred_bboxes_num) == len(data['gt_bbox'])
batch_size = len(data['gt_bbox'])
box_cnt = 0
for batch_id in range(batch_size):
pred_bboxes_num = batch_pred_bboxes_num[batch_id]
pred_bboxes = batch_pred_bboxes[box_cnt: box_cnt +
pred_bboxes_num].numpy()
box_cnt += pred_bboxes_num
det_conf = pred_bboxes[:, 1]
det_xmin = pred_bboxes[:, 2]
det_ymin = pred_bboxes[:, 3]
det_xmax = pred_bboxes[:, 4]
det_ymax = pred_bboxes[:, 5]
det = np.column_stack((det_xmin, det_ymin, det_xmax,
det_ymax, det_conf))
self.pred_boxes_list.append(det) # xyxy conf
self.gt_boxes_list.append(data['gt_ori_bbox'][batch_id].numpy()) # xywh
self.hard_ignore_list.append(
data['gt_hard_ignore'][batch_id].numpy())
self.medium_ignore_list.append(
data['gt_medium_ignore'][batch_id].numpy())
self.easy_ignore_list.append(
data['gt_easy_ignore'][batch_id].numpy())
def accumulate(self):
total_num = len(self.gt_boxes_list)
settings = ['easy', 'medium', 'hard']
        setting_ignores = [self.easy_ignore_list,
                           self.medium_ignore_list,
                           self.hard_ignore_list]
thresh_num = 1000
aps = []
for setting_id in range(3):
count_face = 0
pr_curve = np.zeros((thresh_num, 2)).astype(np.float32)
            gt_ignore_list = setting_ignores[setting_id]
for i in range(total_num):
pred_boxes = self.pred_boxes_list[i] # xyxy conf
gt_boxes = self.gt_boxes_list[i] # xywh
ignore = gt_ignore_list[i]
count_face += np.sum(ignore)
if len(gt_boxes) == 0 or len(pred_boxes) == 0:
continue
pred_recall, proposal_list = image_eval(pred_boxes, gt_boxes,
ignore, self.iou_thresh)
_img_pr_info = img_pr_info(thresh_num, pred_boxes,
proposal_list, pred_recall)
pr_curve += _img_pr_info
pr_curve = dataset_pr_info(thresh_num, pr_curve, count_face)
propose = pr_curve[:, 0]
recall = pr_curve[:, 1]
ap = voc_ap(recall, propose)
aps.append(ap)
self.aps = aps
def log(self):
logger.info("==================== Results ====================")
logger.info("Easy Val AP: {}".format(self.aps[0]))
logger.info("Medium Val AP: {}".format(self.aps[1]))
logger.info("Hard Val AP: {}".format(self.aps[2]))
logger.info("=================================================")
def get_results(self):
return {
'easy_ap': self.aps[0],
'medium_ap': self.aps[1],
'hard_ap': self.aps[2]}
class RBoxMetric(Metric):
def __init__(self, anno_file, **kwargs):
self.anno_file = anno_file
self.clsid2catid, self.catid2name = get_categories('RBOX', anno_file)
self.catid2clsid = {v: k for k, v in self.clsid2catid.items()}
self.classwise = kwargs.get('classwise', False)
self.output_eval = kwargs.get('output_eval', None)
self.save_prediction_only = kwargs.get('save_prediction_only', False)
self.overlap_thresh = kwargs.get('overlap_thresh', 0.5)
self.map_type = kwargs.get('map_type', '11point')
self.evaluate_difficult = kwargs.get('evaluate_difficult', False)
self.imid2path = kwargs.get('imid2path', None)
class_num = len(self.catid2name)
self.detection_map = DetectionMAP(
class_num=class_num,
overlap_thresh=self.overlap_thresh,
map_type=self.map_type,
is_bbox_normalized=False,
evaluate_difficult=self.evaluate_difficult,
catid2name=self.catid2name,
classwise=self.classwise)
self.reset()
def reset(self):
self.results = []
self.detection_map.reset()
def update(self, inputs, outputs):
outs = {}
# outputs Tensor -> numpy.ndarray
for k, v in outputs.items():
outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v
im_id = inputs['im_id']
im_id = im_id.numpy() if isinstance(im_id, paddle.Tensor) else im_id
outs['im_id'] = im_id
infer_results = get_infer_results(outs, self.clsid2catid)
infer_results = infer_results['bbox'] if 'bbox' in infer_results else []
self.results += infer_results
if self.save_prediction_only:
return
gt_boxes = inputs['gt_poly']
gt_labels = inputs['gt_class']
if 'scale_factor' in inputs:
scale_factor = inputs['scale_factor'].numpy() if isinstance(
inputs['scale_factor'],
paddle.Tensor) else inputs['scale_factor']
else:
scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32')
for i in range(len(gt_boxes)):
gt_box = gt_boxes[i].numpy() if isinstance(
gt_boxes[i], paddle.Tensor) else gt_boxes[i]
h, w = scale_factor[i]
gt_box = gt_box / np.array([w, h, w, h, w, h, w, h])
gt_label = gt_labels[i].numpy() if isinstance(
gt_labels[i], paddle.Tensor) else gt_labels[i]
gt_box, gt_label, _ = prune_zero_padding(gt_box, gt_label)
bbox = [
res['bbox'] for res in infer_results
if int(res['image_id']) == int(im_id[i])
]
score = [
res['score'] for res in infer_results
if int(res['image_id']) == int(im_id[i])
]
label = [
self.catid2clsid[int(res['category_id'])]
for res in infer_results
if int(res['image_id']) == int(im_id[i])
]
self.detection_map.update(bbox, score, label, gt_box, gt_label)
def save_results(self, results, output_dir, imid2path):
if imid2path:
data_dicts = defaultdict(list)
for result in results:
image_id = result['image_id']
data_dicts[image_id].append(result)
for image_id, image_path in imid2path.items():
basename = os.path.splitext(os.path.split(image_path)[-1])[0]
output = os.path.join(output_dir, "{}.txt".format(basename))
dets = data_dicts.get(image_id, [])
with open(output, 'w') as f:
for det in dets:
catid, bbox, score = det['category_id'], det[
'bbox'], det['score']
bbox_pred = '{} {} '.format(self.catid2name[catid],
score) + ' '.join(
[str(e) for e in bbox])
f.write(bbox_pred + '\n')
logger.info('The bbox result is saved to {}.'.format(output_dir))
else:
output = os.path.join(output_dir, "bbox.json")
with open(output, 'w') as f:
json.dump(results, f)
logger.info('The bbox result is saved to {}.'.format(output))
def accumulate(self):
if self.output_eval:
self.save_results(self.results, self.output_eval, self.imid2path)
if not self.save_prediction_only:
logger.info("Accumulating evaluatation results...")
self.detection_map.accumulate()
def log(self):
map_stat = 100. * self.detection_map.get_map()
logger.info("mAP({:.2f}, {}) = {:.2f}%".format(self.overlap_thresh,
self.map_type, map_stat))
def get_results(self):
return {'bbox': [self.detection_map.get_map()]}
class SNIPERCOCOMetric(COCOMetric):
def __init__(self, anno_file, **kwargs):
super(SNIPERCOCOMetric, self).__init__(anno_file, **kwargs)
self.dataset = kwargs["dataset"]
self.chip_results = []
def reset(self):
        # only bbox and mask evaluation supported currently
self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []}
self.eval_results = {}
self.chip_results = []
def update(self, inputs, outputs):
outs = {}
# outputs Tensor -> numpy.ndarray
for k, v in outputs.items():
outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v
im_id = inputs['im_id']
outs['im_id'] = im_id.numpy() if isinstance(im_id,
paddle.Tensor) else im_id
self.chip_results.append(outs)
def accumulate(self):
results = self.dataset.anno_cropper.aggregate_chips_detections(
self.chip_results)
for outs in results:
infer_results = get_infer_results(
outs, self.clsid2catid, bias=self.bias)
self.results['bbox'] += infer_results[
'bbox'] if 'bbox' in infer_results else []
super(SNIPERCOCOMetric, self).accumulate()
================================================
FILE: ppdet/metrics/mot_metrics.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import copy
import sys
import math
from collections import defaultdict
import numpy as np
from ppdet.modeling.bbox_utils import bbox_iou_np_expand
from .map_utils import ap_per_class
from .metrics import Metric
from .munkres import Munkres
try:
import motmetrics as mm
mm.lap.default_solver = 'lap'
except Exception:
    print(
        'Warning: Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'
    )
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
__all__ = ['MOTEvaluator', 'MOTMetric', 'JDEDetMetric', 'KITTIMOTMetric']
def read_mot_results(filename, is_gt=False, is_ignore=False):
valid_label = [1]
ignore_labels = [2, 7, 8, 12] # only in motchallenge datasets like 'MOT16'
if is_gt:
logger.info(
"In MOT16/17 dataset the valid_label of ground truth is '{}', "
"in other dataset it should be '0' for single classs MOT.".format(
valid_label[0]))
results_dict = dict()
if os.path.isfile(filename):
with open(filename, 'r') as f:
for line in f.readlines():
linelist = line.split(',')
if len(linelist) < 7:
continue
fid = int(linelist[0])
if fid < 1:
continue
results_dict.setdefault(fid, list())
if is_gt:
label = int(float(linelist[7]))
mark = int(float(linelist[6]))
if mark == 0 or label not in valid_label:
continue
score = 1
elif is_ignore:
if 'MOT16-' in filename or 'MOT17-' in filename or 'MOT15-' in filename or 'MOT20-' in filename:
label = int(float(linelist[7]))
vis_ratio = float(linelist[8])
if label not in ignore_labels and vis_ratio >= 0:
continue
else:
continue
score = 1
else:
score = float(linelist[6])
tlwh = tuple(map(float, linelist[2:6]))
target_id = int(linelist[1])
results_dict[fid].append((tlwh, target_id, score))
return results_dict
"""
MOT dataset label list, see in https://motchallenge.net
labels={'ped', ... % 1
'person_on_vhcl', ... % 2
'car', ... % 3
'bicycle', ... % 4
'mbike', ... % 5
'non_mot_vhcl', ... % 6
'static_person', ... % 7
'distractor', ... % 8
'occluder', ... % 9
'occluder_on_grnd', ... % 10
'occluder_full', ... % 11
'reflection', ... % 12
'crowd' ... % 13
};
"""
def unzip_objs(objs):
if len(objs) > 0:
tlwhs, ids, scores = zip(*objs)
else:
tlwhs, ids, scores = [], [], []
tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4)
return tlwhs, ids, scores
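# Illustrative sketch (with made-up file content) of the MOTChallenge txt
# format consumed by read_mot_results(): each row is
# "frame,id,x,y,w,h,score_or_mark,label,visibility". read_mot_results()
# groups rows by frame id, and unzip_objs() splits one frame's rows into
# separate tlwh/id/score arrays.
def _read_mot_results_sketch(path='demo_results.txt'):
    with open(path, 'w') as f:
        f.write('1,2,100.0,120.0,30.0,60.0,0.9,-1,-1\n')
        f.write('1,3,200.0,150.0,25.0,55.0,0.8,-1,-1\n')
    frame_dict = read_mot_results(path, is_gt=False)
    tlwhs, ids, scores = unzip_objs(frame_dict[1])
    return tlwhs.shape, list(ids), list(scores)  # ((2, 4), [2, 3], [0.9, 0.8])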
class MOTEvaluator(object):
def __init__(self, data_root, seq_name, data_type):
self.data_root = data_root
self.seq_name = seq_name
self.data_type = data_type
self.load_annotations()
try:
import motmetrics as mm
mm.lap.default_solver = 'lap'
except Exception as e:
raise RuntimeError(
'Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'
)
self.reset_accumulator()
def load_annotations(self):
assert self.data_type == 'mot'
gt_filename = os.path.join(self.data_root, self.seq_name, 'gt',
'gt.txt')
if not os.path.exists(gt_filename):
            logger.warning(
                "gt_filename '{}' of MOTEvaluator does not exist, so the MOTA will be -INF.".
                format(gt_filename))
self.gt_frame_dict = read_mot_results(gt_filename, is_gt=True)
self.gt_ignore_frame_dict = read_mot_results(
gt_filename, is_ignore=True)
def reset_accumulator(self):
self.acc = mm.MOTAccumulator(auto_id=True)
def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False):
# results
trk_tlwhs = np.copy(trk_tlwhs)
trk_ids = np.copy(trk_ids)
# gts
gt_objs = self.gt_frame_dict.get(frame_id, [])
gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2]
# ignore boxes
ignore_objs = self.gt_ignore_frame_dict.get(frame_id, [])
ignore_tlwhs = unzip_objs(ignore_objs)[0]
# remove ignored results
keep = np.ones(len(trk_tlwhs), dtype=bool)
iou_distance = mm.distances.iou_matrix(
ignore_tlwhs, trk_tlwhs, max_iou=0.5)
if len(iou_distance) > 0:
match_is, match_js = mm.lap.linear_sum_assignment(iou_distance)
            match_is, match_js = map(lambda a: np.asarray(a, dtype=int),
                                     [match_is, match_js])
            match_ious = iou_distance[match_is, match_js]
match_js = match_js[np.logical_not(np.isnan(match_ious))]
keep[match_js] = False
trk_tlwhs = trk_tlwhs[keep]
trk_ids = trk_ids[keep]
# get distance matrix
iou_distance = mm.distances.iou_matrix(gt_tlwhs, trk_tlwhs, max_iou=0.5)
# acc
self.acc.update(gt_ids, trk_ids, iou_distance)
if rtn_events and iou_distance.size > 0 and hasattr(self.acc,
'last_mot_events'):
events = self.acc.last_mot_events # only supported by https://github.com/longcw/py-motmetrics
else:
events = None
return events
def eval_file(self, filename):
self.reset_accumulator()
result_frame_dict = read_mot_results(filename, is_gt=False)
frames = sorted(list(set(result_frame_dict.keys())))
for frame_id in frames:
trk_objs = result_frame_dict.get(frame_id, [])
trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2]
self.eval_frame(frame_id, trk_tlwhs, trk_ids, rtn_events=False)
return self.acc
@staticmethod
def get_summary(accs,
names,
metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1',
'precision', 'recall')):
names = copy.deepcopy(names)
if metrics is None:
metrics = mm.metrics.motchallenge_metrics
metrics = copy.deepcopy(metrics)
mh = mm.metrics.create()
summary = mh.compute_many(
accs, metrics=metrics, names=names, generate_overall=True)
return summary
@staticmethod
def save_summary(summary, filename):
import pandas as pd
        with pd.ExcelWriter(filename) as writer:
            summary.to_excel(writer)
class MOTMetric(Metric):
def __init__(self, save_summary=False):
self.save_summary = save_summary
self.MOTEvaluator = MOTEvaluator
self.result_root = None
self.reset()
def reset(self):
self.accs = []
self.seqs = []
def update(self, data_root, seq, data_type, result_root, result_filename):
evaluator = self.MOTEvaluator(data_root, seq, data_type)
self.accs.append(evaluator.eval_file(result_filename))
self.seqs.append(seq)
self.result_root = result_root
def accumulate(self):
metrics = mm.metrics.motchallenge_metrics
mh = mm.metrics.create()
summary = self.MOTEvaluator.get_summary(self.accs, self.seqs, metrics)
self.strsummary = mm.io.render_summary(
summary,
formatters=mh.formatters,
namemap=mm.io.motchallenge_metric_names)
if self.save_summary:
self.MOTEvaluator.save_summary(
summary, os.path.join(self.result_root, 'summary.xlsx'))
def log(self):
print(self.strsummary)
def get_results(self):
return self.strsummary
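# Hypothetical driver for MOTMetric: update() is called once per sequence
# with the tracker's txt output for that sequence (the dataset and output
# paths below are placeholders for illustration only).
def _mot_metric_sketch(seqs=('MOT16-02', 'MOT16-04')):
    metric = MOTMetric(save_summary=False)
    for seq in seqs:
        metric.update('dataset/mot/MOT16/train', seq, 'mot',
                      'output/mot_results',
                      'output/mot_results/{}.txt'.format(seq))
    metric.accumulate()  # builds the rendered motmetrics summary table
    metric.log()
    return metric.get_results()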
class JDEDetMetric(Metric):
# Note this detection AP metric is different from COCOMetric or VOCMetric,
# and the bboxes coordinates are not scaled to the original image
def __init__(self, overlap_thresh=0.5):
self.overlap_thresh = overlap_thresh
self.reset()
def reset(self):
self.AP_accum = np.zeros(1)
self.AP_accum_count = np.zeros(1)
def update(self, inputs, outputs):
bboxes = outputs['bbox'][:, 2:].numpy()
scores = outputs['bbox'][:, 1].numpy()
labels = outputs['bbox'][:, 0].numpy()
bbox_lengths = outputs['bbox_num'].numpy()
if bboxes.shape[0] == 1 and bboxes.sum() == 0.0:
return
gt_boxes = inputs['gt_bbox'].numpy()[0]
gt_labels = inputs['gt_class'].numpy()[0]
if gt_labels.shape[0] == 0:
return
correct = []
detected = []
for i in range(bboxes.shape[0]):
obj_pred = 0
pred_bbox = bboxes[i].reshape(1, 4)
# Compute iou with target boxes
iou = bbox_iou_np_expand(pred_bbox, gt_boxes, x1y1x2y2=True)[0]
# Extract index of largest overlap
best_i = np.argmax(iou)
# If overlap exceeds threshold and classification is correct mark as correct
if iou[best_i] > self.overlap_thresh and obj_pred == gt_labels[
best_i] and best_i not in detected:
correct.append(1)
detected.append(best_i)
else:
correct.append(0)
# Compute Average Precision (AP) per class
target_cls = list(gt_labels.T[0])
AP, AP_class, R, P = ap_per_class(
tp=correct,
conf=scores,
pred_cls=np.zeros_like(scores),
target_cls=target_cls)
self.AP_accum_count += np.bincount(AP_class, minlength=1)
self.AP_accum += np.bincount(AP_class, minlength=1, weights=AP)
def accumulate(self):
logger.info("Accumulating evaluatation results...")
self.map_stat = self.AP_accum[0] / (self.AP_accum_count[0] + 1E-16)
def log(self):
map_stat = 100. * self.map_stat
logger.info("mAP({:.2f}) = {:.2f}%".format(self.overlap_thresh,
map_stat))
def get_results(self):
return self.map_stat
"""
The following code is borrowed from https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/evaluate_tracking.py
"""
class tData:
"""
Utility class to load data.
"""
def __init__(self,frame=-1,obj_type="unset",truncation=-1,occlusion=-1,\
obs_angle=-10,x1=-1,y1=-1,x2=-1,y2=-1,w=-1,h=-1,l=-1,\
X=-1000,Y=-1000,Z=-1000,yaw=-10,score=-1000,track_id=-1):
"""
Constructor, initializes the object given the parameters.
"""
self.frame = frame
self.track_id = track_id
self.obj_type = obj_type
self.truncation = truncation
self.occlusion = occlusion
self.obs_angle = obs_angle
self.x1 = x1
self.y1 = y1
self.x2 = x2
self.y2 = y2
self.w = w
self.h = h
self.l = l
self.X = X
self.Y = Y
self.Z = Z
self.yaw = yaw
self.score = score
self.ignored = False
self.valid = False
self.tracker = -1
def __str__(self):
attrs = vars(self)
return '\n'.join("%s: %s" % item for item in attrs.items())
class KITTIEvaluation(object):
""" KITTI tracking statistics (CLEAR MOT, id-switches, fragments, ML/PT/MT, precision/recall)
MOTA - Multi-object tracking accuracy in [0,100]
MOTP - Multi-object tracking precision in [0,100] (3D) / [td,100] (2D)
MOTAL - Multi-object tracking accuracy in [0,100] with log10(id-switches)
id-switches - number of id switches
fragments - number of fragmentations
MT, PT, ML - number of mostly tracked, partially tracked and mostly lost trajectories
recall - recall = percentage of detected targets
precision - precision = percentage of correctly detected targets
FAR - number of false alarms per frame
falsepositives - number of false positives (FP)
missed - number of missed targets (FN)
"""
def __init__(self, result_path, gt_path, min_overlap=0.5, max_truncation = 0,\
min_height = 25, max_occlusion = 2, cls="car",\
n_frames=[], seqs=[], n_sequences=0):
# get number of sequences and
# get number of frames per sequence from test mapping
# (created while extracting the benchmark)
self.gt_path = os.path.join(gt_path, "../labels")
self.n_frames = n_frames
self.sequence_name = seqs
self.n_sequences = n_sequences
self.cls = cls # class to evaluate, i.e. pedestrian or car
self.result_path = result_path
# statistics and numbers for evaluation
self.n_gt = 0 # number of ground truth detections minus ignored false negatives and true positives
self.n_igt = 0 # number of ignored ground truth detections
self.n_gts = [
] # number of ground truth detections minus ignored false negatives and true positives PER SEQUENCE
self.n_igts = [
] # number of ground ignored truth detections PER SEQUENCE
self.n_gt_trajectories = 0
self.n_gt_seq = []
self.n_tr = 0 # number of tracker detections minus ignored tracker detections
self.n_trs = [
] # number of tracker detections minus ignored tracker detections PER SEQUENCE
self.n_itr = 0 # number of ignored tracker detections
self.n_itrs = [] # number of ignored tracker detections PER SEQUENCE
self.n_igttr = 0 # number of ignored ground truth detections where the corresponding associated tracker detection is also ignored
self.n_tr_trajectories = 0
self.n_tr_seq = []
self.MOTA = 0
self.MOTP = 0
self.MOTAL = 0
self.MODA = 0
self.MODP = 0
self.MODP_t = []
self.recall = 0
self.precision = 0
self.F1 = 0
self.FAR = 0
self.total_cost = 0
self.itp = 0 # number of ignored true positives
self.itps = [] # number of ignored true positives PER SEQUENCE
self.tp = 0 # number of true positives including ignored true positives!
self.tps = [
] # number of true positives including ignored true positives PER SEQUENCE
self.fn = 0 # number of false negatives WITHOUT ignored false negatives
self.fns = [
] # number of false negatives WITHOUT ignored false negatives PER SEQUENCE
self.ifn = 0 # number of ignored false negatives
self.ifns = [] # number of ignored false negatives PER SEQUENCE
self.fp = 0 # number of false positives
# a bit tricky, the number of ignored false negatives and ignored true positives
# is subtracted, but if both tracker detection and ground truth detection
# are ignored this number is added again to avoid double counting
self.fps = [] # above PER SEQUENCE
self.mme = 0
self.fragments = 0
self.id_switches = 0
self.MT = 0
self.PT = 0
self.ML = 0
self.min_overlap = min_overlap # minimum bounding box overlap for 3rd party metrics
self.max_truncation = max_truncation # maximum truncation of an object for evaluation
self.max_occlusion = max_occlusion # maximum occlusion of an object for evaluation
self.min_height = min_height # minimum height of an object for evaluation
self.n_sample_points = 500
# this should be enough to hold all groundtruth trajectories
# is expanded if necessary and reduced in any case
self.gt_trajectories = [[] for x in range(self.n_sequences)]
self.ign_trajectories = [[] for x in range(self.n_sequences)]
def loadGroundtruth(self):
try:
self._loadData(self.gt_path, cls=self.cls, loading_groundtruth=True)
except IOError:
return False
return True
def loadTracker(self):
try:
if not self._loadData(
self.result_path, cls=self.cls, loading_groundtruth=False):
return False
except IOError:
return False
return True
def _loadData(self,
root_dir,
cls,
min_score=-1000,
loading_groundtruth=False):
"""
Generic loader for ground truth and tracking data.
Use loadGroundtruth() or loadTracker() to load this data.
Loads detections in KITTI format from textfiles.
"""
# construct objectDetections object to hold detection data
t_data = tData()
data = []
eval_2d = True
eval_3d = True
seq_data = []
n_trajectories = 0
n_trajectories_seq = []
for seq, s_name in enumerate(self.sequence_name):
i = 0
filename = os.path.join(root_dir, "%s.txt" % s_name)
f = open(filename, "r")
f_data = [
[] for x in range(self.n_frames[seq])
] # current set has only 1059 entries, sufficient length is checked anyway
ids = []
n_in_seq = 0
id_frame_cache = []
for line in f:
# KITTI tracking benchmark data format:
# (frame,tracklet_id,objectType,truncation,occlusion,alpha,x1,y1,x2,y2,h,w,l,X,Y,Z,ry)
line = line.strip()
fields = line.split(" ")
# classes that should be loaded (ignored neighboring classes)
if "car" in cls.lower():
classes = ["car", "van"]
elif "pedestrian" in cls.lower():
classes = ["pedestrian", "person_sitting"]
else:
classes = [cls.lower()]
classes += ["dontcare"]
if not any([s for s in classes if s in fields[2].lower()]):
continue
# get fields from table
t_data.frame = int(float(fields[0])) # frame
t_data.track_id = int(float(fields[1])) # id
t_data.obj_type = fields[
2].lower() # object type [car, pedestrian, cyclist, ...]
t_data.truncation = int(
float(fields[3])) # truncation [-1,0,1,2]
t_data.occlusion = int(
float(fields[4])) # occlusion [-1,0,1,2]
t_data.obs_angle = float(fields[5]) # observation angle [rad]
t_data.x1 = float(fields[6]) # left [px]
t_data.y1 = float(fields[7]) # top [px]
t_data.x2 = float(fields[8]) # right [px]
t_data.y2 = float(fields[9]) # bottom [px]
t_data.h = float(fields[10]) # height [m]
t_data.w = float(fields[11]) # width [m]
t_data.l = float(fields[12]) # length [m]
t_data.X = float(fields[13]) # X [m]
t_data.Y = float(fields[14]) # Y [m]
t_data.Z = float(fields[15]) # Z [m]
t_data.yaw = float(fields[16]) # yaw angle [rad]
if not loading_groundtruth:
if len(fields) == 17:
t_data.score = -1
elif len(fields) == 18:
t_data.score = float(fields[17]) # detection score
else:
logger.info("file is not in KITTI format")
return
# do not consider objects marked as invalid
if t_data.track_id is -1 and t_data.obj_type != "dontcare":
continue
idx = t_data.frame
# check if length for frame data is sufficient
if idx >= len(f_data):
print("extend f_data", idx, len(f_data))
f_data += [[] for x in range(max(500, idx - len(f_data)))]
try:
id_frame = (t_data.frame, t_data.track_id)
if id_frame in id_frame_cache and not loading_groundtruth:
logger.info(
"track ids are not unique for sequence %d: frame %d"
% (seq, t_data.frame))
logger.info(
"track id %d occurred at least twice for this frame"
% t_data.track_id)
logger.info("Exiting...")
#continue # this allows to evaluate non-unique result files
return False
id_frame_cache.append(id_frame)
f_data[t_data.frame].append(copy.copy(t_data))
except:
print(len(f_data), idx)
raise
if t_data.track_id not in ids and t_data.obj_type != "dontcare":
ids.append(t_data.track_id)
n_trajectories += 1
n_in_seq += 1
# check if uploaded data provides information for 2D and 3D evaluation
if not loading_groundtruth and eval_2d is True and (
t_data.x1 == -1 or t_data.x2 == -1 or t_data.y1 == -1 or
t_data.y2 == -1):
eval_2d = False
if not loading_groundtruth and eval_3d is True and (
t_data.X == -1000 or t_data.Y == -1000 or
t_data.Z == -1000):
eval_3d = False
# only add existing frames
n_trajectories_seq.append(n_in_seq)
seq_data.append(f_data)
f.close()
if not loading_groundtruth:
self.tracker = seq_data
self.n_tr_trajectories = n_trajectories
self.eval_2d = eval_2d
self.eval_3d = eval_3d
self.n_tr_seq = n_trajectories_seq
if self.n_tr_trajectories == 0:
return False
else:
# split ground truth and DontCare areas
self.dcareas = []
self.groundtruth = []
for seq_idx in range(len(seq_data)):
seq_gt = seq_data[seq_idx]
s_g, s_dc = [], []
for f in range(len(seq_gt)):
all_gt = seq_gt[f]
g, dc = [], []
for gg in all_gt:
if gg.obj_type == "dontcare":
dc.append(gg)
else:
g.append(gg)
s_g.append(g)
s_dc.append(dc)
self.dcareas.append(s_dc)
self.groundtruth.append(s_g)
self.n_gt_seq = n_trajectories_seq
self.n_gt_trajectories = n_trajectories
return True
def boxoverlap(self, a, b, criterion="union"):
"""
boxoverlap computes intersection over union for bbox a and b in KITTI format.
        If the criterion is 'union', overlap = (a inter b) / (a union b).
If the criterion is 'a', overlap = (a inter b) / a, where b should be a dontcare area.
"""
x1 = max(a.x1, b.x1)
y1 = max(a.y1, b.y1)
x2 = min(a.x2, b.x2)
y2 = min(a.y2, b.y2)
w = x2 - x1
h = y2 - y1
if w <= 0. or h <= 0.:
return 0.
inter = w * h
aarea = (a.x2 - a.x1) * (a.y2 - a.y1)
barea = (b.x2 - b.x1) * (b.y2 - b.y1)
# intersection over union overlap
if criterion.lower() == "union":
o = inter / float(aarea + barea - inter)
elif criterion.lower() == "a":
o = float(inter) / float(aarea)
else:
raise TypeError("Unkown type for criterion")
return o
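    # Worked example (values illustrative): for a = (0, 0, 10, 10) and
    # b = (5, 5, 15, 15), inter = 5 * 5 = 25, union = 100 + 100 - 25 = 175,
    # so the "union" criterion gives overlap = 25 / 175 ~= 0.143.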
def compute3rdPartyMetrics(self):
"""
Computes the metrics defined in
- Stiefelhagen 2008: Evaluating Multiple Object Tracking Performance: The CLEAR MOT Metrics
MOTA, MOTAL, MOTP
- Nevatia 2008: Global Data Association for Multi-Object Tracking Using Network Flows
MT/PT/ML
"""
# construct Munkres object for Hungarian Method association
hm = Munkres()
max_cost = 1e9
# go through all frames and associate ground truth and tracker results
# groundtruth and tracker contain lists for every single frame containing lists of KITTI format detections
fr, ids = 0, 0
for seq_idx in range(len(self.groundtruth)):
seq_gt = self.groundtruth[seq_idx]
seq_dc = self.dcareas[seq_idx] # don't care areas
seq_tracker = self.tracker[seq_idx]
seq_trajectories = defaultdict(list)
seq_ignored = defaultdict(list)
# statistics over the current sequence, check the corresponding
# variable comments in __init__ to get their meaning
seqtp = 0
seqitp = 0
seqfn = 0
seqifn = 0
seqfp = 0
seqigt = 0
seqitr = 0
last_ids = [[], []]
n_gts = 0
n_trs = 0
for f in range(len(seq_gt)):
g = seq_gt[f]
dc = seq_dc[f]
t = seq_tracker[f]
# counting total number of ground truth and tracker objects
self.n_gt += len(g)
self.n_tr += len(t)
n_gts += len(g)
n_trs += len(t)
# use hungarian method to associate, using boxoverlap 0..1 as cost
# build cost matrix
cost_matrix = []
this_ids = [[], []]
for gg in g:
# save current ids
this_ids[0].append(gg.track_id)
this_ids[1].append(-1)
gg.tracker = -1
gg.id_switch = 0
gg.fragmentation = 0
cost_row = []
for tt in t:
# overlap == 1 is cost ==0
c = 1 - self.boxoverlap(gg, tt)
# gating for boxoverlap
if c <= self.min_overlap:
cost_row.append(c)
else:
cost_row.append(max_cost) # = 1e9
cost_matrix.append(cost_row)
# all ground truth trajectories are initially not associated
# extend groundtruth trajectories lists (merge lists)
seq_trajectories[gg.track_id].append(-1)
seq_ignored[gg.track_id].append(False)
                if len(g) == 0:
cost_matrix = [[]]
# associate
association_matrix = hm.compute(cost_matrix)
# tmp variables for sanity checks and MODP computation
tmptp = 0
tmpfp = 0
tmpfn = 0
tmpc = 0 # this will sum up the overlaps for all true positives
tmpcs = [0] * len(
g) # this will save the overlaps for all true positives
# the reason is that some true positives might be ignored
                # later such that the corresponding overlaps can
# be subtracted from tmpc for MODP computation
# mapping for tracker ids and ground truth ids
for row, col in association_matrix:
# apply gating on boxoverlap
c = cost_matrix[row][col]
if c < max_cost:
g[row].tracker = t[col].track_id
this_ids[1][row] = t[col].track_id
t[col].valid = True
g[row].distance = c
self.total_cost += 1 - c
tmpc += 1 - c
tmpcs[row] = 1 - c
seq_trajectories[g[row].track_id][-1] = t[col].track_id
# true positives are only valid associations
self.tp += 1
tmptp += 1
else:
g[row].tracker = -1
self.fn += 1
tmpfn += 1
# associate tracker and DontCare areas
# ignore tracker in neighboring classes
nignoredtracker = 0 # number of ignored tracker detections
ignoredtrackers = dict() # will associate the track_id with -1
# if it is not ignored and 1 if it is
# ignored;
# this is used to avoid double counting ignored
# cases, see the next loop
for tt in t:
ignoredtrackers[tt.track_id] = -1
# ignore detection if it belongs to a neighboring class or is
# smaller or equal to the minimum height
tt_height = abs(tt.y1 - tt.y2)
if ((self.cls == "car" and tt.obj_type == "van") or
(self.cls == "pedestrian" and
tt.obj_type == "person_sitting") or
tt_height <= self.min_height) and not tt.valid:
nignoredtracker += 1
tt.ignored = True
ignoredtrackers[tt.track_id] = 1
continue
for d in dc:
overlap = self.boxoverlap(tt, d, "a")
if overlap > 0.5 and not tt.valid:
tt.ignored = True
nignoredtracker += 1
ignoredtrackers[tt.track_id] = 1
break
# check for ignored FN/TP (truncation or neighboring object class)
ignoredfn = 0 # the number of ignored false negatives
nignoredtp = 0 # the number of ignored true positives
nignoredpairs = 0 # the number of ignored pairs, i.e. a true positive
# which is ignored but where the associated tracker
# detection has already been ignored
gi = 0
for gg in g:
if gg.tracker < 0:
if gg.occlusion>self.max_occlusion or gg.truncation>self.max_truncation\
or (self.cls=="car" and gg.obj_type=="van") or (self.cls=="pedestrian" and gg.obj_type=="person_sitting"):
seq_ignored[gg.track_id][-1] = True
gg.ignored = True
ignoredfn += 1
elif gg.tracker >= 0:
if gg.occlusion>self.max_occlusion or gg.truncation>self.max_truncation\
or (self.cls=="car" and gg.obj_type=="van") or (self.cls=="pedestrian" and gg.obj_type=="person_sitting"):
seq_ignored[gg.track_id][-1] = True
gg.ignored = True
nignoredtp += 1
# if the associated tracker detection is already ignored,
# we want to avoid double counting ignored detections
if ignoredtrackers[gg.tracker] > 0:
nignoredpairs += 1
# for computing MODP, the overlaps from ignored detections
# are subtracted
tmpc -= tmpcs[gi]
gi += 1
                # the below might be confusing; check the comments in __init__
# to see what the individual statistics represent
# correct TP by number of ignored TP due to truncation
# ignored TP are shown as tracked in visualization
tmptp -= nignoredtp
# count the number of ignored true positives
self.itp += nignoredtp
# adjust the number of ground truth objects considered
self.n_gt -= (ignoredfn + nignoredtp)
# count the number of ignored ground truth objects
self.n_igt += ignoredfn + nignoredtp
# count the number of ignored tracker objects
self.n_itr += nignoredtracker
# count the number of ignored pairs, i.e. associated tracker and
# ground truth objects that are both ignored
self.n_igttr += nignoredpairs
                # false negatives = associated gt bboxes exceeding association threshold + non-associated gt bboxes
tmpfn += len(g) - len(association_matrix) - ignoredfn
self.fn += len(g) - len(association_matrix) - ignoredfn
self.ifn += ignoredfn
# false positives = tracker bboxes - associated tracker bboxes
# mismatches (mme_t)
tmpfp += len(
t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs
self.fp += len(
t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs
# update sequence data
seqtp += tmptp
seqitp += nignoredtp
seqfp += tmpfp
seqfn += tmpfn
seqifn += ignoredfn
seqigt += ignoredfn + nignoredtp
seqitr += nignoredtracker
# sanity checks
                # - the number of true positives minus ignored true positives
# should be greater or equal to 0
# - the number of false negatives should be greater or equal to 0
# - the number of false positives needs to be greater or equal to 0
# otherwise ignored detections might be counted double
# - the number of counted true positives (plus ignored ones)
# and the number of counted false negatives (plus ignored ones)
# should match the total number of ground truth objects
# - the number of counted true positives (plus ignored ones)
# and the number of counted false positives
# plus the number of ignored tracker detections should
# match the total number of tracker detections; note that
# nignoredpairs is subtracted here to avoid double counting
                # of ignored detections in nignoredtp and nignoredtracker
if tmptp < 0:
print(tmptp, nignoredtp)
raise NameError("Something went wrong! TP is negative")
if tmpfn < 0:
print(tmpfn,
len(g),
len(association_matrix), ignoredfn, nignoredpairs)
raise NameError("Something went wrong! FN is negative")
if tmpfp < 0:
print(tmpfp,
len(t), tmptp, nignoredtracker, nignoredtp,
nignoredpairs)
raise NameError("Something went wrong! FP is negative")
                if tmptp + tmpfn != len(g) - ignoredfn - nignoredtp:
print("seqidx", seq_idx)
print("frame ", f)
print("TP ", tmptp)
print("FN ", tmpfn)
print("FP ", tmpfp)
print("nGT ", len(g))
print("nAss ", len(association_matrix))
print("ign GT", ignoredfn)
print("ign TP", nignoredtp)
raise NameError(
"Something went wrong! nGroundtruth is not TP+FN")
                if tmptp + tmpfp + nignoredtp + nignoredtracker - nignoredpairs != len(t):
print(seq_idx, f, len(t), tmptp, tmpfp)
print(len(association_matrix), association_matrix)
raise NameError(
"Something went wrong! nTracker is not TP+FP")
# check for id switches or fragmentations
for i, tt in enumerate(this_ids[0]):
if tt in last_ids[0]:
idx = last_ids[0].index(tt)
tid = this_ids[1][i]
lid = last_ids[1][idx]
if tid != lid and lid != -1 and tid != -1:
if g[i].truncation < self.max_truncation:
g[i].id_switch = 1
ids += 1
if tid != lid and lid != -1:
if g[i].truncation < self.max_truncation:
g[i].fragmentation = 1
fr += 1
# save current index
last_ids = this_ids
# compute MOTP_t
MODP_t = 1
if tmptp != 0:
MODP_t = tmpc / float(tmptp)
self.MODP_t.append(MODP_t)
# remove empty lists for current gt trajectories
self.gt_trajectories[seq_idx] = seq_trajectories
self.ign_trajectories[seq_idx] = seq_ignored
# gather statistics for "per sequence" statistics.
self.n_gts.append(n_gts)
self.n_trs.append(n_trs)
self.tps.append(seqtp)
self.itps.append(seqitp)
self.fps.append(seqfp)
self.fns.append(seqfn)
self.ifns.append(seqifn)
self.n_igts.append(seqigt)
self.n_itrs.append(seqitr)
# compute MT/PT/ML, fragments, idswitches for all groundtruth trajectories
n_ignored_tr_total = 0
for seq_idx, (
seq_trajectories, seq_ignored
) in enumerate(zip(self.gt_trajectories, self.ign_trajectories)):
if len(seq_trajectories) == 0:
continue
tmpMT, tmpML, tmpPT, tmpId_switches, tmpFragments = [0] * 5
n_ignored_tr = 0
for g, ign_g in zip(seq_trajectories.values(),
seq_ignored.values()):
# all frames of this gt trajectory are ignored
if all(ign_g):
n_ignored_tr += 1
n_ignored_tr_total += 1
continue
# all frames of this gt trajectory are not assigned to any detections
if all([this == -1 for this in g]):
tmpML += 1
self.ML += 1
continue
# compute tracked frames in trajectory
last_id = g[0]
# first detection (necessary to be in gt_trajectories) is always tracked
tracked = 1 if g[0] >= 0 else 0
lgt = 0 if ign_g[0] else 1
for f in range(1, len(g)):
if ign_g[f]:
last_id = -1
continue
lgt += 1
if last_id != g[f] and last_id != -1 and g[f] != -1 and g[
f - 1] != -1:
tmpId_switches += 1
self.id_switches += 1
if f < len(g) - 1 and g[f - 1] != g[
f] and last_id != -1 and g[f] != -1 and g[f +
1] != -1:
tmpFragments += 1
self.fragments += 1
if g[f] != -1:
tracked += 1
last_id = g[f]
# handle last frame; tracked state is handled in for loop (g[f]!=-1)
if len(g) > 1 and g[f - 1] != g[f] and last_id != -1 and g[
f] != -1 and not ign_g[f]:
tmpFragments += 1
self.fragments += 1
# compute MT/PT/ML
tracking_ratio = tracked / float(len(g) - sum(ign_g))
if tracking_ratio > 0.8:
tmpMT += 1
self.MT += 1
elif tracking_ratio < 0.2:
tmpML += 1
self.ML += 1
else: # 0.2 <= tracking_ratio <= 0.8
tmpPT += 1
self.PT += 1
if (self.n_gt_trajectories - n_ignored_tr_total) == 0:
self.MT = 0.
self.PT = 0.
self.ML = 0.
else:
self.MT /= float(self.n_gt_trajectories - n_ignored_tr_total)
self.PT /= float(self.n_gt_trajectories - n_ignored_tr_total)
self.ML /= float(self.n_gt_trajectories - n_ignored_tr_total)
# precision/recall etc.
if (self.fp + self.tp) == 0 or (self.tp + self.fn) == 0:
self.recall = 0.
self.precision = 0.
else:
self.recall = self.tp / float(self.tp + self.fn)
self.precision = self.tp / float(self.fp + self.tp)
if (self.recall + self.precision) == 0:
self.F1 = 0.
else:
self.F1 = 2. * (self.precision * self.recall) / (
self.precision + self.recall)
if sum(self.n_frames) == 0:
self.FAR = "n/a"
else:
self.FAR = self.fp / float(sum(self.n_frames))
# compute CLEARMOT
if self.n_gt == 0:
self.MOTA = -float("inf")
self.MODA = -float("inf")
else:
self.MOTA = 1 - (self.fn + self.fp + self.id_switches
) / float(self.n_gt)
self.MODA = 1 - (self.fn + self.fp) / float(self.n_gt)
if self.tp == 0:
self.MOTP = float("inf")
else:
self.MOTP = self.total_cost / float(self.tp)
if self.n_gt != 0:
if self.id_switches == 0:
self.MOTAL = 1 - (self.fn + self.fp + self.id_switches
) / float(self.n_gt)
else:
self.MOTAL = 1 - (self.fn + self.fp +
math.log10(self.id_switches)
) / float(self.n_gt)
else:
self.MOTAL = -float("inf")
if sum(self.n_frames) == 0:
self.MODP = "n/a"
else:
self.MODP = sum(self.MODP_t) / float(sum(self.n_frames))
return True
def createSummary(self):
summary = ""
summary += "tracking evaluation summary".center(80, "=") + "\n"
summary += self.printEntry("Multiple Object Tracking Accuracy (MOTA)",
self.MOTA) + "\n"
summary += self.printEntry("Multiple Object Tracking Precision (MOTP)",
self.MOTP) + "\n"
summary += self.printEntry("Multiple Object Tracking Accuracy (MOTAL)",
self.MOTAL) + "\n"
summary += self.printEntry("Multiple Object Detection Accuracy (MODA)",
self.MODA) + "\n"
summary += self.printEntry("Multiple Object Detection Precision (MODP)",
self.MODP) + "\n"
summary += "\n"
summary += self.printEntry("Recall", self.recall) + "\n"
summary += self.printEntry("Precision", self.precision) + "\n"
summary += self.printEntry("F1", self.F1) + "\n"
summary += self.printEntry("False Alarm Rate", self.FAR) + "\n"
summary += "\n"
summary += self.printEntry("Mostly Tracked", self.MT) + "\n"
summary += self.printEntry("Partly Tracked", self.PT) + "\n"
summary += self.printEntry("Mostly Lost", self.ML) + "\n"
summary += "\n"
summary += self.printEntry("True Positives", self.tp) + "\n"
#summary += self.printEntry("True Positives per Sequence", self.tps) + "\n"
summary += self.printEntry("Ignored True Positives", self.itp) + "\n"
#summary += self.printEntry("Ignored True Positives per Sequence", self.itps) + "\n"
summary += self.printEntry("False Positives", self.fp) + "\n"
#summary += self.printEntry("False Positives per Sequence", self.fps) + "\n"
summary += self.printEntry("False Negatives", self.fn) + "\n"
#summary += self.printEntry("False Negatives per Sequence", self.fns) + "\n"
summary += self.printEntry("ID-switches", self.id_switches) + "\n"
self.fp = self.fp / self.n_gt
self.fn = self.fn / self.n_gt
self.id_switches = self.id_switches / self.n_gt
summary += self.printEntry("False Positives Ratio", self.fp) + "\n"
#summary += self.printEntry("False Positives per Sequence", self.fps) + "\n"
summary += self.printEntry("False Negatives Ratio", self.fn) + "\n"
#summary += self.printEntry("False Negatives per Sequence", self.fns) + "\n"
summary += self.printEntry("Ignored False Negatives Ratio",
self.ifn) + "\n"
#summary += self.printEntry("Ignored False Negatives per Sequence", self.ifns) + "\n"
summary += self.printEntry("Missed Targets", self.fn) + "\n"
summary += self.printEntry("ID-switches", self.id_switches) + "\n"
summary += self.printEntry("Fragmentations", self.fragments) + "\n"
summary += "\n"
summary += self.printEntry("Ground Truth Objects (Total)", self.n_gt +
self.n_igt) + "\n"
#summary += self.printEntry("Ground Truth Objects (Total) per Sequence", self.n_gts) + "\n"
summary += self.printEntry("Ignored Ground Truth Objects",
self.n_igt) + "\n"
#summary += self.printEntry("Ignored Ground Truth Objects per Sequence", self.n_igts) + "\n"
summary += self.printEntry("Ground Truth Trajectories",
self.n_gt_trajectories) + "\n"
summary += "\n"
summary += self.printEntry("Tracker Objects (Total)", self.n_tr) + "\n"
#summary += self.printEntry("Tracker Objects (Total) per Sequence", self.n_trs) + "\n"
summary += self.printEntry("Ignored Tracker Objects", self.n_itr) + "\n"
#summary += self.printEntry("Ignored Tracker Objects per Sequence", self.n_itrs) + "\n"
summary += self.printEntry("Tracker Trajectories",
self.n_tr_trajectories) + "\n"
#summary += "\n"
#summary += self.printEntry("Ignored Tracker Objects with Associated Ignored Ground Truth Objects", self.n_igttr) + "\n"
summary += "=" * 80
return summary
def printEntry(self, key, val, width=(70, 10)):
"""
Pretty print an entry in a table fashion.
"""
s_out = key.ljust(width[0])
if type(val) == int:
s = "%%%dd" % width[1]
s_out += s % val
elif type(val) == float:
s = "%%%df" % (width[1])
s_out += s % val
else:
s_out += ("%s" % val).rjust(width[1])
return s_out
def saveToStats(self, save_summary):
"""
Save the statistics in a whitespace separate file.
"""
summary = self.createSummary()
if save_summary:
filename = os.path.join(self.result_path,
"summary_%s.txt" % self.cls)
dump = open(filename, "w+")
dump.write(summary)
dump.close()
return summary
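# Worked example (made-up numbers) of the CLEAR MOT arithmetic implemented in
# compute3rdPartyMetrics() above: with fn=10, fp=5, id_switches=2 over
# n_gt=100 ground-truth objects, MOTA = 1 - (10 + 5 + 2) / 100 = 0.83, and
# with total_cost=36.0 accumulated over tp=45 true positives,
# MOTP = 36.0 / 45 = 0.8.
def _clear_mot_sketch(fn=10, fp=5, id_switches=2, n_gt=100, total_cost=36.0,
                      tp=45):
    mota = 1 - (fn + fp + id_switches) / float(n_gt)
    motp = total_cost / float(tp)
    return mota, motp  # (0.83, 0.8)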
class KITTIMOTMetric(Metric):
def __init__(self, save_summary=True):
self.save_summary = save_summary
self.MOTEvaluator = KITTIEvaluation
self.result_root = None
self.reset()
def reset(self):
self.seqs = []
self.n_sequences = 0
self.n_frames = []
self.strsummary = ''
def update(self, data_root, seq, data_type, result_root, result_filename):
assert data_type == 'kitti', "data_type should 'kitti'"
self.result_root = result_root
self.gt_path = data_root
gt_path = '{}/../labels/{}.txt'.format(data_root, seq)
gt = open(gt_path, "r")
max_frame = 0
for line in gt:
line = line.strip()
line_list = line.split(" ")
if int(line_list[0]) > max_frame:
max_frame = int(line_list[0])
rs = open(result_filename, "r")
for line in rs:
line = line.strip()
line_list = line.split(" ")
if int(line_list[0]) > max_frame:
max_frame = int(line_list[0])
gt.close()
rs.close()
self.n_frames.append(max_frame + 1)
self.seqs.append(seq)
self.n_sequences += 1
def accumulate(self):
logger.info("Processing Result for KITTI Tracking Benchmark")
e = self.MOTEvaluator(result_path=self.result_root, gt_path=self.gt_path,\
n_frames=self.n_frames, seqs=self.seqs, n_sequences=self.n_sequences)
try:
if not e.loadTracker():
return
logger.info("Loading Results - Success")
logger.info("Evaluate Object Class: %s" % c.upper())
except:
logger.info("Caught exception while loading result data.")
if not e.loadGroundtruth():
raise ValueError("Ground truth not found.")
logger.info("Loading Groundtruth - Success")
# sanity checks
        if len(e.groundtruth) != len(e.tracker):
logger.info(
"The uploaded data does not provide results for every sequence.")
return False
logger.info("Loaded %d Sequences." % len(e.groundtruth))
logger.info("Start Evaluation...")
if e.compute3rdPartyMetrics():
self.strsummary = e.saveToStats(self.save_summary)
else:
logger.info(
"There seem to be no true positives or false positives at all in the submitted data."
)
def log(self):
print(self.strsummary)
def get_results(self):
return self.strsummary
================================================
FILE: ppdet/metrics/munkres.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is borrowed from https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/munkres.py
"""
import copy
import sys
__all__ = ['Munkres', 'make_cost_matrix']
class Munkres:
"""
Calculate the Munkres solution to the classical assignment problem.
See the module documentation for usage.
"""
def __init__(self):
"""Create a new instance"""
self.C = None
self.row_covered = []
self.col_covered = []
self.n = 0
self.Z0_r = 0
self.Z0_c = 0
self.marked = None
self.path = None
def make_cost_matrix(profit_matrix, inversion_function):
"""
**DEPRECATED**
Please use the module function ``make_cost_matrix()``.
"""
import munkres
return munkres.make_cost_matrix(profit_matrix, inversion_function)
make_cost_matrix = staticmethod(make_cost_matrix)
def pad_matrix(self, matrix, pad_value=0):
"""
Pad a possibly non-square matrix to make it square.
:Parameters:
matrix : list of lists
matrix to pad
pad_value : int
value to use to pad the matrix
:rtype: list of lists
:return: a new, possibly padded, matrix
"""
max_columns = 0
total_rows = len(matrix)
for row in matrix:
max_columns = max(max_columns, len(row))
total_rows = max(max_columns, total_rows)
new_matrix = []
for row in matrix:
row_len = len(row)
new_row = row[:]
if total_rows > row_len:
# Row too short. Pad it.
new_row += [0] * (total_rows - row_len)
new_matrix += [new_row]
while len(new_matrix) < total_rows:
new_matrix += [[0] * total_rows]
return new_matrix
def compute(self, cost_matrix):
"""
Compute the indexes for the lowest-cost pairings between rows and
columns in the database. Returns a list of (row, column) tuples
that can be used to traverse the matrix.
:Parameters:
cost_matrix : list of lists
The cost matrix. If this cost matrix is not square, it
will be padded with zeros, via a call to ``pad_matrix()``.
(This method does *not* modify the caller's matrix. It
operates on a copy of the matrix.)
**WARNING**: This code handles square and rectangular
matrices. It does *not* handle irregular matrices.
:rtype: list
:return: A list of ``(row, column)`` tuples that describe the lowest
cost path through the matrix
"""
self.C = self.pad_matrix(cost_matrix)
self.n = len(self.C)
self.original_length = len(cost_matrix)
self.original_width = len(cost_matrix[0])
self.row_covered = [False for i in range(self.n)]
self.col_covered = [False for i in range(self.n)]
self.Z0_r = 0
self.Z0_c = 0
self.path = self.__make_matrix(self.n * 2, 0)
self.marked = self.__make_matrix(self.n, 0)
done = False
step = 1
steps = {
1: self.__step1,
2: self.__step2,
3: self.__step3,
4: self.__step4,
5: self.__step5,
6: self.__step6
}
while not done:
try:
func = steps[step]
step = func()
except KeyError:
done = True
# Look for the starred columns
results = []
for i in range(self.original_length):
for j in range(self.original_width):
if self.marked[i][j] == 1:
results += [(i, j)]
return results
def __copy_matrix(self, matrix):
"""Return an exact copy of the supplied matrix"""
return copy.deepcopy(matrix)
def __make_matrix(self, n, val):
"""Create an *n*x*n* matrix, populating it with the specific value."""
matrix = []
for i in range(n):
matrix += [[val for j in range(n)]]
return matrix
def __step1(self):
"""
For each row of the matrix, find the smallest element and
subtract it from every element in its row. Go to Step 2.
"""
C = self.C
n = self.n
for i in range(n):
minval = min(self.C[i])
# Find the minimum value for this row and subtract that minimum
# from every element in the row.
for j in range(n):
self.C[i][j] -= minval
return 2
def __step2(self):
"""
Find a zero (Z) in the resulting matrix. If there is no starred
zero in its row or column, star Z. Repeat for each element in the
matrix. Go to Step 3.
"""
n = self.n
for i in range(n):
for j in range(n):
if (self.C[i][j] == 0) and \
(not self.col_covered[j]) and \
(not self.row_covered[i]):
self.marked[i][j] = 1
self.col_covered[j] = True
self.row_covered[i] = True
self.__clear_covers()
return 3
def __step3(self):
"""
Cover each column containing a starred zero. If K columns are
covered, the starred zeros describe a complete set of unique
assignments. In this case, Go to DONE, otherwise, Go to Step 4.
"""
n = self.n
count = 0
for i in range(n):
for j in range(n):
if self.marked[i][j] == 1:
self.col_covered[j] = True
count += 1
if count >= n:
step = 7 # done
else:
step = 4
return step
def __step4(self):
"""
Find a noncovered zero and prime it. If there is no starred zero
in the row containing this primed zero, Go to Step 5. Otherwise,
cover this row and uncover the column containing the starred
zero. Continue in this manner until there are no uncovered zeros
left. Save the smallest uncovered value and Go to Step 6.
"""
step = 0
done = False
row = -1
col = -1
star_col = -1
while not done:
(row, col) = self.__find_a_zero()
if row < 0:
done = True
step = 6
else:
self.marked[row][col] = 2
star_col = self.__find_star_in_row(row)
if star_col >= 0:
col = star_col
self.row_covered[row] = True
self.col_covered[col] = False
else:
done = True
self.Z0_r = row
self.Z0_c = col
step = 5
return step
def __step5(self):
"""
Construct a series of alternating primed and starred zeros as
follows. Let Z0 represent the uncovered primed zero found in Step 4.
Let Z1 denote the starred zero in the column of Z0 (if any).
Let Z2 denote the primed zero in the row of Z1 (there will always
be one). Continue until the series terminates at a primed zero
that has no starred zero in its column. Unstar each starred zero
of the series, star each primed zero of the series, erase all
primes and uncover every line in the matrix. Return to Step 3
"""
count = 0
path = self.path
path[count][0] = self.Z0_r
path[count][1] = self.Z0_c
done = False
while not done:
row = self.__find_star_in_col(path[count][1])
if row >= 0:
count += 1
path[count][0] = row
path[count][1] = path[count - 1][1]
else:
done = True
if not done:
col = self.__find_prime_in_row(path[count][0])
count += 1
path[count][0] = path[count - 1][0]
path[count][1] = col
self.__convert_path(path, count)
self.__clear_covers()
self.__erase_primes()
return 3
def __step6(self):
"""
Add the value found in Step 4 to every element of each covered
row, and subtract it from every element of each uncovered column.
Return to Step 4 without altering any stars, primes, or covered
lines.
"""
minval = self.__find_smallest()
for i in range(self.n):
for j in range(self.n):
if self.row_covered[i]:
self.C[i][j] += minval
if not self.col_covered[j]:
self.C[i][j] -= minval
return 4
def __find_smallest(self):
"""Find the smallest uncovered value in the matrix."""
minval = 2e9 # sys.maxint
for i in range(self.n):
for j in range(self.n):
if (not self.row_covered[i]) and (not self.col_covered[j]):
if minval > self.C[i][j]:
minval = self.C[i][j]
return minval
def __find_a_zero(self):
"""Find the first uncovered element with value 0"""
row = -1
col = -1
i = 0
n = self.n
done = False
while not done:
j = 0
while True:
if (self.C[i][j] == 0) and \
(not self.row_covered[i]) and \
(not self.col_covered[j]):
row = i
col = j
done = True
j += 1
if j >= n:
break
i += 1
if i >= n:
done = True
return (row, col)
def __find_star_in_row(self, row):
"""
Find the first starred element in the specified row. Returns
the column index, or -1 if no starred element was found.
"""
col = -1
for j in range(self.n):
if self.marked[row][j] == 1:
col = j
break
return col
def __find_star_in_col(self, col):
"""
        Find the first starred element in the specified column. Returns
the row index, or -1 if no starred element was found.
"""
row = -1
for i in range(self.n):
if self.marked[i][col] == 1:
row = i
break
return row
def __find_prime_in_row(self, row):
"""
Find the first prime element in the specified row. Returns
        the column index, or -1 if no primed element was found.
"""
col = -1
for j in range(self.n):
if self.marked[row][j] == 2:
col = j
break
return col
def __convert_path(self, path, count):
for i in range(count + 1):
if self.marked[path[i][0]][path[i][1]] == 1:
self.marked[path[i][0]][path[i][1]] = 0
else:
self.marked[path[i][0]][path[i][1]] = 1
def __clear_covers(self):
"""Clear all covered matrix cells"""
for i in range(self.n):
self.row_covered[i] = False
self.col_covered[i] = False
def __erase_primes(self):
"""Erase all prime markings"""
for i in range(self.n):
for j in range(self.n):
if self.marked[i][j] == 2:
self.marked[i][j] = 0
def make_cost_matrix(profit_matrix, inversion_function):
"""
Create a cost matrix from a profit matrix by calling
'inversion_function' to invert each value. The inversion
function must take one numeric argument (of any type) and return
another numeric argument which is presumed to be the cost inverse
of the original profit.
This is a static method. Call it like this:
.. python::
cost_matrix = Munkres.make_cost_matrix(matrix, inversion_func)
For example:
.. python::
        cost_matrix = Munkres.make_cost_matrix(matrix, lambda x : sys.maxsize - x)
:Parameters:
profit_matrix : list of lists
The matrix to convert from a profit to a cost matrix
inversion_function : function
The function to use to invert each entry in the profit matrix
:rtype: list of lists
:return: The converted matrix
"""
cost_matrix = []
for row in profit_matrix:
cost_matrix.append([inversion_function(value) for value in row])
return cost_matrix
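# Illustrative example of the solver above: Munkres().compute() returns the
# (row, col) pairs of a minimum-cost one-to-one assignment.
def _munkres_sketch():
    cost = [[4, 1, 3],
            [2, 0, 5],
            [3, 2, 2]]
    indexes = Munkres().compute(cost)  # e.g. [(0, 1), (1, 0), (2, 2)]
    total = sum(cost[row][col] for row, col in indexes)
    return indexes, total  # minimum total cost is 5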
================================================
FILE: ppdet/metrics/pose3d_metrics.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.distributed import ParallelEnv
import os
import json
from collections import defaultdict, OrderedDict
import numpy as np
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
__all__ = ['Pose3DEval']
class AverageMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def mean_per_joint_position_error(pred, gt, has_3d_joints):
"""
Compute mPJPE
"""
gt = gt[has_3d_joints == 1]
gt = gt[:, :, :3]
pred = pred[has_3d_joints == 1]
with paddle.no_grad():
gt_pelvis = (gt[:, 2, :] + gt[:, 3, :]) / 2
gt = gt - gt_pelvis[:, None, :]
pred_pelvis = (pred[:, 2, :] + pred[:, 3, :]) / 2
pred = pred - pred_pelvis[:, None, :]
error = paddle.sqrt(((pred - gt)**2).sum(axis=-1)).mean(axis=-1).numpy()
return error
def compute_similarity_transform(S1, S2):
"""Computes a similarity transform (sR, t) that takes
a set of 3D points S1 (3 x N) closest to a set of 3D points S2,
    where R is a 3x3 rotation matrix, t a 3x1 translation, s a scale.
    i.e. solves the orthogonal Procrustes problem.
"""
transposed = False
if S1.shape[0] != 3 and S1.shape[0] != 2:
S1 = S1.T
S2 = S2.T
transposed = True
assert (S2.shape[1] == S1.shape[1])
# 1. Remove mean.
mu1 = S1.mean(axis=1, keepdims=True)
mu2 = S2.mean(axis=1, keepdims=True)
X1 = S1 - mu1
X2 = S2 - mu2
# 2. Compute variance of X1 used for scale.
var1 = np.sum(X1**2)
# 3. The outer product of X1 and X2.
K = X1.dot(X2.T)
# 4. Solution that Maximizes trace(R'K) is R=U*V', where U, V are
# singular vectors of K.
U, s, Vh = np.linalg.svd(K)
V = Vh.T
# Construct Z that fixes the orientation of R to get det(R)=1.
Z = np.eye(U.shape[0])
Z[-1, -1] *= np.sign(np.linalg.det(U.dot(V.T)))
# Construct R.
R = V.dot(Z.dot(U.T))
# 5. Recover scale.
scale = np.trace(R.dot(K)) / var1
# 6. Recover translation.
t = mu2 - scale * (R.dot(mu1))
# 7. Error:
S1_hat = scale * R.dot(S1) + t
if transposed:
S1_hat = S1_hat.T
return S1_hat
def compute_similarity_transform_batch(S1, S2):
"""Batched version of compute_similarity_transform."""
S1_hat = np.zeros_like(S1)
for i in range(S1.shape[0]):
S1_hat[i] = compute_similarity_transform(S1[i], S2[i])
return S1_hat
def reconstruction_error(S1, S2, reduction='mean'):
"""Do Procrustes alignment and compute reconstruction error."""
S1_hat = compute_similarity_transform_batch(S1, S2)
re = np.sqrt(((S1_hat - S2)**2).sum(axis=-1)).mean(axis=-1)
if reduction == 'mean':
re = re.mean()
elif reduction == 'sum':
re = re.sum()
return re
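# Hypothetical sanity check for the Procrustes utilities above: if S2 is an
# exact rotated, scaled, and translated copy of S1, the post-alignment
# reconstruction error should be ~0.
def _procrustes_sketch():
    rng = np.random.RandomState(0)
    S1 = rng.rand(2, 14, 3)  # a batch of two 14-joint skeletons
    theta = np.pi / 6
    R = np.array([[np.cos(theta), -np.sin(theta), 0.],
                  [np.sin(theta), np.cos(theta), 0.],
                  [0., 0., 1.]])
    S2 = 1.5 * S1.dot(R.T) + np.array([0.1, -0.2, 0.3])
    return reconstruction_error(S1, S2)  # ~0.0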
def all_gather(data):
if paddle.distributed.get_world_size() == 1:
return data
vlist = []
paddle.distributed.all_gather(vlist, data)
data = paddle.concat(vlist, 0)
return data
class Pose3DEval(object):
def __init__(self, output_eval, save_prediction_only=False):
super(Pose3DEval, self).__init__()
self.output_eval = output_eval
self.res_file = os.path.join(output_eval, "pose3d_results.json")
self.save_prediction_only = save_prediction_only
self.reset()
def reset(self):
self.PAmPJPE = AverageMeter()
self.mPJPE = AverageMeter()
self.eval_results = {}
def get_human36m_joints(self, input):
J24_TO_J14 = paddle.to_tensor(
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18])
J24_TO_J17 = paddle.to_tensor(
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19])
return paddle.index_select(input, J24_TO_J14, axis=1)
def update(self, inputs, outputs):
gt_3d_joints = all_gather(inputs['joints_3d'].cuda(ParallelEnv()
.local_rank))
has_3d_joints = all_gather(inputs['has_3d_joints'].cuda(ParallelEnv()
.local_rank))
pred_3d_joints = all_gather(outputs['pose3d'])
if gt_3d_joints.shape[1] == 24:
gt_3d_joints = self.get_human36m_joints(gt_3d_joints)
if pred_3d_joints.shape[1] == 24:
pred_3d_joints = self.get_human36m_joints(pred_3d_joints)
mPJPE_val = mean_per_joint_position_error(pred_3d_joints, gt_3d_joints,
has_3d_joints).mean()
PAmPJPE_val = reconstruction_error(
pred_3d_joints.numpy(),
gt_3d_joints[:, :, :3].numpy(),
reduction=None).mean()
count = int(np.sum(has_3d_joints.numpy()))
self.PAmPJPE.update(PAmPJPE_val * 1000., count)
self.mPJPE.update(mPJPE_val * 1000., count)
def accumulate(self):
if self.save_prediction_only:
            logger.info(f'The pose3d result is saved to {self.res_file} '
                        'and the model will not be evaluated.')
return
self.eval_results['pose3d'] = [-self.mPJPE.avg, -self.PAmPJPE.avg]
def log(self):
if self.save_prediction_only:
return
stats_names = ['mPJPE', 'PAmPJPE']
num_values = len(stats_names)
print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |')
print('|---' * (num_values + 1) + '|')
print(' '.join([
'| {:.3f}'.format(abs(value))
for value in self.eval_results['pose3d']
]) + ' |')
def get_results(self):
return self.eval_results
================================================
FILE: ppdet/metrics/widerface_utils.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import cv2
import numpy as np
from collections import OrderedDict
import paddle
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
__all__ = ['face_eval_run', 'lmk2out']
def face_eval_run(model,
image_dir,
gt_file,
pred_dir='output/pred',
eval_mode='widerface',
multi_scale=False):
# load ground truth files
with open(gt_file, 'r') as f:
gt_lines = f.readlines()
imid2path = []
pos_gt = 0
while pos_gt < len(gt_lines):
name_gt = gt_lines[pos_gt].strip('\n\t').split()[0]
imid2path.append(name_gt)
pos_gt += 1
n_gt = int(gt_lines[pos_gt].strip('\n\t').split()[0])
pos_gt += 1 + n_gt
    logger.info('The ground truth file contains {} images'.format(len(imid2path)))
dets_dist = OrderedDict()
for iter_id, im_path in enumerate(imid2path):
image_path = os.path.join(image_dir, im_path)
if eval_mode == 'fddb':
image_path += '.jpg'
assert os.path.exists(image_path)
image = cv2.imread(image_path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
if multi_scale:
shrink, max_shrink = get_shrink(image.shape[0], image.shape[1])
det0 = detect_face(model, image, shrink)
det1 = flip_test(model, image, shrink)
[det2, det3] = multi_scale_test(model, image, max_shrink)
det4 = multi_scale_test_pyramid(model, image, max_shrink)
det = np.row_stack((det0, det1, det2, det3, det4))
dets = bbox_vote(det)
else:
dets = detect_face(model, image, 1)
if eval_mode == 'widerface':
save_widerface_bboxes(image_path, dets, pred_dir)
else:
dets_dist[im_path] = dets
if iter_id % 100 == 0:
logger.info('Test iter {}'.format(iter_id))
if eval_mode == 'fddb':
save_fddb_bboxes(dets_dist, pred_dir)
logger.info("Finish evaluation.")
def detect_face(model, image, shrink):
image_shape = [image.shape[0], image.shape[1]]
if shrink != 1:
h, w = int(image_shape[0] * shrink), int(image_shape[1] * shrink)
image = cv2.resize(image, (w, h))
image_shape = [h, w]
img = face_img_process(image)
image_shape = np.asarray([image_shape])
scale_factor = np.asarray([[shrink, shrink]])
data = {
"image": paddle.to_tensor(
img, dtype='float32'),
"im_shape": paddle.to_tensor(
image_shape, dtype='float32'),
"scale_factor": paddle.to_tensor(
scale_factor, dtype='float32')
}
model.eval()
detection = model(data)
detection = detection['bbox'].numpy()
    # layout: xmin, ymin, xmax, ymax, score
if np.prod(detection.shape) == 1:
logger.info("No face detected")
return np.array([[0, 0, 0, 0, 0]])
det_conf = detection[:, 1]
det_xmin = detection[:, 2]
det_ymin = detection[:, 3]
det_xmax = detection[:, 4]
det_ymax = detection[:, 5]
det = np.column_stack((det_xmin, det_ymin, det_xmax, det_ymax, det_conf))
return det
def flip_test(model, image, shrink):
img = cv2.flip(image, 1)
det_f = detect_face(model, img, shrink)
det_t = np.zeros(det_f.shape)
img_width = image.shape[1]
det_t[:, 0] = img_width - det_f[:, 2]
det_t[:, 1] = det_f[:, 1]
det_t[:, 2] = img_width - det_f[:, 0]
det_t[:, 3] = det_f[:, 3]
det_t[:, 4] = det_f[:, 4]
return det_t
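# Hypothetical standalone sketch (illustration only, not part of the original
# module) of the coordinate mapping used by flip_test: boxes detected on the
# mirrored image are reflected back with x' = img_width - x, swapping
# xmin/xmax.
def _demo_flip_coords():
    det_f = np.array([[10., 5., 30., 25., 0.9]])  # box on the flipped image
    img_width = 100
    det_t = det_f.copy()
    det_t[:, 0] = img_width - det_f[:, 2]  # new xmin from flipped xmax
    det_t[:, 2] = img_width - det_f[:, 0]  # new xmax from flipped xmin
    assert det_t[0, 0] == 70. and det_t[0, 2] == 90.
    return det_t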
def multi_scale_test(model, image, max_shrink):
    # Shrinking the image is only used to detect big faces
    st = 0.5 if max_shrink >= 0.75 else 0.5 * max_shrink
det_s = detect_face(model, image, st)
index = np.where(
np.maximum(det_s[:, 2] - det_s[:, 0] + 1, det_s[:, 3] - det_s[:, 1] + 1)
> 30)[0]
det_s = det_s[index, :]
    # Enlarge the image once (up to 2x)
bt = min(2, max_shrink) if max_shrink > 1 else (st + max_shrink) / 2
det_b = detect_face(model, image, bt)
    # Enlarge the image progressively to detect small faces
if max_shrink > 2:
bt *= 2
while bt < max_shrink:
det_b = np.row_stack((det_b, detect_face(model, image, bt)))
bt *= 2
det_b = np.row_stack((det_b, detect_face(model, image, max_shrink)))
# Enlarged images are only used to detect small faces.
if bt > 1:
index = np.where(
np.minimum(det_b[:, 2] - det_b[:, 0] + 1,
det_b[:, 3] - det_b[:, 1] + 1) < 100)[0]
det_b = det_b[index, :]
    # Shrunk images are only used to detect big faces.
else:
index = np.where(
np.maximum(det_b[:, 2] - det_b[:, 0] + 1,
det_b[:, 3] - det_b[:, 1] + 1) > 30)[0]
det_b = det_b[index, :]
return det_s, det_b
def multi_scale_test_pyramid(model, image, max_shrink):
# Use image pyramids to detect faces
det_b = detect_face(model, image, 0.25)
index = np.where(
np.maximum(det_b[:, 2] - det_b[:, 0] + 1, det_b[:, 3] - det_b[:, 1] + 1)
> 30)[0]
det_b = det_b[index, :]
st = [0.75, 1.25, 1.5, 1.75]
for i in range(len(st)):
if st[i] <= max_shrink:
det_temp = detect_face(model, image, st[i])
# Enlarged images are only used to detect small faces.
if st[i] > 1:
index = np.where(
np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1,
det_temp[:, 3] - det_temp[:, 1] + 1) < 100)[0]
det_temp = det_temp[index, :]
            # Shrunk images are only used to detect big faces.
else:
index = np.where(
np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1,
det_temp[:, 3] - det_temp[:, 1] + 1) > 30)[0]
det_temp = det_temp[index, :]
det_b = np.row_stack((det_b, det_temp))
return det_b
def to_chw(image):
"""
Transpose image from HWC to CHW.
Args:
image (np.array): an image with HWC layout.
"""
# HWC to CHW
if len(image.shape) == 3:
image = np.swapaxes(image, 1, 2)
image = np.swapaxes(image, 1, 0)
return image
def face_img_process(image,
mean=[104., 117., 123.],
std=[127.502231, 127.502231, 127.502231]):
img = np.array(image)
img = to_chw(img)
img = img.astype('float32')
img -= np.array(mean)[:, np.newaxis, np.newaxis].astype('float32')
img /= np.array(std)[:, np.newaxis, np.newaxis].astype('float32')
img = [img]
img = np.array(img)
return img
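# Hypothetical shape check (illustration only, not part of the original
# module): face_img_process turns an HWC uint8 image into a normalized
# NCHW float32 batch of one.
def _demo_face_img_process():
    dummy = np.zeros((480, 640, 3), dtype=np.uint8)
    batch = face_img_process(dummy)
    assert batch.shape == (1, 3, 480, 640)
    assert batch.dtype == np.float32
    return batch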
def get_shrink(height, width):
"""
Args:
height (int): image height.
width (int): image width.
"""
# avoid out of memory
max_shrink_v1 = (0x7fffffff / 577.0 / (height * width))**0.5
max_shrink_v2 = ((678 * 1024 * 2.0 * 2.0) / (height * width))**0.5
    def get_round(x, loc):
        str_x = str(x)
        if '.' in str_x:
            str_before, str_after = str_x.split('.')
            len_after = len(str_after)
            if len_after >= 3:
                str_final = str_before + '.' + str_after[0:loc]
                return float(str_final)
            else:
                return x
        return x  # no decimal point: return the value unchanged
max_shrink = get_round(min(max_shrink_v1, max_shrink_v2), 2) - 0.3
if max_shrink >= 1.5 and max_shrink < 2:
max_shrink = max_shrink - 0.1
elif max_shrink >= 2 and max_shrink < 3:
max_shrink = max_shrink - 0.2
elif max_shrink >= 3 and max_shrink < 4:
max_shrink = max_shrink - 0.3
elif max_shrink >= 4 and max_shrink < 5:
max_shrink = max_shrink - 0.4
elif max_shrink >= 5:
max_shrink = max_shrink - 0.5
elif max_shrink <= 0.1:
max_shrink = 0.1
shrink = max_shrink if max_shrink < 1 else 1
return shrink, max_shrink
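# Hypothetical numeric sketch (illustration only, not part of the original
# module): for a 1080x1920 frame the memory bounds push the evaluation scale
# below 1, while a small 300x300 image is capped at full resolution.
def _demo_get_shrink():
    shrink, max_shrink = get_shrink(1080, 1920)
    assert shrink < 1 and shrink == max_shrink
    shrink_small, _ = get_shrink(300, 300)
    assert shrink_small == 1
    return shrink, max_shrink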
def bbox_vote(det):
order = det[:, 4].ravel().argsort()[::-1]
det = det[order, :]
    if det.shape[0] == 0:
        # no detections: emit a dummy low-score box so the output is not empty
        dets = np.array([[10, 10, 20, 20, 0.002]])
        det = np.empty(shape=[0, 5])
while det.shape[0] > 0:
# IOU
area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1)
xx1 = np.maximum(det[0, 0], det[:, 0])
yy1 = np.maximum(det[0, 1], det[:, 1])
xx2 = np.minimum(det[0, 2], det[:, 2])
yy2 = np.minimum(det[0, 3], det[:, 3])
w = np.maximum(0.0, xx2 - xx1 + 1)
h = np.maximum(0.0, yy2 - yy1 + 1)
inter = w * h
o = inter / (area[0] + area[:] - inter)
# nms
merge_index = np.where(o >= 0.3)[0]
det_accu = det[merge_index, :]
det = np.delete(det, merge_index, 0)
if merge_index.shape[0] <= 1:
if det.shape[0] == 0:
                try:
                    # 'dets' is undefined until the first cluster is kept
                    dets = np.row_stack((dets, det_accu))
                except NameError:
                    dets = det_accu
continue
det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4))
max_score = np.max(det_accu[:, 4])
det_accu_sum = np.zeros((1, 5))
det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4],
axis=0) / np.sum(det_accu[:, -1:])
det_accu_sum[:, 4] = max_score
        try:
            dets = np.row_stack((dets, det_accu_sum))
        except NameError:
            dets = det_accu_sum
dets = dets[0:750, :]
keep_index = np.where(dets[:, 4] >= 0.01)[0]
dets = dets[keep_index, :]
return dets
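# Hypothetical sketch (illustration only, not part of the original module):
# two heavily overlapping detections are merged by bbox_vote into one
# score-weighted box, while a disjoint detection survives as its own entry.
def _demo_bbox_vote():
    det = np.array([
        [10., 10., 50., 50., 0.9],
        [12., 12., 52., 52., 0.8],      # IoU > 0.3 with the first box
        [200., 200., 240., 240., 0.7],  # disjoint box, kept separately
    ])
    merged = bbox_vote(det)
    assert merged.shape == (2, 5)
    return merged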
def save_widerface_bboxes(image_path, bboxes_scores, output_dir):
image_name = image_path.split('/')[-1]
image_class = image_path.split('/')[-2]
odir = os.path.join(output_dir, image_class)
if not os.path.exists(odir):
os.makedirs(odir)
ofname = os.path.join(odir, '%s.txt' % (image_name[:-4]))
f = open(ofname, 'w')
f.write('{:s}\n'.format(image_class + '/' + image_name))
f.write('{:d}\n'.format(bboxes_scores.shape[0]))
for box_score in bboxes_scores:
xmin, ymin, xmax, ymax, score = box_score
f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'.format(xmin, ymin, (
xmax - xmin + 1), (ymax - ymin + 1), score))
f.close()
logger.info("The predicted result is saved as {}".format(ofname))
def save_fddb_bboxes(bboxes_scores,
output_dir,
output_fname='pred_fddb_res.txt'):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
predict_file = os.path.join(output_dir, output_fname)
f = open(predict_file, 'w')
    for image_path, dets in bboxes_scores.items():
f.write('{:s}\n'.format(image_path))
f.write('{:d}\n'.format(dets.shape[0]))
for box_score in dets:
xmin, ymin, xmax, ymax, score = box_score
width, height = xmax - xmin, ymax - ymin
f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'
.format(xmin, ymin, width, height, score))
logger.info("The predicted result is saved as {}".format(predict_file))
return predict_file
def lmk2out(results, is_bbox_normalized=False):
"""
Args:
results: request a dict, should include: `landmark`, `im_id`,
if is_bbox_normalized=True, also need `im_shape`.
is_bbox_normalized: whether or not landmark is normalized.
"""
xywh_res = []
for t in results:
bboxes = t['bbox'][0]
lengths = t['bbox'][1][0]
im_ids = np.array(t['im_id'][0]).flatten()
        if bboxes is None or bboxes.shape == (1, 1):  # None check must come first
continue
face_index = t['face_index'][0]
prior_box = t['prior_boxes'][0]
predict_lmk = t['landmark'][0]
prior = np.reshape(prior_box, (-1, 4))
predictlmk = np.reshape(predict_lmk, (-1, 10))
k = 0
for a in range(len(lengths)):
num = lengths[a]
im_id = int(im_ids[a])
for i in range(num):
score = bboxes[k][1]
theindex = face_index[i][0]
me_prior = prior[theindex, :]
lmk_pred = predictlmk[theindex, :]
prior_w = me_prior[2] - me_prior[0]
prior_h = me_prior[3] - me_prior[1]
prior_w_center = (me_prior[2] + me_prior[0]) / 2
prior_h_center = (me_prior[3] + me_prior[1]) / 2
lmk_decode = np.zeros((10))
for j in [0, 2, 4, 6, 8]:
lmk_decode[j] = lmk_pred[j] * 0.1 * prior_w + prior_w_center
for j in [1, 3, 5, 7, 9]:
lmk_decode[j] = lmk_pred[j] * 0.1 * prior_h + prior_h_center
im_shape = t['im_shape'][0][a].tolist()
image_h, image_w = int(im_shape[0]), int(im_shape[1])
if is_bbox_normalized:
lmk_decode = lmk_decode * np.array([
image_w, image_h, image_w, image_h, image_w, image_h,
image_w, image_h, image_w, image_h
])
lmk_res = {
'image_id': im_id,
'landmark': lmk_decode,
'score': score,
}
xywh_res.append(lmk_res)
k += 1
return xywh_res
def image_eval(pred, gt, ignore, iou_thresh):
""" single image evaluation
pred: Nx5 xyxys
gt: Nx4 xywh
ignore:
"""
_pred = pred.copy()
_gt = gt.copy()
pred_recall = np.zeros(_pred.shape[0])
recall_list = np.zeros(_gt.shape[0])
proposal_list = np.ones(_pred.shape[0])
_gt[:, 2] = _gt[:, 2] + _gt[:, 0]
_gt[:, 3] = _gt[:, 3] + _gt[:, 1]
overlaps = bbox_overlaps(_pred[:, :4], _gt)
for h in range(_pred.shape[0]):
gt_overlap = overlaps[h]
max_overlap, max_idx = gt_overlap.max(), gt_overlap.argmax()
if max_overlap >= iou_thresh:
if ignore[max_idx] == 0:
recall_list[max_idx] = -1
proposal_list[h] = -1
elif recall_list[max_idx] == 0:
recall_list[max_idx] = 1
r_keep_index = np.where(recall_list == 1)[0]
pred_recall[h] = len(r_keep_index)
return pred_recall, proposal_list
def bbox_overlaps(boxes1, boxes2):
"""
Parameters
----------
boxes1: (N, 4) ndarray of float
boxes2: (K, 4) ndarray of float
Returns
-------
overlaps: (N, K) ndarray of overlap between boxes1 and boxes2
"""
# Calculate the area of each box
box_areas1 = (boxes1[:, 2] - boxes1[:, 0] + 1) * (
boxes1[:, 3] - boxes1[:, 1] + 1)
box_areas2 = (boxes2[:, 2] - boxes2[:, 0] + 1) * (
boxes2[:, 3] - boxes2[:, 1] + 1)
# Calculate the intersection areas
iw = np.minimum(boxes1[:, None, 2], boxes2[None, :, 2]) - np.maximum(
boxes1[:, None, 0], boxes2[None, :, 0]) + 1
ih = np.minimum(boxes1[:, None, 3], boxes2[None, :, 3]) - np.maximum(
boxes1[:, None, 1], boxes2[None, :, 1]) + 1
# Ensure that the intersection width and height are non-negative
iw = np.maximum(iw, 0)
ih = np.maximum(ih, 0)
# Calculate the intersection area
intersection = iw * ih
# Calculate the union area
    union = box_areas1[:, None] + box_areas2[None, :] - intersection
union = np.maximum(union, 1e-8)
# Calculate the overlaps (intersection over union)
overlaps = intersection / union
return overlaps
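# Hypothetical sketch (illustration only, not part of the original module):
# bbox_overlaps broadcasts to an (N, K) IoU matrix using the same inclusive
# +1 pixel convention as the function above.
def _demo_bbox_overlaps():
    a = np.array([[0., 0., 9., 9.]])  # a 10x10 box
    b = np.array([[0., 0., 9., 9.], [5., 5., 14., 14.]])
    iou = bbox_overlaps(a, b)  # shape (1, 2)
    assert np.isclose(iou[0, 0], 1.0)
    assert np.isclose(iou[0, 1], 25.0 / 175.0)  # 5x5 overlap over union
    return iou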
def img_pr_info(thresh_num, pred_info, proposal_list, pred_recall):
pr_info = np.zeros((thresh_num, 2)).astype('float')
for t in range(thresh_num):
thresh = 1 - (t+1)/thresh_num
r_index = np.where(pred_info[:, 4] >= thresh)[0]
if len(r_index) == 0:
pr_info[t, 0] = 0
pr_info[t, 1] = 0
else:
r_index = r_index[-1]
p_index = np.where(proposal_list[:r_index+1] == 1)[0]
pr_info[t, 0] = len(p_index)
pr_info[t, 1] = pred_recall[r_index]
return pr_info
def dataset_pr_info(thresh_num, pr_curve, count_face):
_pr_curve = np.zeros((thresh_num, 2))
for i in range(thresh_num):
_pr_curve[i, 0] = pr_curve[i, 1] / pr_curve[i, 0]
_pr_curve[i, 1] = pr_curve[i, 1] / count_face
return _pr_curve
def voc_ap(rec, prec):
# correct AP calculation
# first append sentinel values at the end
mrec = np.concatenate(([0.], rec, [1.]))
mpre = np.concatenate(([0.], prec, [0.]))
# compute the precision envelope
for i in range(mpre.size - 1, 0, -1):
mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
# to calculate area under PR curve, look for points
# where X axis (recall) changes value
i = np.where(mrec[1:] != mrec[:-1])[0]
# and sum (\Delta recall) * prec
ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
return ap
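# Hypothetical end-to-end sketch (illustration only, not part of the original
# module) of the AP pipeline above on synthetic data; a real run feeds
# per-image detections and ground truth through image_eval first. With two
# gt faces and the first two predictions matching them, AP comes out as 1.0.
def _demo_widerface_ap(thresh_num=1000):
    pred_info = np.array([[0., 0., 10., 10., 1.0],
                          [0., 0., 10., 10., 0.6],
                          [0., 0., 10., 10., 0.3]])
    proposal_list = np.ones(3)         # no prediction is ignored
    pred_recall = np.array([1, 2, 2])  # cumulative matched gt count
    pr_curve = img_pr_info(thresh_num, pred_info, proposal_list, pred_recall)
    pr_curve = dataset_pr_info(thresh_num, pr_curve, count_face=2)
    ap = voc_ap(pr_curve[:, 1], pr_curve[:, 0])  # (recall, precision)
    assert np.isclose(ap, 1.0)
    return ap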
================================================
FILE: ppdet/model_zoo/.gitignore
================================================
MODEL_ZOO
================================================
FILE: ppdet/model_zoo/__init__.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import model_zoo
from .model_zoo import *
__all__ = model_zoo.__all__
================================================
FILE: ppdet/model_zoo/model_zoo.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path as osp
import pkg_resources
try:
from collections.abc import Sequence
except:
from collections import Sequence
from ppdet.core.workspace import load_config, create
from ppdet.utils.checkpoint import load_weight
from ppdet.utils.download import get_config_path
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
__all__ = [
'list_model', 'get_config_file', 'get_weights_url', 'get_model',
'MODEL_ZOO_FILENAME'
]
MODEL_ZOO_FILENAME = 'MODEL_ZOO'
def list_model(filters=[]):
model_zoo_file = pkg_resources.resource_filename('ppdet.model_zoo',
MODEL_ZOO_FILENAME)
with open(model_zoo_file) as f:
model_names = f.read().splitlines()
# filter model_name
def filt(name):
for f in filters:
if name.find(f) < 0:
return False
return True
if isinstance(filters, str) or not isinstance(filters, Sequence):
filters = [filters]
model_names = [name for name in model_names if filt(name)]
if len(model_names) == 0 and len(filters) > 0:
raise ValueError("no model found, please check filters seeting, "
"filters can be set as following kinds:\n"
"\tDataset: coco, voc ...\n"
"\tArchitecture: yolo, rcnn, ssd ...\n"
"\tBackbone: resnet, vgg, darknet ...\n")
model_str = "Available Models:\n"
for model_name in model_names:
model_str += "\t{}\n".format(model_name)
logger.info(model_str)
# models and configs are saved on bcebos under the dygraph directory
def get_config_file(model_name):
return get_config_path("ppdet://configs/{}.yml".format(model_name))
def get_weights_url(model_name):
return "ppdet://models/{}.pdparams".format(osp.split(model_name)[-1])
def get_model(model_name, pretrained=True):
cfg_file = get_config_file(model_name)
cfg = load_config(cfg_file)
model = create(cfg.architecture)
if pretrained:
load_weight(model, get_weights_url(model_name))
return model
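# Hypothetical usage sketch (illustration only, not part of the original
# module): list the zoo entries that match a filter, then build one with
# pretrained weights. The model name mirrors ppdet/model_zoo/tests; running
# this needs the MODEL_ZOO file and network access for the download.
def _demo_model_zoo():
    list_model(['ppyolo'])
    model = get_model('ppyolo/ppyolo_tiny_650e_coco', pretrained=True)
    model.eval()
    return model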
================================================
FILE: ppdet/model_zoo/tests/__init__.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
================================================
FILE: ppdet/model_zoo/tests/test_get_model.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import paddle
import ppdet
import unittest
# NOTE: downloading weights takes time, so we choose
# a small model for unit testing
MODEL_NAME = 'ppyolo/ppyolo_tiny_650e_coco'
class TestGetConfigFile(unittest.TestCase):
def test_main(self):
try:
cfg_file = ppdet.model_zoo.get_config_file(MODEL_NAME)
assert os.path.isfile(cfg_file)
except:
self.assertTrue(False)
class TestGetModel(unittest.TestCase):
def test_main(self):
try:
model = ppdet.model_zoo.get_model(MODEL_NAME)
assert isinstance(model, paddle.nn.Layer)
except:
self.assertTrue(False)
if __name__ == '__main__':
unittest.main()
================================================
FILE: ppdet/model_zoo/tests/test_list_model.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import ppdet
class TestListModel(unittest.TestCase):
def setUp(self):
self._filter = []
def test_main(self):
try:
ppdet.model_zoo.list_model(self._filter)
self.assertTrue(True)
except:
self.assertTrue(False)
class TestListModelYOLO(TestListModel):
def setUp(self):
self._filter = ['yolo']
class TestListModelRCNN(TestListModel):
def setUp(self):
self._filter = ['rcnn']
class TestListModelSSD(TestListModel):
def setUp(self):
self._filter = ['ssd']
class TestListModelMultiFilter(TestListModel):
def setUp(self):
self._filter = ['yolo', 'darknet']
class TestListModelError(unittest.TestCase):
def setUp(self):
self._filter = ['xxx']
def test_main(self):
try:
ppdet.model_zoo.list_model(self._filter)
self.assertTrue(False)
except ValueError:
self.assertTrue(True)
if __name__ == '__main__':
unittest.main()
================================================
FILE: ppdet/modeling/__init__.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
warnings.filterwarnings(
action='ignore', category=DeprecationWarning, module='ops')
from . import ops
from . import backbones
from . import necks
from . import proposal_generator
from . import heads
from . import losses
from . import architectures
from . import post_process
from . import layers
from . import reid
from . import mot
from . import transformers
from . import assigners
from . import rbox_utils
from . import ssod
from .ops import *
from .backbones import *
from .necks import *
from .proposal_generator import *
from .heads import *
from .losses import *
from .architectures import *
from .post_process import *
from .layers import *
from .reid import *
from .mot import *
from .transformers import *
from .assigners import *
from .rbox_utils import *
from .ssod import *
================================================
FILE: ppdet/modeling/architectures/__init__.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import meta_arch
from . import faster_rcnn
from . import mask_rcnn
from . import yolo
from . import ppyoloe
from . import cascade_rcnn
from . import ssd
from . import fcos
from . import solov2
from . import ttfnet
from . import s2anet
from . import keypoint_hrhrnet
from . import keypoint_hrnet
from . import keypoint_vitpose
from . import jde
from . import deepsort
from . import fairmot
from . import centernet
from . import gfl
from . import picodet
from . import detr
from . import sparse_rcnn
from . import tood
from . import retinanet
from . import bytetrack
from . import yolox
from . import yolof
from . import pose3d_metro
from . import centertrack
from . import queryinst
from . import detr_ssod
from . import multi_stream_detector
from . import clrnet
from .meta_arch import *
from .faster_rcnn import *
from .mask_rcnn import *
from .yolo import *
from .ppyoloe import *
from .cascade_rcnn import *
from .ssd import *
from .fcos import *
from .solov2 import *
from .ttfnet import *
from .s2anet import *
from .keypoint_hrhrnet import *
from .keypoint_hrnet import *
from .keypoint_vitpose import *
from .jde import *
from .deepsort import *
from .fairmot import *
from .centernet import *
from .blazeface import *
from .gfl import *
from .picodet import *
from .detr import *
from .sparse_rcnn import *
from .tood import *
from .retinanet import *
from .bytetrack import *
from .yolox import *
from .yolof import *
from .pose3d_metro import *
from .centertrack import *
from .queryinst import *
from .keypoint_petr import *
from .detr_ssod import *
from .multi_stream_detector import *
from .clrnet import *
from . import rtdetrv3
from .rtdetrv3 import *
================================================
FILE: ppdet/modeling/architectures/blazeface.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
import paddle
import paddle.nn.functional as F
__all__ = ['BlazeFace']
@register
class BlazeFace(BaseArch):
"""
BlazeFace: Sub-millisecond Neural Face Detection on Mobile GPUs,
see https://arxiv.org/abs/1907.05047
Args:
backbone (nn.Layer): backbone instance
neck (nn.Layer): neck instance
blaze_head (nn.Layer): `blazeHead` instance
post_process (object): `BBoxPostProcess` instance
"""
__category__ = 'architecture'
__inject__ = ['post_process']
def __init__(self, backbone, blaze_head, neck, post_process):
super(BlazeFace, self).__init__()
self.backbone = backbone
self.neck = neck
self.blaze_head = blaze_head
self.post_process = post_process
@classmethod
def from_config(cls, cfg, *args, **kwargs):
# backbone
backbone = create(cfg['backbone'])
# fpn
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
# head
kwargs = {'input_shape': neck.out_shape}
blaze_head = create(cfg['blaze_head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
'blaze_head': blaze_head,
}
def _forward(self):
# Backbone
body_feats = self.backbone(self.inputs)
# neck
neck_feats = self.neck(body_feats)
# blaze Head
if self.training:
return self.blaze_head(neck_feats, self.inputs['image'],
self.inputs['gt_bbox'],
self.inputs['gt_class'])
else:
preds, anchors = self.blaze_head(neck_feats, self.inputs['image'])
bbox, bbox_num, nms_keep_idx = self.post_process(
preds, anchors, self.inputs['im_shape'],
self.inputs['scale_factor'])
if self.use_extra_data:
                extra_data = {}  # record the bbox output before nms, such as scores and nms_keep_idx
"""extra_data:{
'scores': predict scores,
'nms_keep_idx': bbox index before nms,
}
"""
                preds_logits = preds[1]  # list of [1, num_bbox, num_class] logits
extra_data['scores'] = F.softmax(paddle.concat(
preds_logits, axis=1)).transpose([0, 2, 1])
extra_data['logits'] = paddle.concat(
preds_logits, axis=1).transpose([0, 2, 1])
extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms
return bbox, bbox_num, extra_data
else:
return bbox, bbox_num
def get_loss(self, ):
return {"loss": self._forward()}
def get_pred(self):
if self.use_extra_data:
bbox_pred, bbox_num, extra_data = self._forward()
output = {
"bbox": bbox_pred,
"bbox_num": bbox_num,
"extra_data": extra_data
}
else:
bbox_pred, bbox_num = self._forward()
output = {
"bbox": bbox_pred,
"bbox_num": bbox_num,
}
return output
================================================
FILE: ppdet/modeling/architectures/bytetrack.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['ByteTrack']
@register
class ByteTrack(BaseArch):
"""
ByteTrack network, see https://arxiv.org/abs/2110.06864
Args:
detector (object): detector model instance
reid (object): reid model instance, default None
tracker (object): tracker instance
"""
__category__ = 'architecture'
def __init__(self,
detector='YOLOX',
reid=None,
tracker='JDETracker'):
super(ByteTrack, self).__init__()
self.detector = detector
self.reid = reid
self.tracker = tracker
@classmethod
def from_config(cls, cfg, *args, **kwargs):
detector = create(cfg['detector'])
if cfg['reid'] != 'None':
reid = create(cfg['reid'])
else:
reid = None
tracker = create(cfg['tracker'])
return {
"detector": detector,
"reid": reid,
"tracker": tracker,
}
def _forward(self):
det_outs = self.detector(self.inputs)
if self.training:
return det_outs
else:
if self.reid is not None:
assert 'crops' in self.inputs
crops = self.inputs['crops']
pred_embs = self.reid(crops)
else:
pred_embs = None
det_outs['embeddings'] = pred_embs
return det_outs
def get_loss(self):
return self._forward()
def get_pred(self):
return self._forward()
================================================
FILE: ppdet/modeling/architectures/cascade_rcnn.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['CascadeRCNN']
@register
class CascadeRCNN(BaseArch):
"""
Cascade R-CNN network, see https://arxiv.org/abs/1712.00726
Args:
backbone (object): backbone instance
rpn_head (object): `RPNHead` instance
bbox_head (object): `BBoxHead` instance
bbox_post_process (object): `BBoxPostProcess` instance
neck (object): 'FPN' instance
mask_head (object): `MaskHead` instance
mask_post_process (object): `MaskPostProcess` instance
"""
__category__ = 'architecture'
__inject__ = [
'bbox_post_process',
'mask_post_process',
]
def __init__(self,
backbone,
rpn_head,
bbox_head,
bbox_post_process,
neck=None,
mask_head=None,
mask_post_process=None):
super(CascadeRCNN, self).__init__()
self.backbone = backbone
self.rpn_head = rpn_head
self.bbox_head = bbox_head
self.bbox_post_process = bbox_post_process
self.neck = neck
self.mask_head = mask_head
self.mask_post_process = mask_post_process
self.with_mask = mask_head is not None
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = cfg['neck'] and create(cfg['neck'], **kwargs)
out_shape = neck and neck.out_shape or backbone.out_shape
kwargs = {'input_shape': out_shape}
rpn_head = create(cfg['rpn_head'], **kwargs)
bbox_head = create(cfg['bbox_head'], **kwargs)
out_shape = neck and out_shape or bbox_head.get_head().out_shape
kwargs = {'input_shape': out_shape}
mask_head = cfg['mask_head'] and create(cfg['mask_head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"rpn_head": rpn_head,
"bbox_head": bbox_head,
"mask_head": mask_head,
}
def _forward(self):
body_feats = self.backbone(self.inputs)
if self.neck is not None:
body_feats = self.neck(body_feats)
if self.training:
rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs)
bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num,
self.inputs)
rois, rois_num = self.bbox_head.get_assigned_rois()
bbox_targets = self.bbox_head.get_assigned_targets()
if self.with_mask:
mask_loss = self.mask_head(body_feats, rois, rois_num,
self.inputs, bbox_targets, bbox_feat)
return rpn_loss, bbox_loss, mask_loss
else:
return rpn_loss, bbox_loss, {}
else:
rois, rois_num, _ = self.rpn_head(body_feats, self.inputs)
preds, _ = self.bbox_head(body_feats, rois, rois_num, self.inputs)
refined_rois = self.bbox_head.get_refined_rois()
im_shape = self.inputs['im_shape']
scale_factor = self.inputs['scale_factor']
bbox, bbox_num, nms_keep_idx = self.bbox_post_process(
preds, (refined_rois, rois_num), im_shape, scale_factor)
# rescale the prediction back to origin image
bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred(
bbox, bbox_num, im_shape, scale_factor)
if not self.with_mask:
return bbox_pred, bbox_num, None
mask_out = self.mask_head(body_feats, bbox, bbox_num, self.inputs)
origin_shape = self.bbox_post_process.get_origin_shape()
mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num,
origin_shape)
return bbox_pred, bbox_num, mask_pred
def get_loss(self, ):
rpn_loss, bbox_loss, mask_loss = self._forward()
loss = {}
loss.update(rpn_loss)
loss.update(bbox_loss)
if self.with_mask:
loss.update(mask_loss)
total_loss = paddle.add_n(list(loss.values()))
loss.update({'loss': total_loss})
return loss
def get_pred(self):
bbox_pred, bbox_num, mask_pred = self._forward()
output = {
'bbox': bbox_pred,
'bbox_num': bbox_num,
}
if self.with_mask:
output.update({'mask': mask_pred})
return output
================================================
FILE: ppdet/modeling/architectures/centernet.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['CenterNet']
@register
class CenterNet(BaseArch):
"""
CenterNet network, see http://arxiv.org/abs/1904.07850
Args:
backbone (object): backbone instance
neck (object): FPN instance, default use 'CenterNetDLAFPN'
head (object): 'CenterNetHead' instance
post_process (object): 'CenterNetPostProcess' instance
for_mot (bool): whether return other features used in tracking model
"""
__category__ = 'architecture'
__inject__ = ['post_process']
__shared__ = ['for_mot']
def __init__(self,
backbone,
neck='CenterNetDLAFPN',
head='CenterNetHead',
post_process='CenterNetPostProcess',
for_mot=False):
super(CenterNet, self).__init__()
self.backbone = backbone
self.neck = neck
self.head = head
self.post_process = post_process
self.for_mot = for_mot
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = cfg['neck'] and create(cfg['neck'], **kwargs)
out_shape = neck and neck.out_shape or backbone.out_shape
kwargs = {'input_shape': out_shape}
head = create(cfg['head'], **kwargs)
return {'backbone': backbone, 'neck': neck, "head": head}
def _forward(self):
neck_feat = self.backbone(self.inputs)
if self.neck is not None:
neck_feat = self.neck(neck_feat)
head_out = self.head(neck_feat, self.inputs)
if self.for_mot:
head_out.update({'neck_feat': neck_feat})
elif self.training:
head_out['loss'] = head_out.pop('det_loss')
return head_out
def get_pred(self):
head_out = self._forward()
bbox, bbox_num, bbox_inds, topk_clses, topk_ys, topk_xs = self.post_process(
head_out['heatmap'],
head_out['size'],
head_out['offset'],
im_shape=self.inputs['im_shape'],
scale_factor=self.inputs['scale_factor'])
if self.for_mot:
output = {
"bbox": bbox,
"bbox_num": bbox_num,
"bbox_inds": bbox_inds,
"topk_clses": topk_clses,
"topk_ys": topk_ys,
"topk_xs": topk_xs,
"neck_feat": head_out['neck_feat']
}
else:
output = {"bbox": bbox, "bbox_num": bbox_num}
return output
def get_loss(self):
return self._forward()
================================================
FILE: ppdet/modeling/architectures/centertrack.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import math
import numpy as np
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
from ..keypoint_utils import affine_transform
from ppdet.data.transform.op_helper import gaussian_radius, gaussian2D, draw_umich_gaussian
__all__ = ['CenterTrack']
@register
class CenterTrack(BaseArch):
"""
CenterTrack network, see http://arxiv.org/abs/2004.01177
Args:
detector (object): 'CenterNet' instance
plugin_head (object): 'CenterTrackHead' instance
tracker (object): 'CenterTracker' instance
"""
__category__ = 'architecture'
__shared__ = ['mot_metric']
def __init__(self,
detector='CenterNet',
plugin_head='CenterTrackHead',
tracker='CenterTracker',
mot_metric=False):
super(CenterTrack, self).__init__()
self.detector = detector
self.plugin_head = plugin_head
self.tracker = tracker
self.mot_metric = mot_metric
self.pre_image = None
self.deploy = False
@classmethod
def from_config(cls, cfg, *args, **kwargs):
detector = create(cfg['detector'])
detector_out_shape = detector.neck and detector.neck.out_shape or detector.backbone.out_shape
kwargs = {'input_shape': detector_out_shape}
plugin_head = create(cfg['plugin_head'], **kwargs)
tracker = create(cfg['tracker'])
return {
'detector': detector,
'plugin_head': plugin_head,
'tracker': tracker,
}
def _forward(self):
if self.training:
det_outs = self.detector(self.inputs)
neck_feat = det_outs['neck_feat']
losses = {}
for k, v in det_outs.items():
if 'loss' not in k: continue
losses.update({k: v})
plugin_outs = self.plugin_head(neck_feat, self.inputs)
for k, v in plugin_outs.items():
if 'loss' not in k: continue
losses.update({k: v})
losses['loss'] = det_outs['det_loss'] + plugin_outs['plugin_loss']
return losses
else:
if not self.mot_metric:
# detection, support bs>=1
det_outs = self.detector(self.inputs)
return {
'bbox': det_outs['bbox'],
'bbox_num': det_outs['bbox_num']
}
else:
# MOT, only support bs=1
if not self.deploy:
if self.pre_image is None:
self.pre_image = self.inputs['image']
# initializing tracker for the first frame
self.tracker.init_track([])
self.inputs['pre_image'] = self.pre_image
self.pre_image = self.inputs[
'image'] # Note: update for next image
# render input heatmap from tracker status
pre_hm = self.get_additional_inputs(
self.tracker.tracks, self.inputs, with_hm=True)
self.inputs['pre_hm'] = paddle.to_tensor(pre_hm)
# model inference
det_outs = self.detector(self.inputs)
neck_feat = det_outs['neck_feat']
result = self.plugin_head(
neck_feat, self.inputs, det_outs['bbox'],
det_outs['bbox_inds'], det_outs['topk_clses'],
det_outs['topk_ys'], det_outs['topk_xs'])
if not self.deploy:
# convert the cropped and 4x downsampled output coordinate system
# back to the input image coordinate system
result = self.plugin_head.centertrack_post_process(
result, self.inputs, self.tracker.out_thresh)
return result
def get_pred(self):
return self._forward()
def get_loss(self):
return self._forward()
def reset_tracking(self):
self.tracker.reset()
self.pre_image = None
def get_additional_inputs(self, dets, meta, with_hm=True):
# Render input heatmap from previous trackings.
trans_input = meta['trans_input'][0].numpy()
inp_width, inp_height = int(meta['inp_width'][0]), int(meta[
'inp_height'][0])
input_hm = np.zeros((1, inp_height, inp_width), dtype=np.float32)
for det in dets:
if det['score'] < self.tracker.pre_thresh:
continue
bbox = affine_transform_bbox(det['bbox'], trans_input, inp_width,
inp_height)
h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
if (h > 0 and w > 0):
radius = gaussian_radius(
(math.ceil(h), math.ceil(w)), min_overlap=0.7)
radius = max(0, int(radius))
ct = np.array(
[(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
dtype=np.float32)
ct_int = ct.astype(np.int32)
if with_hm:
input_hm[0] = draw_umich_gaussian(input_hm[0], ct_int,
radius)
if with_hm:
input_hm = input_hm[np.newaxis]
return input_hm
def affine_transform_bbox(bbox, trans, width, height):
bbox = np.array(copy.deepcopy(bbox), dtype=np.float32)
bbox[:2] = affine_transform(bbox[:2], trans)
bbox[2:] = affine_transform(bbox[2:], trans)
bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, width - 1)
bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, height - 1)
return bbox
================================================
FILE: ppdet/modeling/architectures/clrnet.py
================================================
from .meta_arch import BaseArch
from ppdet.core.workspace import register, create
from paddle import in_dynamic_mode
__all__ = ['CLRNet']
@register
class CLRNet(BaseArch):
__category__ = 'architecture'
def __init__(self,
backbone="CLRResNet",
neck="CLRFPN",
clr_head="CLRHead",
post_process=None):
super(CLRNet, self).__init__()
self.backbone = backbone
self.neck = neck
self.heads = clr_head
self.post_process = post_process
@classmethod
def from_config(cls, cfg, *args, **kwargs):
# backbone
backbone = create(cfg['backbone'])
# fpn
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
# head
kwargs = {'input_shape': neck.out_shape}
clr_head = create(cfg['clr_head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
'clr_head': clr_head,
}
def _forward(self):
# Backbone
body_feats = self.backbone(self.inputs['image'])
# neck
neck_feats = self.neck(body_feats)
        # CLR Head
if self.training:
output = self.heads(neck_feats, self.inputs)
else:
output = self.heads(neck_feats)
            output = {'lanes': output}
            # TODO: remove this hard-coded workaround for the as_lanes=False
            # problem in clrnet_head.py `get_lanes` under static mode
            if in_dynamic_mode():
                output = self.heads.get_lanes(output['lanes'])
output = {
"lanes": output,
"img_path": self.inputs['full_img_path'],
"img_name": self.inputs['img_name']
}
return output
def get_loss(self):
return self._forward()
def get_pred(self):
return self._forward()
================================================
FILE: ppdet/modeling/architectures/deepsort.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
from ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box
__all__ = ['DeepSORT']
@register
class DeepSORT(BaseArch):
"""
DeepSORT network, see https://arxiv.org/abs/1703.07402
Args:
detector (object): detector model instance
reid (object): reid model instance
tracker (object): tracker instance
"""
__category__ = 'architecture'
def __init__(self,
detector='YOLOv3',
reid='PCBPyramid',
tracker='DeepSORTTracker'):
super(DeepSORT, self).__init__()
self.detector = detector
self.reid = reid
self.tracker = tracker
@classmethod
def from_config(cls, cfg, *args, **kwargs):
if cfg['detector'] != 'None':
detector = create(cfg['detector'])
else:
detector = None
reid = create(cfg['reid'])
tracker = create(cfg['tracker'])
return {
"detector": detector,
"reid": reid,
"tracker": tracker,
}
def _forward(self):
crops = self.inputs['crops']
outs = {}
outs['embeddings'] = self.reid(crops)
return outs
def get_pred(self):
return self._forward()
================================================
FILE: ppdet/modeling/architectures/detr.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from .meta_arch import BaseArch
from ppdet.core.workspace import register, create
__all__ = ['DETR']
# Deformable DETR and DINO use the same architecture as DETR
@register
class DETR(BaseArch):
__category__ = 'architecture'
__inject__ = ['post_process', 'post_process_semi']
__shared__ = ['with_mask', 'exclude_post_process']
def __init__(self,
backbone,
transformer='DETRTransformer',
detr_head='DETRHead',
neck=None,
post_process='DETRPostProcess',
post_process_semi=None,
with_mask=False,
exclude_post_process=False):
super(DETR, self).__init__()
self.backbone = backbone
self.transformer = transformer
self.detr_head = detr_head
self.neck = neck
self.post_process = post_process
self.with_mask = with_mask
self.exclude_post_process = exclude_post_process
self.post_process_semi = post_process_semi
@classmethod
def from_config(cls, cfg, *args, **kwargs):
# backbone
backbone = create(cfg['backbone'])
# neck
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs) if cfg['neck'] else None
# transformer
if neck is not None:
kwargs = {'input_shape': neck.out_shape}
transformer = create(cfg['transformer'], **kwargs)
# head
kwargs = {
'hidden_dim': transformer.hidden_dim,
'nhead': transformer.nhead,
'input_shape': backbone.out_shape
}
detr_head = create(cfg['detr_head'], **kwargs)
return {
'backbone': backbone,
'transformer': transformer,
"detr_head": detr_head,
"neck": neck
}
def _forward(self):
# Backbone
body_feats = self.backbone(self.inputs)
# Neck
if self.neck is not None:
body_feats = self.neck(body_feats)
# Transformer
pad_mask = self.inputs.get('pad_mask', None)
out_transformer = self.transformer(body_feats, pad_mask, self.inputs)
# DETR Head
if self.training:
detr_losses = self.detr_head(out_transformer, body_feats,
self.inputs)
detr_losses.update({
'loss': paddle.add_n(
[v for k, v in detr_losses.items() if 'log' not in k])
})
return detr_losses
else:
preds = self.detr_head(out_transformer, body_feats)
if self.exclude_post_process:
bbox, bbox_num, mask = preds
else:
bbox, bbox_num, mask = self.post_process(
preds, self.inputs['im_shape'], self.inputs['scale_factor'],
                    self.inputs['image'].shape[2:])
output = {'bbox': bbox, 'bbox_num': bbox_num}
if self.with_mask:
output['mask'] = mask
return output
def get_loss(self):
return self._forward()
def get_pred(self):
return self._forward()
================================================
FILE: ppdet/modeling/architectures/detr_ssod.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
import paddle.nn.functional as F
from ppdet.core.workspace import register, create, merge_config
from ppdet.utils.logger import setup_logger
from ppdet.modeling.ssod.utils import filter_invalid
from .multi_stream_detector import MultiSteamDetector
logger = setup_logger(__name__)
__all__ = ['DETR_SSOD']
__shared__ = ['num_classes']
@register
class DETR_SSOD(MultiSteamDetector):
def __init__(self,
teacher,
student,
train_cfg=None,
test_cfg=None,
RTDETRTransformer=None,
num_classes=80):
super(DETR_SSOD, self).__init__(
dict(
teacher=teacher, student=student),
train_cfg=train_cfg,
test_cfg=test_cfg, )
self.ema_start_iters = train_cfg['ema_start_iters']
self.momentum = 0.9996
self.cls_thr = None
self.cls_thr_ig = None
self.num_classes = num_classes
if train_cfg is not None:
self.freeze("teacher")
self.unsup_weight = self.train_cfg['unsup_weight']
self.sup_weight = self.train_cfg['sup_weight']
self._teacher = None
self._student = None
self._transformer = None
@classmethod
def from_config(cls, cfg):
teacher = create(cfg['teacher'])
merge_config(cfg)
student = create(cfg['student'])
train_cfg = cfg['train_cfg']
test_cfg = cfg['test_cfg']
RTDETRTransformer = cfg['RTDETRTransformer']
return {
'teacher': teacher,
'student': student,
'train_cfg': train_cfg,
'test_cfg': test_cfg,
'RTDETRTransformer': RTDETRTransformer
}
def forward_train(self, inputs, **kwargs):
if isinstance(inputs, dict):
iter_id = inputs['iter_id']
elif isinstance(inputs, list):
iter_id = inputs[-1]
if iter_id == self.ema_start_iters:
self.update_ema_model(momentum=0)
elif iter_id > self.ema_start_iters:
self.update_ema_model(momentum=self.momentum)
if iter_id > self.ema_start_iters:
data_sup_w, data_sup_s, data_unsup_w, data_unsup_s, _ = inputs
if data_sup_w['image'].shape != data_sup_s['image'].shape:
data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w,
data_sup_s)
if 'gt_bbox' in data_unsup_s.keys():
del data_unsup_s['gt_bbox']
if 'gt_class' in data_unsup_s.keys():
del data_unsup_s['gt_class']
if 'gt_class' in data_unsup_w.keys():
del data_unsup_w['gt_class']
if 'gt_bbox' in data_unsup_w.keys():
del data_unsup_w['gt_bbox']
for k, v in data_sup_s.items():
if k in ['epoch_id']:
continue
elif k in ['gt_class', 'gt_bbox', 'is_crowd']:
data_sup_s[k].extend(data_sup_w[k])
else:
data_sup_s[k] = paddle.concat([v, data_sup_w[k]])
loss = {}
body_feats = self.student.backbone(data_sup_s)
if self.student.neck is not None:
body_feats = self.student.neck(body_feats)
out_transformer = self.student.transformer(body_feats, None,
data_sup_s)
sup_loss = self.student.detr_head(out_transformer, body_feats,
data_sup_s)
sup_loss.update({
'loss': paddle.add_n(
[v for k, v in sup_loss.items() if 'log' not in k])
})
sup_loss = {"sup_" + k: v for k, v in sup_loss.items()}
loss.update(**sup_loss)
unsup_loss = self.foward_unsup_train(data_unsup_w, data_unsup_s)
unsup_loss.update({
'loss': paddle.add_n(
[v for k, v in unsup_loss.items() if 'log' not in k])
})
unsup_loss = {"unsup_" + k: v for k, v in unsup_loss.items()}
loss.update(**unsup_loss)
loss.update({'loss': loss['sup_loss'] + loss['unsup_loss']})
else:
if iter_id == self.ema_start_iters:
logger.info("start semi_supervised_traing")
data_sup_w, data_sup_s, data_unsup_w, data_unsup_s, _ = inputs
if data_sup_w['image'].shape != data_sup_s['image'].shape:
data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w,
data_sup_s)
for k, v in data_sup_s.items():
if k in ['epoch_id']:
continue
elif k in ['gt_class', 'gt_bbox', 'is_crowd']:
data_sup_s[k].extend(data_sup_w[k])
else:
data_sup_s[k] = paddle.concat([v, data_sup_w[k]])
            loss = {}
            sup_loss = self.student(data_sup_s)
            # before the EMA teacher starts, report zero-valued unsupervised
            # losses so the logged keys stay consistent across iterations
            unsup_loss = {
                "unsup_" + k: v * paddle.to_tensor(0)
                for k, v in sup_loss.items()
            }
            sup_loss = {"sup_" + k: v for k, v in sup_loss.items()}
            loss.update(**sup_loss)
            loss.update(**unsup_loss)
            loss.update({'loss': loss['sup_loss']})
return loss
def foward_unsup_train(self, data_unsup_w, data_unsup_s):
with paddle.no_grad():
body_feats = self.teacher.backbone(data_unsup_w)
if self.teacher.neck is not None:
body_feats = self.teacher.neck(body_feats, is_teacher=True)
out_transformer = self.teacher.transformer(
body_feats, None, data_unsup_w, is_teacher=True)
preds = self.teacher.detr_head(out_transformer, body_feats)
bbox, bbox_num = self.teacher.post_process_semi(preds)
self.place = body_feats[0].place
proposal_bbox_list = bbox[:, -4:]
proposal_bbox_list = proposal_bbox_list.split(
tuple(np.array(bbox_num)), 0)
proposal_label_list = paddle.cast(bbox[:, :1], np.float32)
proposal_label_list = proposal_label_list.split(
tuple(np.array(bbox_num)), 0)
proposal_score_list = paddle.cast(bbox[:, 1:self.num_classes + 1],
np.float32)
proposal_score_list = proposal_score_list.split(
tuple(np.array(bbox_num)), 0)
proposal_bbox_list = [
paddle.to_tensor(
p, place=self.place) for p in proposal_bbox_list
]
proposal_label_list = [
paddle.to_tensor(
p, place=self.place) for p in proposal_label_list
]
# filter invalid box roughly
if isinstance(self.train_cfg['pseudo_label_initial_score_thr'], float):
thr = self.train_cfg['pseudo_label_initial_score_thr']
else:
# TODO: use dynamic threshold
raise NotImplementedError(
"Dynamic Threshold is not implemented yet.")
proposal_bbox_list, proposal_label_list, proposal_score_list = list(
zip(* [
filter_invalid(
proposal[:, :4],
proposal_label,
proposal_score,
thr=thr,
min_size=self.train_cfg['min_pseduo_box_size'], )
for proposal, proposal_label, proposal_score in
zip(proposal_bbox_list, proposal_label_list,
proposal_score_list)
]))
teacher_bboxes = list(proposal_bbox_list)
teacher_labels = proposal_label_list
teacher_info = [teacher_bboxes, teacher_labels]
student_unsup = data_unsup_s
return self.compute_pseudo_label_loss(student_unsup, teacher_info,
proposal_score_list)
def compute_pseudo_label_loss(self, student_unsup, teacher_info,
proposal_score_list):
pseudo_bboxes = list(teacher_info[0])
pseudo_labels = list(teacher_info[1])
losses = dict()
for i in range(len(pseudo_bboxes)):
if pseudo_labels[i].shape[0] == 0:
pseudo_bboxes[i] = paddle.zeros([0, 4]).numpy()
pseudo_labels[i] = paddle.zeros([0, 1]).numpy()
else:
pseudo_bboxes[i] = pseudo_bboxes[i][:, :4].numpy()
pseudo_labels[i] = pseudo_labels[i].numpy()
for i in range(len(pseudo_bboxes)):
pseudo_labels[i] = paddle.to_tensor(
pseudo_labels[i], dtype=paddle.int32, place=self.place)
pseudo_bboxes[i] = paddle.to_tensor(
pseudo_bboxes[i], dtype=paddle.float32, place=self.place)
student_unsup.update({
'gt_bbox': pseudo_bboxes,
'gt_class': pseudo_labels
})
pseudo_sum = 0
for i in range(len(pseudo_bboxes)):
pseudo_sum += pseudo_bboxes[i].sum()
        if pseudo_sum == 0:  # input fake data when there are no pseudo labels
pseudo_bboxes[0] = paddle.ones([1, 4]) - 0.5
pseudo_labels[0] = paddle.ones([1, 1]).astype('int32')
student_unsup.update({
'gt_bbox': pseudo_bboxes,
'gt_class': pseudo_labels
})
body_feats = self.student.backbone(student_unsup)
if self.student.neck is not None:
body_feats = self.student.neck(body_feats)
out_transformer = self.student.transformer(body_feats, None,
student_unsup)
losses = self.student.detr_head(out_transformer, body_feats,
student_unsup)
for n, v in losses.items():
losses[n] = v * 0
else:
gt_bbox = []
gt_class = []
images = []
proposal_score = []
for i in range(len(pseudo_bboxes)):
if pseudo_labels[i].shape[0] == 0:
continue
else:
proposal_score.append(proposal_score_list[i].max(-1)
.unsqueeze(-1))
gt_class.append(pseudo_labels[i])
gt_bbox.append(pseudo_bboxes[i])
images.append(student_unsup['image'][i])
images = paddle.stack(images)
student_unsup.update({
'image': images,
'gt_bbox': gt_bbox,
'gt_class': gt_class
})
body_feats = self.student.backbone(student_unsup)
if self.student.neck is not None:
body_feats = self.student.neck(body_feats)
out_transformer = self.student.transformer(body_feats, None,
student_unsup)
student_unsup.update({'gt_score': proposal_score})
losses = self.student.detr_head(out_transformer, body_feats,
student_unsup)
return losses
def box_cxcywh_to_xyxy(x):
x_c, y_c, w, h = x.unbind(-1)
b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]
return paddle.stack(b, axis=-1)
def box_xyxy_to_cxcywh(x):
x0, y0, x1, y1 = x.unbind(-1)
b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)]
return paddle.stack(b, axis=-1)
def get_size_with_aspect_ratio(image_size, size, max_size=None):
w, h = image_size
if max_size is not None:
min_original_size = float(min((w, h)))
max_original_size = float(max((w, h)))
if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size))
if (w <= h and w == size) or (h <= w and h == size):
return (w, h)
if w < h:
ow = size
oh = int(size * h / w)
else:
oh = size
ow = int(size * w / h)
return (ow, oh)
def align_weak_strong_shape(data_weak, data_strong):
shape_x = data_strong['image'].shape[2]
shape_y = data_strong['image'].shape[3]
target_size = [shape_x, shape_y]
data_weak['image'] = F.interpolate(
data_weak['image'],
size=target_size,
mode='bilinear',
align_corners=False)
return data_weak, data_strong
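# Hypothetical round-trip sketch (illustration only, not part of the original
# module) for the box format helpers above: cxcywh -> xyxy -> cxcywh recovers
# the input.
def _demo_box_roundtrip():
    boxes = paddle.to_tensor([[0.5, 0.5, 0.2, 0.4]])  # cx, cy, w, h
    xyxy = box_cxcywh_to_xyxy(boxes)  # [[0.4, 0.3, 0.6, 0.7]]
    back = box_xyxy_to_cxcywh(xyxy)
    assert paddle.allclose(back, boxes)
    return xyxy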
================================================
FILE: ppdet/modeling/architectures/fairmot.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['FairMOT']
@register
class FairMOT(BaseArch):
"""
FairMOT network, see http://arxiv.org/abs/2004.01888
Args:
detector (object): 'CenterNet' instance
reid (object): 'FairMOTEmbeddingHead' instance
tracker (object): 'JDETracker' instance
loss (object): 'FairMOTLoss' instance
"""
__category__ = 'architecture'
__inject__ = ['loss']
def __init__(self,
detector='CenterNet',
reid='FairMOTEmbeddingHead',
tracker='JDETracker',
loss='FairMOTLoss'):
super(FairMOT, self).__init__()
self.detector = detector
self.reid = reid
self.tracker = tracker
self.loss = loss
@classmethod
def from_config(cls, cfg, *args, **kwargs):
detector = create(cfg['detector'])
detector_out_shape = detector.neck and detector.neck.out_shape or detector.backbone.out_shape
kwargs = {'input_shape': detector_out_shape}
reid = create(cfg['reid'], **kwargs)
loss = create(cfg['loss'])
tracker = create(cfg['tracker'])
return {
'detector': detector,
'reid': reid,
'loss': loss,
'tracker': tracker
}
def _forward(self):
loss = dict()
# det_outs keys:
# train: neck_feat, det_loss, heatmap_loss, size_loss, offset_loss (optional: iou_loss)
# eval/infer: neck_feat, bbox, bbox_inds
det_outs = self.detector(self.inputs)
neck_feat = det_outs['neck_feat']
if self.training:
reid_loss = self.reid(neck_feat, self.inputs)
det_loss = det_outs['det_loss']
loss = self.loss(det_loss, reid_loss)
for k, v in det_outs.items():
if 'loss' not in k:
continue
loss.update({k: v})
loss.update({'reid_loss': reid_loss})
return loss
else:
pred_dets, pred_embs = self.reid(
neck_feat, self.inputs, det_outs['bbox'], det_outs['bbox_inds'],
det_outs['topk_clses'])
return pred_dets, pred_embs
def get_pred(self):
output = self._forward()
return output
def get_loss(self):
loss = self._forward()
return loss
================================================
FILE: ppdet/modeling/architectures/faster_rcnn.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
import numpy as np
__all__ = ['FasterRCNN']
@register
class FasterRCNN(BaseArch):
"""
Faster R-CNN network, see https://arxiv.org/abs/1506.01497
Args:
backbone (object): backbone instance
rpn_head (object): `RPNHead` instance
bbox_head (object): `BBoxHead` instance
bbox_post_process (object): `BBoxPostProcess` instance
neck (object): 'FPN' instance
"""
__category__ = 'architecture'
__inject__ = ['bbox_post_process']
def __init__(self,
backbone,
rpn_head,
bbox_head,
bbox_post_process,
neck=None):
super(FasterRCNN, self).__init__()
self.backbone = backbone
self.neck = neck
self.rpn_head = rpn_head
self.bbox_head = bbox_head
self.bbox_post_process = bbox_post_process
def init_cot_head(self, relationship):
self.bbox_head.init_cot_head(relationship)
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = cfg['neck'] and create(cfg['neck'], **kwargs)
out_shape = neck and neck.out_shape or backbone.out_shape
kwargs = {'input_shape': out_shape}
rpn_head = create(cfg['rpn_head'], **kwargs)
bbox_head = create(cfg['bbox_head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"rpn_head": rpn_head,
"bbox_head": bbox_head,
}
def _forward(self):
body_feats = self.backbone(self.inputs)
if self.neck is not None:
body_feats = self.neck(body_feats)
if self.training:
rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs)
bbox_loss, _ = self.bbox_head(body_feats, rois, rois_num,
self.inputs)
return rpn_loss, bbox_loss
else:
rois, rois_num, _ = self.rpn_head(body_feats, self.inputs)
preds, _ = self.bbox_head(body_feats, rois, rois_num, None)
im_shape = self.inputs['im_shape']
scale_factor = self.inputs['scale_factor']
bbox, bbox_num, nms_keep_idx = self.bbox_post_process(
preds, (rois, rois_num), im_shape, scale_factor)
# rescale the prediction back to origin image
bboxes, bbox_pred, bbox_num = self.bbox_post_process.get_pred(
bbox, bbox_num, im_shape, scale_factor)
if self.use_extra_data:
extra_data = {
} # record the bbox output before nms, such as scores and nms_keep_idx
"""extra_data:{
'scores': predict scores,
'nms_keep_idx': bbox index before nms,
}
"""
extra_data['scores'] = preds[1] # predicted scores (probabilities)
# TODO: get logits output
extra_data[
'nms_keep_idx'] = nms_keep_idx # bbox index before nms
return bbox_pred, bbox_num, extra_data
else:
return bbox_pred, bbox_num
def get_loss(self, ):
rpn_loss, bbox_loss = self._forward()
loss = {}
loss.update(rpn_loss)
loss.update(bbox_loss)
total_loss = paddle.add_n(list(loss.values()))
loss.update({'loss': total_loss})
return loss
def get_pred(self):
if self.use_extra_data:
bbox_pred, bbox_num, extra_data = self._forward()
output = {
'bbox': bbox_pred,
'bbox_num': bbox_num,
'extra_data': extra_data
}
else:
bbox_pred, bbox_num = self._forward()
output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
return output
def target_bbox_forward(self, data):
body_feats = self.backbone(data)
if self.neck is not None:
body_feats = self.neck(body_feats)
rois = [roi for roi in data['gt_bbox']]
rois_num = paddle.concat([paddle.shape(roi)[0:1] for roi in rois])
preds, _ = self.bbox_head(body_feats, rois, rois_num, None, cot=True)
return preds
def relationship_learning(self, loader, num_classes_novel):
print('computing relationship')
train_labels_list = []
label_list = []
for step_id, data in enumerate(loader):
_, bbox_prob = self.target_bbox_forward(data)
batch_size = data['im_id'].shape[0]
for i in range(batch_size):
num_bbox = data['gt_class'][i].shape[0]
train_labels = data['gt_class'][i]
train_labels_list.append(train_labels.numpy().squeeze(1))
base_labels = bbox_prob.detach().numpy()[:, :-1]
label_list.append(base_labels)
labels = np.concatenate(train_labels_list, 0)
probabilities = np.concatenate(label_list, 0)
N_t = np.max(labels) + 1
conditional = []
for i in range(N_t):
this_class = probabilities[labels == i]
average = np.mean(this_class, axis=0, keepdims=True)
conditional.append(average)
return np.concatenate(conditional)
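# Shape sketch of the returned matrix (inferred from the loop above): one row
# per novel class, one column per base class; entry (i, j) is the mean
# predicted base-class-j probability over all boxes labeled with novel class
# i. E.g. with 2 novel and 3 base classes the result is a (2, 3) array of
# conditional probabilities p(base_j | novel_i).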
================================================
FILE: ppdet/modeling/architectures/fcos.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['FCOS', 'ARSL_FCOS']
@register
class FCOS(BaseArch):
"""
FCOS network, see https://arxiv.org/abs/1904.01355
Args:
backbone (object): backbone instance
neck (object): 'FPN' instance
fcos_head (object): 'FCOSHead' instance
ssod_loss (object): 'SSODFCOSLoss' instance, only used for semi-det(ssod) by DenseTeacher
"""
__category__ = 'architecture'
__inject__ = ['ssod_loss']
def __init__(self,
backbone='ResNet',
neck='FPN',
fcos_head='FCOSHead',
ssod_loss='SSODFCOSLoss'):
super(FCOS, self).__init__()
self.backbone = backbone
self.neck = neck
self.fcos_head = fcos_head
# for ssod, semi-det
self.is_teacher = False
self.ssod_loss = ssod_loss
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
kwargs = {'input_shape': neck.out_shape}
fcos_head = create(cfg['fcos_head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"fcos_head": fcos_head,
}
def _forward(self):
body_feats = self.backbone(self.inputs)
fpn_feats = self.neck(body_feats)
self.is_teacher = self.inputs.get('is_teacher', False)
if self.training or self.is_teacher:
losses = self.fcos_head(fpn_feats, self.inputs)
return losses
else:
fcos_head_outs = self.fcos_head(fpn_feats)
bbox_pred, bbox_num = self.fcos_head.post_process(
fcos_head_outs, self.inputs['scale_factor'])
return {'bbox': bbox_pred, 'bbox_num': bbox_num}
def get_loss(self):
return self._forward()
def get_pred(self):
return self._forward()
def get_loss_keys(self):
return ['loss_cls', 'loss_box', 'loss_quality']
def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg):
ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs,
train_cfg)
return ssod_losses
@register
class ARSL_FCOS(BaseArch):
"""
FCOS ARSL network, see https://arxiv.org/abs/
Args:
backbone (object): backbone instance
neck (object): 'FPN' instance
fcos_head (object): 'FCOSHead_ARSL' instance
fcos_cr_loss (object): 'FCOSLossCR' instance, only used for semi-det(ssod) by ARSL
"""
__category__ = 'architecture'
__inject__ = ['fcos_cr_loss']
def __init__(self,
backbone,
neck,
fcos_head='FCOSHead_ARSL',
fcos_cr_loss='FCOSLossCR'):
super(ARSL_FCOS, self).__init__()
self.backbone = backbone
self.neck = neck
self.fcos_head = fcos_head
self.fcos_cr_loss = fcos_cr_loss
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
kwargs = {'input_shape': neck.out_shape}
fcos_head = create(cfg['fcos_head'], **kwargs)
# consistency regularization loss
fcos_cr_loss = create(cfg['fcos_cr_loss'])
return {
'backbone': backbone,
'neck': neck,
'fcos_head': fcos_head,
'fcos_cr_loss': fcos_cr_loss,
}
def forward(self, inputs, branch="supervised", teacher_prediction=None):
assert branch in ['supervised', 'semi_supervised'], \
'In ARSL, branch must be "supervised" or "semi_supervised".'
if self.data_format == 'NHWC':
image = inputs['image']
inputs['image'] = paddle.transpose(image, [0, 2, 3, 1])
self.inputs = inputs
if self.training:
if branch == "supervised":
out = self.get_loss()
else:
out = self.get_pseudo_loss(teacher_prediction)
else:
# norm test
if branch == "supervised":
out = self.get_pred()
# predict pseudo labels
else:
out = self.get_pseudo_pred()
return out
# model forward
def model_forward(self):
body_feats = self.backbone(self.inputs)
fpn_feats = self.neck(body_feats)
fcos_head_outs = self.fcos_head(fpn_feats)
return fcos_head_outs
# supervised loss for labeled data
def get_loss(self):
loss = {}
tag_labels, tag_bboxes, tag_centerness = [], [], []
for i in range(len(self.fcos_head.fpn_stride)):
# labels, reg_target, centerness
k_lbl = 'labels{}'.format(i)
if k_lbl in self.inputs:
tag_labels.append(self.inputs[k_lbl])
k_box = 'reg_target{}'.format(i)
if k_box in self.inputs:
tag_bboxes.append(self.inputs[k_box])
k_ctn = 'centerness{}'.format(i)
if k_ctn in self.inputs:
tag_centerness.append(self.inputs[k_ctn])
fcos_head_outs = self.model_forward()
loss_fcos = self.fcos_head.get_loss(fcos_head_outs, tag_labels,
tag_bboxes, tag_centerness)
loss.update(loss_fcos)
return loss
# unsupervised loss for unlabeled data
def get_pseudo_loss(self, teacher_prediction):
loss = {}
fcos_head_outs = self.model_forward()
unsup_loss = self.fcos_cr_loss(fcos_head_outs, teacher_prediction)
for k in unsup_loss.keys():
loss[k + '_pseudo'] = unsup_loss[k]
return loss
# get detection results for test, decode and rescale the results to original size
def get_pred(self):
fcos_head_outs = self.model_forward()
scale_factor = self.inputs['scale_factor']
bbox_pred, bbox_num = self.fcos_head.post_process(fcos_head_outs,
scale_factor)
output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
return output
# generate pseudo labels to guide student
def get_pseudo_pred(self):
fcos_head_outs = self.model_forward()
pred_cls, pred_loc, pred_iou = fcos_head_outs[1:] # 0 is locations
for lvl, _ in enumerate(pred_loc):
pred_loc[lvl] = pred_loc[lvl] / self.fcos_head.fpn_stride[lvl]
return [pred_cls, pred_loc, pred_iou, self.fcos_head.fpn_stride]
================================================
FILE: ppdet/modeling/architectures/gfl.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['GFL']
@register
class GFL(BaseArch):
"""
Generalized Focal Loss network, see https://arxiv.org/abs/2006.04388
Args:
backbone (object): backbone instance
neck (object): 'FPN' instance
head (object): 'GFLHead' instance
"""
__category__ = 'architecture'
def __init__(self, backbone, neck, head='GFLHead'):
super(GFL, self).__init__()
self.backbone = backbone
self.neck = neck
self.head = head
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
kwargs = {'input_shape': neck.out_shape}
head = create(cfg['head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"head": head,
}
def _forward(self):
body_feats = self.backbone(self.inputs)
fpn_feats = self.neck(body_feats)
head_outs = self.head(fpn_feats)
if not self.training:
im_shape = self.inputs['im_shape']
scale_factor = self.inputs['scale_factor']
bboxes, bbox_num = self.head.post_process(head_outs, im_shape,
scale_factor)
return bboxes, bbox_num
else:
return head_outs
def get_loss(self, ):
loss = {}
head_outs = self._forward()
loss_gfl = self.head.get_loss(head_outs, self.inputs)
loss.update(loss_gfl)
total_loss = paddle.add_n(list(loss.values()))
loss.update({'loss': total_loss})
return loss
def get_pred(self):
bbox_pred, bbox_num = self._forward()
output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
return output
================================================
FILE: ppdet/modeling/architectures/jde.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['JDE']
@register
class JDE(BaseArch):
__category__ = 'architecture'
__shared__ = ['metric']
"""
JDE network, see https://arxiv.org/abs/1909.12605v1
Args:
detector (object): detector model instance
reid (object): reid model instance
tracker (object): tracker instance
metric (str): 'MOTDet' for training and detection evaluation, 'ReID'
for ReID embedding evaluation, or 'MOT' for multi object tracking
evaluation.
"""
def __init__(self,
detector='YOLOv3',
reid='JDEEmbeddingHead',
tracker='JDETracker',
metric='MOT'):
super(JDE, self).__init__()
self.detector = detector
self.reid = reid
self.tracker = tracker
self.metric = metric
@classmethod
def from_config(cls, cfg, *args, **kwargs):
detector = create(cfg['detector'])
kwargs = {'input_shape': detector.neck.out_shape}
reid = create(cfg['reid'], **kwargs)
tracker = create(cfg['tracker'])
return {
"detector": detector,
"reid": reid,
"tracker": tracker,
}
def _forward(self):
det_outs = self.detector(self.inputs)
if self.training:
emb_feats = det_outs['emb_feats']
loss_confs = det_outs['det_losses']['loss_confs']
loss_boxes = det_outs['det_losses']['loss_boxes']
jde_losses = self.reid(
emb_feats,
self.inputs,
loss_confs=loss_confs,
loss_boxes=loss_boxes)
return jde_losses
else:
if self.metric == 'MOTDet':
det_results = {
'bbox': det_outs['bbox'],
'bbox_num': det_outs['bbox_num'],
}
return det_results
elif self.metric == 'MOT':
emb_feats = det_outs['emb_feats']
bboxes = det_outs['bbox']
boxes_idx = det_outs['boxes_idx']
nms_keep_idx = det_outs['nms_keep_idx']
pred_dets, pred_embs = self.reid(
emb_feats,
self.inputs,
bboxes=bboxes,
boxes_idx=boxes_idx,
nms_keep_idx=nms_keep_idx)
return pred_dets, pred_embs
else:
raise ValueError("Unknown metric {} for multi object tracking.".
format(self.metric))
def get_loss(self):
return self._forward()
def get_pred(self):
return self._forward()
================================================
FILE: ppdet/modeling/architectures/keypoint_hrhrnet.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from scipy.optimize import linear_sum_assignment
from collections import abc, defaultdict
import numpy as np
import paddle
from ppdet.core.workspace import register, create, serializable
from .meta_arch import BaseArch
from .. import layers as L
from ..keypoint_utils import transpred
__all__ = ['HigherHRNet']
@register
class HigherHRNet(BaseArch):
__category__ = 'architecture'
def __init__(self,
backbone='HRNet',
hrhrnet_head='HrHRNetHead',
post_process='HrHRNetPostProcess',
eval_flip=True,
flip_perm=None,
max_num_people=30):
"""
HigherHRNet network, see https://arxiv.org/abs/1908.10357;
HigherHRNet+swahr, see https://arxiv.org/abs/2012.15175
Args:
backbone (nn.Layer): backbone instance
hrhrnet_head (nn.Layer): keypoint_head instance
bbox_post_process (object): `BBoxPostProcess` instance
"""
super(HigherHRNet, self).__init__()
self.backbone = backbone
self.hrhrnet_head = hrhrnet_head
self.post_process = post_process
self.flip = eval_flip
self.flip_perm = paddle.to_tensor(flip_perm)
self.deploy = False
self.interpolate = L.Upsample(2, mode='bilinear')
self.pool = L.MaxPool(5, 1, 2)
self.max_num_people = max_num_people
@classmethod
def from_config(cls, cfg, *args, **kwargs):
# backbone
backbone = create(cfg['backbone'])
# head
kwargs = {'input_shape': backbone.out_shape}
hrhrnet_head = create(cfg['hrhrnet_head'], **kwargs)
post_process = create(cfg['post_process'])
return {
'backbone': backbone,
"hrhrnet_head": hrhrnet_head,
"post_process": post_process,
}
def _forward(self):
if self.flip and not self.training and not self.deploy:
self.inputs['image'] = paddle.concat(
(self.inputs['image'], paddle.flip(self.inputs['image'], [3])))
body_feats = self.backbone(self.inputs)
if self.training:
return self.hrhrnet_head(body_feats, self.inputs)
else:
outputs = self.hrhrnet_head(body_feats)
if self.flip and not self.deploy:
outputs = [paddle.split(o, 2) for o in outputs]
output_rflip = [
paddle.flip(paddle.gather(o[1], self.flip_perm, 1), [3])
for o in outputs
]
output1 = [o[0] for o in outputs]
heatmap = (output1[0] + output_rflip[0]) / 2.
tagmaps = [output1[1], output_rflip[1]]
outputs = [heatmap] + tagmaps
outputs = self.get_topk(outputs)
if self.deploy:
return outputs
res_lst = []
h = self.inputs['im_shape'][0, 0].numpy().item()
w = self.inputs['im_shape'][0, 1].numpy().item()
kpts, scores = self.post_process(*outputs, h, w)
res_lst.append([kpts, scores])
return res_lst
def get_loss(self):
return self._forward()
def get_pred(self):
outputs = {}
res_lst = self._forward()
outputs['keypoint'] = res_lst
return outputs
def get_topk(self, outputs):
# resize to image size
outputs = [self.interpolate(x) for x in outputs]
if len(outputs) == 3:
tagmap = paddle.concat(
(outputs[1].unsqueeze(4), outputs[2].unsqueeze(4)), axis=4)
else:
tagmap = outputs[1].unsqueeze(4)
heatmap = outputs[0]
N, J = 1, self.hrhrnet_head.num_joints
heatmap_maxpool = self.pool(heatmap)
# topk
maxmap = heatmap * (heatmap == heatmap_maxpool)
maxmap = maxmap.reshape([N, J, -1])
heat_k, inds_k = maxmap.topk(self.max_num_people, axis=2)
outputs = [heatmap, tagmap, heat_k, inds_k]
return outputs
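# Peak picking above is a max-pool trick: a pixel survives
# `heatmap == heatmap_maxpool` only if it equals the maximum of its 5x5
# neighborhood, i.e. it is a local maximum. 1-D analogue (hypothetical values):
#   heatmap = [0.1, 0.9, 0.3], maxpool = [0.9, 0.9, 0.9]
#   heatmap * (heatmap == maxpool) -> [0.0, 0.9, 0.0]   # only the peak remains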
@register
@serializable
class HrHRNetPostProcess(object):
'''
HrHRNet postprocess contains:
1) get the topk keypoints from the output heatmap
2) sample the tagmap value at each of the topk coordinates
3) group the joints into people with the Hungarian algorithm
4) adjust each coordinate by +-0.25 to reduce the error std
5) salvage missing joints by checking the positivity of heatmap - tagdiff_norm
Args:
max_num_people (int): maximum number of people supported in postprocess
heat_thresh (float): topk values below this threshold are ignored
tag_thresh (float): tag distances below this threshold assign a coordinate to an existing person cluster
inputs (list[heatmap]): the output list of the model, [heatmap, heatmap_maxpool, tagmap]; heatmap_maxpool is used to get the topk
original_height, original_width (float): the original image size
'''
def __init__(self, max_num_people=30, heat_thresh=0.1, tag_thresh=1.):
self.max_num_people = max_num_people
self.heat_thresh = heat_thresh
self.tag_thresh = tag_thresh
def lerp(self, j, y, x, heatmap):
H, W = heatmap.shape[-2:]
left = np.clip(x - 1, 0, W - 1)
right = np.clip(x + 1, 0, W - 1)
up = np.clip(y - 1, 0, H - 1)
down = np.clip(y + 1, 0, H - 1)
offset_y = np.where(heatmap[j, down, x] > heatmap[j, up, x], 0.25,
-0.25)
offset_x = np.where(heatmap[j, y, right] > heatmap[j, y, left], 0.25,
-0.25)
return offset_y + 0.5, offset_x + 0.5
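# lerp() above applies the standard quarter-offset refinement: the integer
# peak is nudged 0.25 px toward the larger neighbor on each axis, plus 0.5 to
# move from the pixel index to the pixel center. E.g. if
# heatmap[j, y+1, x] > heatmap[j, y-1, x], the refined y is y + 0.25 + 0.5.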
def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height,
original_width):
N, J, H, W = heatmap.shape
assert N == 1, "only batch size 1 is supported"
heatmap = heatmap[0].cpu().detach().numpy()
tagmap = tagmap[0].cpu().detach().numpy()
heats = heat_k[0].cpu().detach().numpy()
inds_np = inds_k[0].cpu().detach().numpy()
y = inds_np // W
x = inds_np % W
tags = tagmap[np.arange(J)[None, :].repeat(self.max_num_people),
y.flatten(), x.flatten()].reshape(J, -1, tagmap.shape[-1])
coords = np.stack((y, x), axis=2)
# threshold
mask = heats > self.heat_thresh
# cluster
cluster = defaultdict(lambda: {
'coords': np.zeros((J, 2), dtype=np.float32),
'scores': np.zeros(J, dtype=np.float32),
'tags': []
})
for jid, m in enumerate(mask):
num_valid = m.sum()
if num_valid == 0:
continue
valid_inds = np.where(m)[0]
valid_tags = tags[jid, m, :]
if len(cluster) == 0: # initialize
for i in valid_inds:
tag = tags[jid, i]
key = tag[0]
cluster[key]['tags'].append(tag)
cluster[key]['scores'][jid] = heats[jid, i]
cluster[key]['coords'][jid] = coords[jid, i]
continue
candidates = list(cluster.keys())[:self.max_num_people]
centroids = [
np.mean(
cluster[k]['tags'], axis=0) for k in candidates
]
num_clusters = len(centroids)
# shape is (num_valid, num_clusters, tag_dim)
dist = valid_tags[:, None, :] - np.array(centroids)[None, ...]
l2_dist = np.linalg.norm(dist, ord=2, axis=2)
# modulate dist with heat value, see `use_detection_val`
cost = np.round(l2_dist) * 100 - heats[jid, m, None]
# pad the cost matrix, otherwise new poses are ignored
if num_valid > num_clusters:
cost = np.pad(cost, ((0, 0), (0, num_valid - num_clusters)),
'constant',
constant_values=((0, 0), (0, 1e-10)))
rows, cols = linear_sum_assignment(cost)
for y, x in zip(rows, cols):
tag = tags[jid, y]
if y < num_valid and x < num_clusters and \
l2_dist[y, x] < self.tag_thresh:
key = candidates[x] # merge to cluster
else:
key = tag[0] # initialize new cluster
cluster[key]['tags'].append(tag)
cluster[key]['scores'][jid] = heats[jid, y]
cluster[key]['coords'][jid] = coords[jid, y]
# shape is [k, J, 2] and [k, J]
pose_tags = np.array([cluster[k]['tags'] for k in cluster])
pose_coords = np.array([cluster[k]['coords'] for k in cluster])
pose_scores = np.array([cluster[k]['scores'] for k in cluster])
valid = pose_scores > 0
pose_kpts = np.zeros((pose_scores.shape[0], J, 3), dtype=np.float32)
if valid.sum() == 0:
return pose_kpts, pose_kpts
# refine coords
valid_coords = pose_coords[valid].astype(np.int32)
y = valid_coords[..., 0].flatten()
x = valid_coords[..., 1].flatten()
_, j = np.nonzero(valid)
offsets = self.lerp(j, y, x, heatmap)
pose_coords[valid, 0] += offsets[0]
pose_coords[valid, 1] += offsets[1]
# mean score before salvage
mean_score = pose_scores.mean(axis=1)
pose_kpts[valid, 2] = pose_scores[valid]
# salvage missing joints
if True: # salvage is always enabled; kept as a block for readability
for pid, coords in enumerate(pose_coords):
tag_mean = np.array(pose_tags[pid]).mean(axis=0)
norm = np.sum((tagmap - tag_mean)**2, axis=3)**0.5
score = heatmap - np.round(norm) # (J, H, W)
flat_score = score.reshape(J, -1)
max_inds = np.argmax(flat_score, axis=1)
max_scores = np.max(flat_score, axis=1)
salvage_joints = (pose_scores[pid] == 0) & (max_scores > 0)
if salvage_joints.sum() == 0:
continue
y = max_inds[salvage_joints] // W
x = max_inds[salvage_joints] % W
offsets = self.lerp(salvage_joints.nonzero()[0], y, x, heatmap)
y = y.astype(np.float32) + offsets[0]
x = x.astype(np.float32) + offsets[1]
pose_coords[pid][salvage_joints, 0] = y
pose_coords[pid][salvage_joints, 1] = x
pose_kpts[pid][salvage_joints, 2] = max_scores[salvage_joints]
pose_kpts[..., :2] = transpred(pose_coords[..., :2][..., ::-1],
original_height, original_width,
min(H, W))
return pose_kpts, mean_score
================================================
FILE: ppdet/modeling/architectures/keypoint_hrnet.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import numpy as np
import math
import cv2
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
from ..keypoint_utils import transform_preds
from .. import layers as L
from paddle.nn import functional as F
__all__ = ['TopDownHRNet', 'TinyPose3DHRNet', 'TinyPose3DHRHeatmapNet']
@register
class TopDownHRNet(BaseArch):
__category__ = 'architecture'
__inject__ = ['loss']
def __init__(self,
width,
num_joints,
backbone='HRNet',
loss='KeyPointMSELoss',
post_process='HRNetPostProcess',
flip_perm=None,
flip=True,
shift_heatmap=True,
use_dark=True):
"""
HRNet network, see https://arxiv.org/abs/1902.09212
Args:
backbone (nn.Layer): backbone instance
post_process (object): `HRNetPostProcess` instance
flip_perm (list): The left-right joints exchange order list
use_dark(bool): Whether to use DARK in post processing
"""
super(TopDownHRNet, self).__init__()
self.backbone = backbone
self.post_process = HRNetPostProcess(use_dark)
self.loss = loss
self.flip_perm = flip_perm
self.flip = flip
self.final_conv = L.Conv2d(width, num_joints, 1, 1, 0, bias=True)
self.shift_heatmap = shift_heatmap
self.deploy = False
@classmethod
def from_config(cls, cfg, *args, **kwargs):
# backbone
backbone = create(cfg['backbone'])
return {'backbone': backbone, }
def _forward(self):
feats = self.backbone(self.inputs)
hrnet_outputs = self.final_conv(feats[0])
if self.training:
return self.loss(hrnet_outputs, self.inputs)
elif self.deploy:
outshape = hrnet_outputs.shape
max_idx = paddle.argmax(
hrnet_outputs.reshape(
(outshape[0], outshape[1], outshape[2] * outshape[3])),
axis=-1)
return hrnet_outputs, max_idx
else:
if self.flip:
self.inputs['image'] = self.inputs['image'].flip([3])
feats = self.backbone(self.inputs)
output_flipped = self.final_conv(feats[0])
output_flipped = self.flip_back(output_flipped.numpy(),
self.flip_perm)
output_flipped = paddle.to_tensor(output_flipped.copy())
if self.shift_heatmap:
output_flipped[:, :, :, 1:] = output_flipped.clone(
)[:, :, :, 0:-1]
hrnet_outputs = (hrnet_outputs + output_flipped) * 0.5
imshape = (self.inputs['im_shape'].numpy()
)[:, ::-1] if 'im_shape' in self.inputs else None
center = self.inputs['center'].numpy(
) if 'center' in self.inputs else np.round(imshape / 2.)
scale = self.inputs['scale'].numpy(
) if 'scale' in self.inputs else imshape / 200.
outputs = self.post_process(hrnet_outputs, center, scale)
return outputs
def get_loss(self):
return self._forward()
def get_pred(self):
res_lst = self._forward()
outputs = {'keypoint': res_lst}
return outputs
def flip_back(self, output_flipped, matched_parts):
assert output_flipped.ndim == 4,\
'output_flipped should be [batch_size, num_joints, height, width]'
output_flipped = output_flipped[:, :, :, ::-1]
for pair in matched_parts:
tmp = output_flipped[:, pair[0], :, :].copy()
output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
output_flipped[:, pair[1], :, :] = tmp
return output_flipped
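# Sketch of flip_perm (illustrative COCO-style pairs, not from this file):
# each entry names a left/right joint pair to exchange after the horizontal
# flip, e.g.
#   flip_perm = [[1, 2], [3, 4], [5, 6]]   # (l_eye, r_eye), (l_ear, r_ear), ...
# flip_back() mirrors the heatmaps along the width axis and then swaps the
# channels of every matched pair so joint semantics stay correct.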
class HRNetPostProcess(object):
def __init__(self, use_dark=True):
self.use_dark = use_dark
def get_max_preds(self, heatmaps):
'''get predictions from score maps
Args:
heatmaps: numpy.ndarray([batch_size, num_joints, height, width])
Returns:
preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords
maxvals: numpy.ndarray([batch_size, num_joints, 2]), the maximum confidence of the keypoints
'''
assert isinstance(heatmaps,
np.ndarray), 'heatmaps should be numpy.ndarray'
assert heatmaps.ndim == 4, 'batch_images should be 4-ndim'
batch_size = heatmaps.shape[0]
num_joints = heatmaps.shape[1]
width = heatmaps.shape[3]
heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1))
idx = np.argmax(heatmaps_reshaped, 2)
maxvals = np.amax(heatmaps_reshaped, 2)
maxvals = maxvals.reshape((batch_size, num_joints, 1))
idx = idx.reshape((batch_size, num_joints, 1))
preds = np.tile(idx, (1, 1, 2)).astype(np.float32)
preds[:, :, 0] = (preds[:, :, 0]) % width
preds[:, :, 1] = np.floor((preds[:, :, 1]) / width)
pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2))
pred_mask = pred_mask.astype(np.float32)
preds *= pred_mask
return preds, maxvals
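# Decoding above: the argmax over the flattened H*W map gives a flat index
# that is unravelled as x = idx % width, y = idx // width. E.g. with
# width = 4 and idx = 6, the peak sits at (x, y) = (2, 1).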
def gaussian_blur(self, heatmap, kernel):
border = (kernel - 1) // 2
batch_size = heatmap.shape[0]
num_joints = heatmap.shape[1]
height = heatmap.shape[2]
width = heatmap.shape[3]
for i in range(batch_size):
for j in range(num_joints):
origin_max = np.max(heatmap[i, j])
dr = np.zeros((height + 2 * border, width + 2 * border))
dr[border:-border, border:-border] = heatmap[i, j].copy()
dr = cv2.GaussianBlur(dr, (kernel, kernel), 0)
heatmap[i, j] = dr[border:-border, border:-border].copy()
heatmap[i, j] *= origin_max / np.max(heatmap[i, j])
return heatmap
def dark_parse(self, hm, coord):
heatmap_height = hm.shape[0]
heatmap_width = hm.shape[1]
px = int(coord[0])
py = int(coord[1])
if 1 < px < heatmap_width - 2 and 1 < py < heatmap_height - 2:
dx = 0.5 * (hm[py][px + 1] - hm[py][px - 1])
dy = 0.5 * (hm[py + 1][px] - hm[py - 1][px])
dxx = 0.25 * (hm[py][px + 2] - 2 * hm[py][px] + hm[py][px - 2])
dxy = 0.25 * (hm[py+1][px+1] - hm[py-1][px+1] - hm[py+1][px-1] \
+ hm[py-1][px-1])
dyy = 0.25 * (hm[py + 2][px] - 2 * hm[py][px] + hm[py - 2][px])
derivative = np.matrix([[dx], [dy]])
hessian = np.matrix([[dxx, dxy], [dxy, dyy]])
if dxx * dyy - dxy**2 != 0:
hessianinv = hessian.I
offset = -hessianinv * derivative
offset = np.squeeze(np.array(offset.T), axis=0)
coord += offset
return coord
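# dark_parse() above is one Newton step on the log-heatmap: with gradient
# g = [dx, dy] and Hessian H = [[dxx, dxy], [dxy, dyy]] estimated by finite
# differences, the sub-pixel refinement is offset = -H^{-1} g, applied only
# when det(H) = dxx * dyy - dxy^2 != 0 (checked above).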
def dark_postprocess(self, hm, coords, kernelsize):
'''DARK postprocessing, Zhang et al. Distribution-Aware Coordinate
Representation for Human Pose Estimation (CVPR 2020).
'''
hm = self.gaussian_blur(hm, kernelsize)
hm = np.maximum(hm, 1e-10)
hm = np.log(hm)
for n in range(coords.shape[0]):
for p in range(coords.shape[1]):
coords[n, p] = self.dark_parse(hm[n][p], coords[n][p])
return coords
def get_final_preds(self, heatmaps, center, scale, kernelsize=3):
"""the highest heatvalue location with a quarter offset in the
direction from the highest response to the second highest response.
Args:
heatmaps (numpy.ndarray): The predicted heatmaps
center (numpy.ndarray): The boxes center
scale (numpy.ndarray): The scale factor
Returns:
preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords
maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints
"""
coords, maxvals = self.get_max_preds(heatmaps)
heatmap_height = heatmaps.shape[2]
heatmap_width = heatmaps.shape[3]
if self.use_dark:
coords = self.dark_postprocess(heatmaps, coords, kernelsize)
else:
for n in range(coords.shape[0]):
for p in range(coords.shape[1]):
hm = heatmaps[n][p]
px = int(math.floor(coords[n][p][0] + 0.5))
py = int(math.floor(coords[n][p][1] + 0.5))
if 1 < px < heatmap_width - 1 and 1 < py < heatmap_height - 1:
diff = np.array([
hm[py][px + 1] - hm[py][px - 1],
hm[py + 1][px] - hm[py - 1][px]
])
coords[n][p] += np.sign(diff) * .25
preds = coords.copy()
# Transform back
for i in range(coords.shape[0]):
preds[i] = transform_preds(coords[i], center[i], scale[i],
[heatmap_width, heatmap_height])
return preds, maxvals
def __call__(self, output, center, scale):
preds, maxvals = self.get_final_preds(output.numpy(), center, scale)
outputs = [[
np.concatenate(
(preds, maxvals), axis=-1), np.mean(
maxvals, axis=1)
]]
return outputs
class TinyPose3DPostProcess(object):
def __init__(self):
pass
def __call__(self, output, center, scale):
"""
Args:
output (numpy.ndarray): numpy.ndarray([batch_size, num_joints, 3]), keypoints coords
scale (numpy.ndarray): The scale factor
Returns:
preds: numpy.ndarray([batch_size, num_joints, 3]), keypoints coords
"""
preds = output.numpy().copy()
# Transform back
for i in range(output.shape[0]): # batch_size
preds[i][:, 0] = preds[i][:, 0] * scale[i][0]
preds[i][:, 1] = preds[i][:, 1] * scale[i][1]
return preds
def soft_argmax(heatmaps, joint_num):
dims = heatmaps.shape
depth_dim = dims[1] // joint_num
heatmaps = heatmaps.reshape((-1, joint_num, depth_dim * dims[2] * dims[3]))
heatmaps = F.softmax(heatmaps, 2)
heatmaps = heatmaps.reshape((-1, joint_num, depth_dim, dims[2], dims[3]))
accu_x = heatmaps.sum(axis=(2, 3))
accu_y = heatmaps.sum(axis=(2, 4))
accu_z = heatmaps.sum(axis=(3, 4))
accu_x = accu_x * paddle.arange(1, 33)
accu_y = accu_y * paddle.arange(1, 33)
accu_z = accu_z * paddle.arange(1, 33)
accu_x = accu_x.sum(axis=2, keepdim=True) - 1
accu_y = accu_y.sum(axis=2, keepdim=True) - 1
accu_z = accu_z.sum(axis=2, keepdim=True) - 1
coord_out = paddle.concat(
(accu_x, accu_y, accu_z), axis=2) # [batch_size, joint_num, 3]
return coord_out
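# soft_argmax above computes expected coordinates: softmax turns each joint's
# 3-D heatmap into a probability volume, and summing index * probability gives
# a differentiable argmax. 1-D analogue with probabilities [0.1, 0.2, 0.7]
# over positions 1..3: expectation = 1*0.1 + 2*0.2 + 3*0.7 = 2.6, minus 1 for
# 0-based indexing -> 1.6. Note the hard-coded paddle.arange(1, 33) assumes
# depth_dim == 32 and 32-pixel spatial maps.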
@register
class TinyPose3DHRHeatmapNet(BaseArch):
__category__ = 'architecture'
__inject__ = ['loss']
def __init__(
self,
width, # 40, number of channels output by the backbone
num_joints,
backbone='HRNet',
loss='KeyPointRegressionMSELoss',
post_process=TinyPose3DPostProcess):
"""
Args:
backbone (nn.Layer): backbone instance
post_process (object): post process instance
"""
super(TinyPose3DHRHeatmapNet, self).__init__()
self.backbone = backbone
self.post_process = TinyPose3DPostProcess()
self.loss = loss
self.deploy = False
self.num_joints = num_joints
self.final_conv = L.Conv2d(width, num_joints * 32, 1, 1, 0, bias=True)
@classmethod
def from_config(cls, cfg, *args, **kwargs):
# backbone
backbone = create(cfg['backbone'])
return {'backbone': backbone, }
def _forward(self):
feats = self.backbone(self.inputs) # feats:[[batch_size, 40, 32, 24]]
hrnet_outputs = self.final_conv(feats[0])
res = soft_argmax(hrnet_outputs, self.num_joints)
return res
def get_loss(self):
pose3d = self._forward()
loss = self.loss(pose3d, None, self.inputs)
outputs = {'loss': loss}
return outputs
def get_pred(self):
res_lst = self._forward()
outputs = {'pose3d': res_lst}
return outputs
def flip_back(self, output_flipped, matched_parts):
assert output_flipped.ndim == 4,\
'output_flipped should be [batch_size, num_joints, height, width]'
output_flipped = output_flipped[:, :, :, ::-1]
for pair in matched_parts:
tmp = output_flipped[:, pair[0], :, :].copy()
output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
output_flipped[:, pair[1], :, :] = tmp
return output_flipped
@register
class TinyPose3DHRNet(BaseArch):
__category__ = 'architecture'
__inject__ = ['loss']
def __init__(self,
width,
num_joints,
fc_channel=768,
backbone='HRNet',
loss='KeyPointRegressionMSELoss',
post_process=TinyPose3DPostProcess):
"""
Args:
backbone (nn.Layer): backbone instance
post_process (object): post process instance
"""
super(TinyPose3DHRNet, self).__init__()
self.backbone = backbone
self.post_process = TinyPose3DPostProcess()
self.loss = loss
self.deploy = False
self.num_joints = num_joints
self.final_conv = L.Conv2d(width, num_joints, 1, 1, 0, bias=True)
self.flatten = paddle.nn.Flatten(start_axis=2, stop_axis=3)
self.fc1 = paddle.nn.Linear(fc_channel, 256)
self.act1 = paddle.nn.ReLU()
self.fc2 = paddle.nn.Linear(256, 64)
self.act2 = paddle.nn.ReLU()
self.fc3 = paddle.nn.Linear(64, 3)
@classmethod
def from_config(cls, cfg, *args, **kwargs):
# backbone
backbone = create(cfg['backbone'])
return {'backbone': backbone, }
def _forward(self):
'''
self.inputs is a dict
'''
feats = self.backbone(
self.inputs) # feats:[[batch_size, 40, width/4, height/4]]
hrnet_outputs = self.final_conv(
feats[0]) # hrnet_outputs: [batch_size, num_joints, H/4, W/4]
flatten_res = self.flatten(
hrnet_outputs) # [batch_size, num_joints, H/4 * W/4] (= fc_channel)
res = self.fc1(flatten_res)
res = self.act1(res)
res = self.fc2(res)
res = self.act2(res)
res = self.fc3(res)
if self.training:
return self.loss(res, self.inputs)
else: # needed for model export
return res
def get_loss(self):
return self._forward()
def get_pred(self):
res_lst = self._forward()
outputs = {'pose3d': res_lst}
return outputs
def flip_back(self, output_flipped, matched_parts):
assert output_flipped.ndim == 4,\
'output_flipped should be [batch_size, num_joints, height, width]'
output_flipped = output_flipped[:, :, :, ::-1]
for pair in matched_parts:
tmp = output_flipped[:, pair[0], :, :].copy()
output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
output_flipped[:, pair[1], :, :] = tmp
return output_flipped
================================================
FILE: ppdet/modeling/architectures/keypoint_petr.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
this code is based on https://github.com/hikvision-research/opera/blob/main/opera/models/detectors/petr.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
from ppdet.core.workspace import register
from .meta_arch import BaseArch
from .. import layers as L
__all__ = ['PETR']
@register
class PETR(BaseArch):
__category__ = 'architecture'
__inject__ = ['backbone', 'neck', 'bbox_head']
def __init__(self,
backbone='ResNet',
neck='ChannelMapper',
bbox_head='PETRHead'):
"""
PETR, see https://openaccess.thecvf.com/content/CVPR2022/papers/Shi_End-to-End_Multi-Person_Pose_Estimation_With_Transformers_CVPR_2022_paper.pdf
Args:
backbone (nn.Layer): backbone instance
neck (nn.Layer): neck between backbone and head
bbox_head (nn.Layer): model output and loss
"""
super(PETR, self).__init__()
self.backbone = backbone
if neck is not None:
self.with_neck = True
self.neck = neck
self.bbox_head = bbox_head
self.deploy = False
def extract_feat(self, img):
"""Directly extract features from the backbone+neck."""
x = self.backbone(img)
if self.with_neck:
x = self.neck(x)
return x
def get_inputs(self):
img_metas = []
gt_bboxes = []
gt_labels = []
gt_keypoints = []
gt_areas = []
pad_gt_mask = self.inputs['pad_gt_mask'].astype("bool").squeeze(-1)
for idx, im_shape in enumerate(self.inputs['im_shape']):
img_meta = {
'img_shape': im_shape.astype("int32").tolist() + [1, ],
'batch_input_shape': self.inputs['image'].shape[-2:],
'image_name': self.inputs['image_file'][idx]
}
img_metas.append(img_meta)
if (not pad_gt_mask[idx].any()):
gt_keypoints.append(self.inputs['gt_joints'][idx][:1])
gt_labels.append(self.inputs['gt_class'][idx][:1])
gt_bboxes.append(self.inputs['gt_bbox'][idx][:1])
gt_areas.append(self.inputs['gt_areas'][idx][:1])
continue
gt_keypoints.append(self.inputs['gt_joints'][idx][pad_gt_mask[idx]])
gt_labels.append(self.inputs['gt_class'][idx][pad_gt_mask[idx]])
gt_bboxes.append(self.inputs['gt_bbox'][idx][pad_gt_mask[idx]])
gt_areas.append(self.inputs['gt_areas'][idx][pad_gt_mask[idx]])
return img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas
def get_loss(self):
"""
Args:
img (Tensor): Input images of shape (N, C, H, W).
Typically these should be mean centered and std scaled.
img_metas (list[dict]): A List of image info dict where each dict
has: 'img_shape', 'scale_factor', 'flip', and may also contain
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
For details on the values of these keys see
:class:`mmdet.datasets.pipelines.Collect`.
gt_bboxes (list[Tensor]): Each item are the truth boxes for each
image in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): Class indices corresponding to each box.
gt_keypoints (list[Tensor]): Each item are the truth keypoints for
each image in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x,
p^{K}_y, p^{K}_v] format.
gt_areas (list[Tensor]): mask areas corresponding to each box.
gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
boxes can be ignored when computing the loss.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas = self.get_inputs(
)
gt_bboxes_ignore = getattr(self.inputs, 'gt_bboxes_ignore', None)
x = self.extract_feat(self.inputs)
losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes,
gt_labels, gt_keypoints, gt_areas,
gt_bboxes_ignore)
loss = 0
for k, v in losses.items():
loss += v
losses['loss'] = loss
return losses
def get_pred_numpy(self):
"""Used for computing network flops.
"""
img = self.inputs['image']
batch_size, _, height, width = img.shape
dummy_img_metas = [
dict(
batch_input_shape=(height, width),
img_shape=(height, width, 3),
scale_factor=(1., 1., 1., 1.)) for _ in range(batch_size)
]
x = self.extract_feat(img)
outs = self.bbox_head(x, img_metas=dummy_img_metas)
bbox_list = self.bbox_head.get_bboxes(
*outs, dummy_img_metas, rescale=True)
return bbox_list
def get_pred(self):
"""
"""
img = self.inputs['image']
batch_size, _, height, width = img.shape
img_metas = [
dict(
batch_input_shape=(height, width),
img_shape=(height, width, 3),
scale_factor=self.inputs['scale_factor'][i])
for i in range(batch_size)
]
kptpred = self.simple_test(
self.inputs, img_metas=img_metas, rescale=True)
keypoints = kptpred[0][1][0]
bboxs = kptpred[0][0][0]
keypoints[..., 2] = bboxs[:, None, 4]
res_lst = [[keypoints, bboxs[:, 4]]]
outputs = {'keypoint': res_lst}
return outputs
def simple_test(self, inputs, img_metas, rescale=False):
"""Test function without test time augmentation.
Args:
inputs (list[paddle.Tensor]): List of multiple images.
img_metas (list[dict]): List of image information.
rescale (bool, optional): Whether to rescale the results.
Defaults to False.
Returns:
list[list[np.ndarray]]: BBox and keypoint results of each image
and classes. The outer list corresponds to each image.
The inner list corresponds to each class.
"""
batch_size = len(img_metas)
assert batch_size == 1, 'Currently only batch_size 1 for inference ' \
f'mode is supported. Found batch_size {batch_size}.'
feat = self.extract_feat(inputs)
results_list = self.bbox_head.simple_test(
feat, img_metas, rescale=rescale)
bbox_kpt_results = [
self.bbox_kpt2result(det_bboxes, det_labels, det_kpts,
self.bbox_head.num_classes)
for det_bboxes, det_labels, det_kpts in results_list
]
return bbox_kpt_results
def bbox_kpt2result(self, bboxes, labels, kpts, num_classes):
"""Convert detection results to a list of numpy arrays.
Args:
bboxes (paddle.Tensor | np.ndarray): shape (n, 5).
labels (paddle.Tensor | np.ndarray): shape (n, ).
kpts (paddle.Tensor | np.ndarray): shape (n, K, 3).
num_classes (int): class number, including background class.
Returns:
list(ndarray): bbox and keypoint results of each class.
"""
if bboxes.shape[0] == 0:
return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)], \
[np.zeros((0, kpts.shape[1], 3), dtype=np.float32)
for i in range(num_classes)]
else:
if isinstance(bboxes, paddle.Tensor):
bboxes = bboxes.numpy()
labels = labels.numpy()
kpts = kpts.numpy()
return [bboxes[labels == i, :] for i in range(num_classes)], \
[kpts[labels == i, :, :] for i in range(num_classes)]
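# Illustrative split (hypothetical values): with labels = [0, 2, 0] and
# num_classes = 3, the return is a pair of per-class lists: class 0 gets two
# detections, class 1 an empty (0, 5) array, class 2 one detection; the
# keypoint arrays are split identically by label.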
================================================
FILE: ppdet/modeling/architectures/keypoint_vitpose.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import numpy as np
import math
import cv2
from ppdet.core.workspace import register, create, serializable
from .meta_arch import BaseArch
from ..keypoint_utils import transform_preds
from .. import layers as L
__all__ = ['VitPose_TopDown', 'VitPosePostProcess']
@register
class VitPose_TopDown(BaseArch):
__category__ = 'architecture'
__inject__ = ['loss']
def __init__(self, backbone, head, loss, post_process, flip_test):
"""
VitPose network, see https://arxiv.org/pdf/2204.12484v2.pdf
Args:
backbone (nn.Layer): backbone instance
post_process (object): `HRNetPostProcess` instance
"""
super(VitPose_TopDown, self).__init__()
self.backbone = backbone
self.head = head
self.loss = loss
self.post_process = post_process
self.flip_test = flip_test
@classmethod
def from_config(cls, cfg, *args, **kwargs):
# backbone
backbone = create(cfg['backbone'])
#head
head = create(cfg['head'])
#post_process
post_process = create(cfg['post_process'])
return {
'backbone': backbone,
'head': head,
'post_process': post_process
}
def _forward_train(self):
feats = self.backbone.forward_features(self.inputs['image'])
vitpost_output = self.head(feats)
return self.loss(vitpost_output, self.inputs)
def _forward_test(self):
feats = self.backbone.forward_features(self.inputs['image'])
output_heatmap = self.head(feats)
if self.flip_test:
img_flipped = self.inputs['image'].flip(3)
features_flipped = self.backbone.forward_features(img_flipped)
output_flipped_heatmap = self.head.inference_model(features_flipped,
self.flip_test)
output_heatmap = (output_heatmap + output_flipped_heatmap) * 0.5
imshape = (self.inputs['im_shape'].numpy()
)[:, ::-1] if 'im_shape' in self.inputs else None
center = self.inputs['center'].numpy(
) if 'center' in self.inputs else np.round(imshape / 2.)
scale = self.inputs['scale'].numpy(
) if 'scale' in self.inputs else imshape / 200.
result = self.post_process(output_heatmap.cpu().numpy(), center, scale)
return result
def get_loss(self):
return self._forward_train()
def get_pred(self):
res_lst = self._forward_test()
outputs = {'keypoint': res_lst}
return outputs
@register
@serializable
class VitPosePostProcess(object):
def __init__(self, use_dark=False):
self.use_dark = use_dark
def get_max_preds(self, heatmaps):
'''get predictions from score maps
Args:
heatmaps: numpy.ndarray([batch_size, num_joints, height, width])
Returns:
preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords
maxvals: numpy.ndarray([batch_size, num_joints, 2]), the maximum confidence of the keypoints
'''
assert isinstance(heatmaps,
np.ndarray), 'heatmaps should be numpy.ndarray'
assert heatmaps.ndim == 4, 'batch_images should be 4-ndim'
batch_size = heatmaps.shape[0]
num_joints = heatmaps.shape[1]
width = heatmaps.shape[3]
heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1))
idx = np.argmax(heatmaps_reshaped, 2)
maxvals = np.amax(heatmaps_reshaped, 2)
maxvals = maxvals.reshape((batch_size, num_joints, 1))
idx = idx.reshape((batch_size, num_joints, 1))
preds = np.tile(idx, (1, 1, 2)).astype(np.float32)
preds[:, :, 0] = (preds[:, :, 0]) % width
preds[:, :, 1] = np.floor(preds[:, :, 1] / width)
pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2))
pred_mask = pred_mask.astype(np.float32)
preds *= pred_mask
return preds, maxvals
def post_datk_udp(self, coords, batch_heatmaps, kernel=3):
"""DARK post-pocessing. Implemented by udp. Paper ref: Huang et al. The
Devil is in the Details: Delving into Unbiased Data Processing for Human
Pose Estimation (CVPR 2020). Zhang et al. Distribution-Aware Coordinate
Representation for Human Pose Estimation (CVPR 2020).
Note:
- batch size: B
- num keypoints: K
- num persons: N
- height of heatmaps: H
- width of heatmaps: W
B=1 for bottom_up paradigm where all persons share the same heatmap.
B=N for top_down paradigm where each person has its own heatmaps.
Args:
coords (np.ndarray[N, K, 2]): Initial coordinates of human pose.
batch_heatmaps (np.ndarray[B, K, H, W]): batch_heatmaps
kernel (int): Gaussian kernel size (K) for modulation.
Returns:
np.ndarray([N, K, 2]): Refined coordinates.
"""
if not isinstance(batch_heatmaps, np.ndarray):
batch_heatmaps = batch_heatmaps.cpu().numpy()
B, K, H, W = batch_heatmaps.shape
N = coords.shape[0]
assert (B == 1 or B == N)
for heatmaps in batch_heatmaps:
for heatmap in heatmaps:
cv2.GaussianBlur(heatmap, (kernel, kernel), 0, heatmap)
np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps)
np.log(batch_heatmaps, batch_heatmaps)
batch_heatmaps_pad = np.pad(batch_heatmaps, ((0, 0), (0, 0), (1, 1),
(1, 1)),
mode='edge').flatten()
index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (W + 2)
index += (W + 2) * (H + 2) * np.arange(0, B * K).reshape(-1, K)
index = index.astype(int).reshape(-1, 1)
i_ = batch_heatmaps_pad[index]
ix1 = batch_heatmaps_pad[index + 1]
iy1 = batch_heatmaps_pad[index + W + 2]
ix1y1 = batch_heatmaps_pad[index + W + 3]
ix1_y1_ = batch_heatmaps_pad[index - W - 3]
ix1_ = batch_heatmaps_pad[index - 1]
iy1_ = batch_heatmaps_pad[index - 2 - W]
dx = 0.5 * (ix1 - ix1_)
dy = 0.5 * (iy1 - iy1_)
derivative = np.concatenate([dx, dy], axis=1)
derivative = derivative.reshape(N, K, 2, 1)
dxx = ix1 - 2 * i_ + ix1_
dyy = iy1 - 2 * i_ + iy1_
dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_)
hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1)
hessian = hessian.reshape(N, K, 2, 2)
hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2))
coords -= np.einsum('ijmn,ijnk->ijmk', hessian, derivative).squeeze()
return coords
def transform_preds_udp(self,
coords,
center,
scale,
output_size,
use_udp=True):
"""Get final keypoint predictions from heatmaps and apply scaling and
translation to map them back to the image.
Note:
num_keypoints: K
Args:
coords (np.ndarray[K, ndims]):
* If ndims=2, coords are predicted keypoint locations.
* If ndims=4, coords are composed of (x, y, scores, tags)
* If ndims=5, coords are composed of (x, y, scores, tags,
flipped_tags)
center (np.ndarray[2, ]): Center of the bounding box (x, y).
scale (np.ndarray[2, ]): Scale of the bounding box
wrt [width, height].
output_size (np.ndarray[2, ] | list(2,)): Size of the
destination heatmaps.
use_udp (bool): Use unbiased data processing
Returns:
np.ndarray: Predicted coordinates in the images.
"""
assert coords.shape[1] in (2, 4, 5)
assert len(center) == 2
assert len(scale) == 2
assert len(output_size) == 2
# Recover the scale which is normalized by a factor of 200.
scale = scale * 200.0
if use_udp:
scale_x = scale[0] / (output_size[0] - 1.0)
scale_y = scale[1] / (output_size[1] - 1.0)
else:
scale_x = scale[0] / output_size[0]
scale_y = scale[1] / output_size[1]
target_coords = np.ones_like(coords)
target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[
0] * 0.5
target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[
1] * 0.5
return target_coords
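# Mapping sketch: scale is stored normalized by a factor of 200, so
# scale * 200 recovers the box size in pixels. With UDP the heatmap grid
# spans output_size - 1 intervals, hence the (output_size - 1.0) divisor;
# each heatmap coordinate is scaled into the box and shifted so the box is
# centered at `center`.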
def get_final_preds(self, heatmaps, center, scale, kernelsize=11):
"""the highest heatvalue location with a quarter offset in the
direction from the highest response to the second highest response.
Args:
heatmaps (numpy.ndarray): The predicted heatmaps
center (numpy.ndarray): The boxes center
scale (numpy.ndarray): The scale factor
Returns:
preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords
maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints
"""
coords, maxvals = self.get_max_preds(heatmaps)
N, K, H, W = heatmaps.shape
if self.use_dark:
coords = self.post_datk_udp(coords, heatmaps, kernelsize)
preds = coords.copy()
# Transform back to the image
for i in range(N):
preds[i] = self.transform_preds_udp(preds[i], center[i],
scale[i], [W, H])
else:
for n in range(coords.shape[0]):
for p in range(coords.shape[1]):
hm = heatmaps[n][p]
px = int(math.floor(coords[n][p][0] + 0.5))
py = int(math.floor(coords[n][p][1] + 0.5))
if 1 < px < W - 1 and 1 < py < H - 1:
diff = np.array([
hm[py][px + 1] - hm[py][px - 1],
hm[py + 1][px] - hm[py - 1][px]
])
coords[n][p] += np.sign(diff) * .25
preds = coords.copy()
# Transform back
for i in range(coords.shape[0]):
preds[i] = transform_preds(coords[i], center[i], scale[i],
[W, H])
return preds, maxvals
def __call__(self, output, center, scale):
preds, maxvals = self.get_final_preds(output, center, scale)
outputs = [[
np.concatenate((preds, maxvals), axis=-1),
np.mean(maxvals, axis=1)
]]
return outputs
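# Output layout produced above: outputs[0][0] has shape (N, K, 3) holding
# per-keypoint (x, y, score), and outputs[0][1] has shape (N, 1) holding the
# mean keypoint score per instance.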
================================================
FILE: ppdet/modeling/architectures/mask_rcnn.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['MaskRCNN']
@register
class MaskRCNN(BaseArch):
"""
Mask R-CNN network, see https://arxiv.org/abs/1703.06870
Args:
backbone (object): backbone instance
rpn_head (object): `RPNHead` instance
bbox_head (object): `BBoxHead` instance
mask_head (object): `MaskHead` instance
bbox_post_process (object): `BBoxPostProcess` instance
mask_post_process (object): `MaskPostProcess` instance
neck (object): 'FPN' instance
"""
__category__ = 'architecture'
__inject__ = [
'bbox_post_process',
'mask_post_process',
]
def __init__(self,
backbone,
rpn_head,
bbox_head,
mask_head,
bbox_post_process,
mask_post_process,
neck=None):
super(MaskRCNN, self).__init__()
self.backbone = backbone
self.neck = neck
self.rpn_head = rpn_head
self.bbox_head = bbox_head
self.mask_head = mask_head
self.bbox_post_process = bbox_post_process
self.mask_post_process = mask_post_process
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = cfg['neck'] and create(cfg['neck'], **kwargs)
out_shape = neck and neck.out_shape or backbone.out_shape
kwargs = {'input_shape': out_shape}
rpn_head = create(cfg['rpn_head'], **kwargs)
bbox_head = create(cfg['bbox_head'], **kwargs)
out_shape = neck and out_shape or bbox_head.get_head().out_shape
kwargs = {'input_shape': out_shape}
mask_head = create(cfg['mask_head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"rpn_head": rpn_head,
"bbox_head": bbox_head,
"mask_head": mask_head,
}
def _forward(self):
body_feats = self.backbone(self.inputs)
if self.neck is not None:
body_feats = self.neck(body_feats)
if self.training:
rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs)
bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num,
self.inputs)
rois, rois_num = self.bbox_head.get_assigned_rois()
bbox_targets = self.bbox_head.get_assigned_targets()
# Mask Head needs bbox_feat in Mask RCNN
mask_loss = self.mask_head(body_feats, rois, rois_num, self.inputs,
bbox_targets, bbox_feat)
return rpn_loss, bbox_loss, mask_loss
else:
rois, rois_num, _ = self.rpn_head(body_feats, self.inputs)
preds, feat_func = self.bbox_head(body_feats, rois, rois_num, None)
im_shape = self.inputs['im_shape']
scale_factor = self.inputs['scale_factor']
bbox, bbox_num, nms_keep_idx = self.bbox_post_process(
preds, (rois, rois_num), im_shape, scale_factor)
mask_out = self.mask_head(
body_feats, bbox, bbox_num, self.inputs, feat_func=feat_func)
# rescale the prediction back to origin image
bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred(
bbox, bbox_num, im_shape, scale_factor)
origin_shape = self.bbox_post_process.get_origin_shape()
mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num,
origin_shape)
if self.use_extra_data:
extra_data = {} # record the bbox output before nms, such as scores and nms_keep_idx
"""extra_data:{
'scores': predict scores,
'nms_keep_idx': bbox index before nms,
}
"""
extra_data['scores'] = preds[1] # predict scores (probability)
# Todo: get logits output
extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms
return bbox_pred, bbox_num, mask_pred, extra_data
else:
return bbox_pred, bbox_num, mask_pred
def get_loss(self, ):
rpn_loss, bbox_loss, mask_loss = self._forward()
loss = {}
loss.update(rpn_loss)
loss.update(bbox_loss)
loss.update(mask_loss)
total_loss = paddle.add_n(list(loss.values()))
loss.update({'loss': total_loss})
return loss
def get_pred(self):
if self.use_extra_data:
bbox_pred, bbox_num, mask_pred, extra_data = self._forward()
output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred, 'extra_data': extra_data}
else:
bbox_pred, bbox_num, mask_pred = self._forward()
output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred}
return output
================================================
FILE: ppdet/modeling/architectures/meta_arch.py
================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
import paddle.nn as nn
import typing
from ppdet.core.workspace import register
from ppdet.modeling.post_process import nms
__all__ = ['BaseArch']
@register
class BaseArch(nn.Layer):
def __init__(self, data_format='NCHW', use_extra_data=False):
super(BaseArch, self).__init__()
self.data_format = data_format
self.inputs = {}
self.fuse_norm = False
self.use_extra_data = use_extra_data
def load_meanstd(self, cfg_transform):
scale = 1.
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
for item in cfg_transform:
if 'NormalizeImage' in item:
mean = np.array(
item['NormalizeImage']['mean'], dtype=np.float32)
std = np.array(item['NormalizeImage']['std'], dtype=np.float32)
if item['NormalizeImage'].get('is_scale', True):
scale = 1. / 255.
break
if self.data_format == 'NHWC':
self.scale = paddle.to_tensor(scale / std).reshape((1, 1, 1, 3))
self.bias = paddle.to_tensor(-mean / std).reshape((1, 1, 1, 3))
else:
self.scale = paddle.to_tensor(scale / std).reshape((1, 3, 1, 1))
self.bias = paddle.to_tensor(-mean / std).reshape((1, 3, 1, 1))
def forward(self, inputs):
if self.data_format == 'NHWC':
image = inputs['image']
inputs['image'] = paddle.transpose(image, [0, 2, 3, 1])
if self.fuse_norm:
image = inputs['image']
self.inputs['image'] = image * self.scale + self.bias
self.inputs['im_shape'] = inputs['im_shape']
self.inputs['scale_factor'] = inputs['scale_factor']
else:
self.inputs = inputs
self.model_arch()
if self.training:
out = self.get_loss()
else:
inputs_list = []
# multi-scale input
if not isinstance(inputs, typing.Sequence):
inputs_list.append(inputs)
else:
inputs_list.extend(inputs)
outs = []
for inp in inputs_list:
if self.fuse_norm:
self.inputs['image'] = inp['image'] * self.scale + self.bias
self.inputs['im_shape'] = inp['im_shape']
self.inputs['scale_factor'] = inp['scale_factor']
else:
self.inputs = inp
outs.append(self.get_pred())
# multi-scale test
if len(outs) > 1:
out = self.merge_multi_scale_predictions(outs)
else:
out = outs[0]
return out
def merge_multi_scale_predictions(self, outs):
# default values for architectures not included in following list
num_classes = 80
nms_threshold = 0.5
keep_top_k = 100
if self.__class__.__name__ in ('CascadeRCNN', 'FasterRCNN', 'MaskRCNN'):
num_classes = self.bbox_head.num_classes
keep_top_k = self.bbox_post_process.nms.keep_top_k
nms_threshold = self.bbox_post_process.nms.nms_threshold
else:
raise Exception(
"Multi scale test only supports CascadeRCNN, FasterRCNN and MaskRCNN for now"
)
final_boxes = []
all_scale_outs = paddle.concat([o['bbox'] for o in outs]).numpy()
for c in range(num_classes):
idxs = all_scale_outs[:, 0] == c
if np.count_nonzero(idxs) == 0:
continue
r = nms(all_scale_outs[idxs, 1:], nms_threshold)
final_boxes.append(
np.concatenate([np.full((r.shape[0], 1), c), r], 1))
out = np.concatenate(final_boxes)
out = np.concatenate(sorted(
out, key=lambda e: e[1])[-keep_top_k:]).reshape((-1, 6))
out = {
'bbox': paddle.to_tensor(out),
'bbox_num': paddle.to_tensor(np.array([out.shape[0], ]))
}
return out
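# The merge above assumes the ppdet detection row layout
# [class_id, score, x1, y1, x2, y2]: column 0 selects the class, columns 1:
# feed class-wise NMS, and sorting by e[1] (the score) keeps the keep_top_k
# highest-scoring boxes across all tested scales.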
def build_inputs(self, data, input_def):
inputs = {}
for i, k in enumerate(input_def):
inputs[k] = data[i]
return inputs
def model_arch(self, ):
pass
def get_loss(self, ):
raise NotImplementedError("Should implement get_loss method!")
def get_pred(self, ):
raise NotImplementedError("Should implement get_pred method!")
================================================
FILE: ppdet/modeling/architectures/multi_stream_detector.py
================================================
from typing import Dict
from collections import OrderedDict
from ppdet.modeling.architectures.meta_arch import BaseArch
class MultiSteamDetector(BaseArch):
def __init__(self,
model: Dict[str, BaseArch],
train_cfg=None,
test_cfg=None):
super(MultiSteamDetector, self).__init__()
self.submodules = list(model.keys())
for k, v in model.items():
setattr(self, k, v)
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.inference_on = self.test_cfg.get("inference_on",
self.submodules[0])
self.first_load = True
def forward(self, inputs, return_loss=True, **kwargs):
"""Calls either :func:`forward_train` or :func:`forward_test` depending
on whether ``return_loss`` is ``True``.
Note this setting will change the expected inputs. When
``return_loss=True``, img and img_meta are single-nested (i.e. Tensor
and List[dict]), and when ``return_loss=False``, img and img_meta
should be double nested (i.e. List[Tensor], List[List[dict]]), with
the outer list indicating test time augmentations.
"""
if return_loss:
return self.forward_train(inputs, **kwargs)
else:
return self.forward_test(inputs, **kwargs)
def get_loss(self, **kwargs):
# losses = self(**data)
return self.forward_train(**kwargs)
def model(self, **kwargs) -> BaseArch:
if "submodule" in kwargs:
assert (kwargs["submodule"] in self.submodules
), "Detector does not contain submodule {}".format(kwargs[
"submodule"])
model: BaseArch = getattr(self, kwargs["submodule"])
else:
model: BaseArch = getattr(self, self.inference_on)
return model
def freeze(self, model_ref: str):
assert model_ref in self.submodules
model = getattr(self, model_ref)
model.eval()
for param in model.parameters():
param.stop_gradient = True
def update_ema_model(self, momentum=0.9996):
# print(momentum)
model_dict = self.student.state_dict()
new_dict = OrderedDict()
for key, value in self.teacher.state_dict().items():
if key in model_dict.keys():
new_dict[key] = (model_dict[key] *
(1 - momentum) + value * momentum)
else:
raise Exception("{} is not found in student model".format(key))
self.teacher.set_dict(new_dict)
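# The loop above is a standard exponential moving average:
#     teacher_new = momentum * teacher + (1 - momentum) * student
# e.g. with momentum=0.9996 each step moves the teacher only 0.04% toward
# the student, so the teacher changes slowly and smooths training noise.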
================================================
FILE: ppdet/modeling/architectures/picodet.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['PicoDet']
@register
class PicoDet(BaseArch):
"""
PicoDet (PP-PicoDet) network, see https://arxiv.org/abs/2111.00902
Args:
backbone (object): backbone instance
neck (object): 'FPN' instance
head (object): 'PicoHead' instance
"""
__category__ = 'architecture'
def __init__(self, backbone, neck, head='PicoHead', nms_cpu=False):
super(PicoDet, self).__init__()
self.backbone = backbone
self.neck = neck
self.head = head
self.export_post_process = True
self.export_nms = True
self.nms_cpu = nms_cpu
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
kwargs = {'input_shape': neck.out_shape}
head = create(cfg['head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"head": head,
}
def _forward(self):
body_feats = self.backbone(self.inputs)
fpn_feats = self.neck(body_feats)
head_outs = self.head(fpn_feats, self.export_post_process)
if self.training or not self.export_post_process:
return head_outs, None
else:
scale_factor = self.inputs['scale_factor']
bboxes, bbox_num = self.head.post_process(
head_outs,
scale_factor,
export_nms=self.export_nms,
nms_cpu=self.nms_cpu)
return bboxes, bbox_num
def get_loss(self, ):
loss = {}
head_outs, _ = self._forward()
loss_gfl = self.head.get_loss(head_outs, self.inputs)
loss.update(loss_gfl)
total_loss = paddle.add_n(list(loss.values()))
loss.update({'loss': total_loss})
return loss
def get_pred(self):
if not self.export_post_process:
return {'picodet': self._forward()[0]}
elif self.export_nms:
bbox_pred, bbox_num = self._forward()
output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
return output
else:
bboxes, mlvl_scores = self._forward()
output = {'bbox': bboxes, 'scores': mlvl_scores}
return output
================================================
FILE: ppdet/modeling/architectures/pose3d_metro.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
from .. import layers as L
__all__ = ['METRO_Body']
def orthographic_projection(X, camera):
"""Perform orthographic projection of 3D points X using the camera parameters
Args:
X: size = [B, N, 3]
camera: size = [B, 3]
Returns:
Projected 2D points -- size = [B, N, 2]
"""
camera = camera.reshape((-1, 1, 3))
X_trans = X[:, :, :2] + camera[:, :, 1:]
shape = X_trans.shape
X_2d = (camera[:, :, 0] * X_trans.reshape((shape[0], -1))).reshape(shape)
return X_2d
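# A minimal numeric sketch of the projection above: with camera = [s, tx, ty]
# = [2.0, 0.1, -0.1] and a 3D point (0.3, 0.5, z), the depth z is dropped and
# the result is s * (x + tx, y + ty) = (0.8, 0.8).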
@register
class METRO_Body(BaseArch):
__category__ = 'architecture'
__inject__ = ['loss']
def __init__(
self,
num_joints,
backbone='HRNet',
trans_encoder='',
loss='Pose3DLoss', ):
"""
Modified from METRO network, see https://arxiv.org/abs/2012.09760
Args:
backbone (nn.Layer): backbone instance
"""
super(METRO_Body, self).__init__()
self.num_joints = num_joints
self.backbone = backbone
self.loss = loss
self.deploy = False
self.trans_encoder = trans_encoder
self.conv_learn_tokens = paddle.nn.Conv1D(49, num_joints + 10, 1)
self.cam_param_fc = paddle.nn.Linear(3, 2)
@classmethod
def from_config(cls, cfg, *args, **kwargs):
# backbone
backbone = create(cfg['backbone'])
trans_encoder = create(cfg['trans_encoder'])
return {'backbone': backbone, 'trans_encoder': trans_encoder}
def _forward(self):
batch_size = self.inputs['image'].shape[0]
image_feat = self.backbone(self.inputs)
image_feat_flatten = image_feat.reshape((batch_size, 2048, 49))
image_feat_flatten = image_feat_flatten.transpose(perm=(0, 2, 1))
# apply a conv layer to learn an image token for each 3D joint/vertex position
features = self.conv_learn_tokens(image_feat_flatten) # (B, J, C)
if self.training:
# apply mask vertex/joint modeling
# meta_masks is a tensor of all the masks, randomly generated in dataloader
# we pre-define a [MASK] token, which is a floating-value vector with 0.01s
meta_masks = self.inputs['mjm_mask'].expand((-1, -1, 2048))
constant_tensor = paddle.ones_like(features) * 0.01
features = features * meta_masks + constant_tensor * (1 - meta_masks)
pred_out = self.trans_encoder(features)
pred_3d_joints = pred_out[:, :self.num_joints, :]
cam_features = pred_out[:, self.num_joints:, :]
# learn camera parameters
pred_2d_joints = self.cam_param_fc(cam_features)
return pred_3d_joints, pred_2d_joints
def get_loss(self):
preds_3d, preds_2d = self._forward()
loss = self.loss(preds_3d, preds_2d, self.inputs)
output = {'loss': loss}
return output
def get_pred(self):
preds_3d, preds_2d = self._forward()
outputs = {'pose3d': preds_3d, 'pose2d': preds_2d}
return outputs
================================================
FILE: ppdet/modeling/architectures/ppyoloe.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['PPYOLOE', 'PPYOLOEWithAuxHead']
# PP-YOLOE and PP-YOLOE+ are recommended to use this architecture, especially when using distillation or an aux head
# They can also use the YOLOv3 architecture in yolo.py when distillation and the aux head are not used
@register
class PPYOLOE(BaseArch):
"""
PPYOLOE network, see https://arxiv.org/abs/2203.16250
Args:
backbone (nn.Layer): backbone instance
neck (nn.Layer): neck instance
yolo_head (nn.Layer): anchor_head instance
post_process (object): `BBoxPostProcess` instance
ssod_loss (object): 'SSODPPYOLOELoss' instance, only used for semi-supervised detection (SSOD)
for_distill (bool): whether for distillation
feat_distill_place (str): distill which feature for distillation
for_mot (bool): whether return other features for multi-object tracking
models, default False in pure object detection models.
"""
__category__ = 'architecture'
__shared__ = ['for_distill']
__inject__ = ['post_process', 'ssod_loss']
def __init__(self,
backbone='CSPResNet',
neck='CustomCSPPAN',
yolo_head='PPYOLOEHead',
post_process='BBoxPostProcess',
ssod_loss='SSODPPYOLOELoss',
for_distill=False,
feat_distill_place='neck_feats',
with_mask=False,
for_mot=False):
super(PPYOLOE, self).__init__()
self.backbone = backbone
self.neck = neck
self.yolo_head = yolo_head
self.post_process = post_process
self.for_mot = for_mot
self.with_mask = with_mask
# for semi-supervised detection (ssod)
self.is_teacher = False
self.ssod_loss = ssod_loss
# distill
self.for_distill = for_distill
self.feat_distill_place = feat_distill_place
if for_distill:
assert feat_distill_place in ['backbone_feats', 'neck_feats']
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
kwargs = {'input_shape': neck.out_shape}
yolo_head = create(cfg['yolo_head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"yolo_head": yolo_head,
}
def _forward(self):
body_feats = self.backbone(self.inputs)
neck_feats = self.neck(body_feats, self.for_mot)
self.is_teacher = self.inputs.get('is_teacher', False) # for semi-det
if self.training or self.is_teacher:
yolo_losses = self.yolo_head(neck_feats, self.inputs)
if self.for_distill:
if self.feat_distill_place == 'backbone_feats':
self.yolo_head.distill_pairs['backbone_feats'] = body_feats
elif self.feat_distill_place == 'neck_feats':
self.yolo_head.distill_pairs['neck_feats'] = neck_feats
else:
raise ValueError
return yolo_losses
else:
yolo_head_outs = self.yolo_head(neck_feats)
if self.post_process is not None:
bbox, bbox_num, nms_keep_idx = self.post_process(
yolo_head_outs, self.yolo_head.mask_anchors,
self.inputs['im_shape'], self.inputs['scale_factor'])
else:
if not self.with_mask:
bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process(
yolo_head_outs, self.inputs['scale_factor'])
else:
bbox, bbox_num, mask, nms_keep_idx = self.yolo_head.post_process(
yolo_head_outs,
im_shape=self.inputs['im_shape'],
scale_factor=self.inputs['scale_factor'],
infer_shape=self.inputs['image'].shape[2:])
output = {'bbox': bbox, 'bbox_num': bbox_num}
if self.with_mask:
output['mask'] = mask
return output
def get_loss(self):
return self._forward()
def get_pred(self):
return self._forward()
def get_loss_keys(self):
return ['loss_cls', 'loss_iou', 'loss_dfl', 'loss_contrast']
def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg):
ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs,
train_cfg)
return ssod_losses
@register
class PPYOLOEWithAuxHead(BaseArch):
__category__ = 'architecture'
__inject__ = ['post_process']
def __init__(self,
backbone='CSPResNet',
neck='CustomCSPPAN',
yolo_head='PPYOLOEHead',
aux_head='SimpleConvHead',
post_process='BBoxPostProcess',
for_mot=False,
detach_epoch=5):
"""
PPYOLOE network, see https://arxiv.org/abs/2203.16250
Args:
backbone (nn.Layer): backbone instance
neck (nn.Layer): neck instance
yolo_head (nn.Layer): anchor_head instance
post_process (object): `BBoxPostProcess` instance
for_mot (bool): whether return other features for multi-object tracking
models, default False in pure object detection models.
"""
super(PPYOLOEWithAuxHead, self).__init__()
self.backbone = backbone
self.neck = neck
self.aux_neck = copy.deepcopy(self.neck)
self.yolo_head = yolo_head
self.aux_head = aux_head
self.post_process = post_process
self.for_mot = for_mot
self.detach_epoch = detach_epoch
@classmethod
def from_config(cls, cfg, *args, **kwargs):
# backbone
backbone = create(cfg['backbone'])
# fpn
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
aux_neck = copy.deepcopy(neck)
# head
kwargs = {'input_shape': neck.out_shape}
yolo_head = create(cfg['yolo_head'], **kwargs)
aux_head = create(cfg['aux_head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"yolo_head": yolo_head,
'aux_head': aux_head,
}
def _forward(self):
body_feats = self.backbone(self.inputs)
neck_feats = self.neck(body_feats, self.for_mot)
if self.training:
if self.inputs['epoch_id'] >= self.detach_epoch:
aux_neck_feats = self.aux_neck([f.detach() for f in body_feats])
dual_neck_feats = (paddle.concat(
[f.detach(), aux_f], axis=1) for f, aux_f in
zip(neck_feats, aux_neck_feats))
else:
aux_neck_feats = self.aux_neck(body_feats)
dual_neck_feats = (paddle.concat(
[f, aux_f], axis=1) for f, aux_f in
zip(neck_feats, aux_neck_feats))
aux_cls_scores, aux_bbox_preds = self.aux_head(dual_neck_feats)
loss = self.yolo_head(
neck_feats,
self.inputs,
aux_pred=[aux_cls_scores, aux_bbox_preds])
return loss
else:
yolo_head_outs = self.yolo_head(neck_feats)
if self.post_process is not None:
bbox, bbox_num = self.post_process(
yolo_head_outs, self.yolo_head.mask_anchors,
self.inputs['im_shape'], self.inputs['scale_factor'])
else:
bbox, bbox_num = self.yolo_head.post_process(
yolo_head_outs, self.inputs['scale_factor'])
output = {'bbox': bbox, 'bbox_num': bbox_num}
return output
def get_loss(self):
return self._forward()
def get_pred(self):
return self._forward()
================================================
FILE: ppdet/modeling/architectures/queryinst.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['QueryInst']
@register
class QueryInst(BaseArch):
__category__ = 'architecture'
__inject__ = ['post_process']
def __init__(self,
backbone,
neck,
rpn_head,
roi_head,
post_process='SparsePostProcess'):
super(QueryInst, self).__init__()
self.backbone = backbone
self.neck = neck
self.rpn_head = rpn_head
self.roi_head = roi_head
self.post_process = post_process
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
kwargs = {'input_shape': neck.out_shape}
rpn_head = create(cfg['rpn_head'], **kwargs)
roi_head = create(cfg['roi_head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
'rpn_head': rpn_head,
"roi_head": roi_head
}
def _forward(self, targets=None):
features = self.backbone(self.inputs)
features = self.neck(features)
proposal_bboxes, proposal_features = self.rpn_head(self.inputs[
'img_whwh'])
outputs = self.roi_head(features, proposal_bboxes, proposal_features,
targets)
if self.training:
return outputs
else:
bbox_pred, bbox_num, mask_pred = self.post_process(
outputs['class_logits'], outputs['bbox_pred'],
self.inputs['scale_factor_whwh'], self.inputs['ori_shape'],
outputs['mask_logits'])
return bbox_pred, bbox_num, mask_pred
def get_loss(self):
targets = []
for i in range(len(self.inputs['img_whwh'])):
boxes = self.inputs['gt_bbox'][i]
labels = self.inputs['gt_class'][i].squeeze(-1)
img_whwh = self.inputs['img_whwh'][i]
if boxes.shape[0] != 0:
img_whwh_tgt = img_whwh.unsqueeze(0).tile([boxes.shape[0], 1])
else:
img_whwh_tgt = paddle.zeros_like(boxes)
gt_segm = self.inputs['gt_segm'][i].astype('float32')
targets.append({
'boxes': boxes,
'labels': labels,
'img_whwh': img_whwh,
'img_whwh_tgt': img_whwh_tgt,
'gt_segm': gt_segm
})
losses = self._forward(targets)
losses.update({'loss': sum(losses.values())})
return losses
def get_pred(self):
bbox_pred, bbox_num, mask_pred = self._forward()
return {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred}
================================================
FILE: ppdet/modeling/architectures/retinanet.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
import paddle
import paddle.nn.functional as F
__all__ = ['RetinaNet']
@register
class RetinaNet(BaseArch):
__category__ = 'architecture'
def __init__(self, backbone, neck, head):
super(RetinaNet, self).__init__()
self.backbone = backbone
self.neck = neck
self.head = head
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
kwargs = {'input_shape': neck.out_shape}
head = create(cfg['head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
'head': head,
}
def _forward(self):
body_feats = self.backbone(self.inputs)
neck_feats = self.neck(body_feats)
if self.training:
return self.head(neck_feats, self.inputs)
else:
head_outs = self.head(neck_feats)
bbox, bbox_num, nms_keep_idx = self.head.post_process(
head_outs, self.inputs['im_shape'], self.inputs['scale_factor'])
if self.use_extra_data:
extra_data = {} # record the bbox output before nms, such as scores and nms_keep_idx
"""extra_data:{
'scores': predict scores,
'nms_keep_idx': bbox index before nms,
}
"""
preds_logits = self.head.decode_cls_logits(head_outs[0])
preds_scores = F.sigmoid(preds_logits)
extra_data['logits'] = preds_logits
extra_data['scores'] = preds_scores
extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms
return {'bbox': bbox, 'bbox_num': bbox_num, "extra_data": extra_data}
else:
return {'bbox': bbox, 'bbox_num': bbox_num}
def get_loss(self):
return self._forward()
def get_pred(self):
return self._forward()
================================================
FILE: ppdet/modeling/architectures/rtdetrv3.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from .meta_arch import BaseArch
from ppdet.core.workspace import register, create
__all__ = ['RTDETRV3']
# RT-DETRv3 uses the same overall architecture as DETR (as do Deformable DETR and DINO)
@register
class RTDETRV3(BaseArch):
__category__ = 'architecture'
__inject__ = ['post_process', 'post_process_semi']
__shared__ = ['with_mask', 'exclude_post_process']
def __init__(self,
backbone,
transformer='DETRTransformer',
detr_head='DETRHead',
neck=None,
aux_o2m_head=None,
post_process='DETRPostProcess',
post_process_semi=None,
with_mask=False,
exclude_post_process=False):
super(RTDETRV3, self).__init__()
self.backbone = backbone
self.transformer = transformer
self.detr_head = detr_head
self.neck = neck
self.aux_o2m_head = aux_o2m_head
self.post_process = post_process
self.with_mask = with_mask
self.exclude_post_process = exclude_post_process
self.post_process_semi = post_process_semi
@classmethod
def from_config(cls, cfg, *args, **kwargs):
# backbone
backbone = create(cfg['backbone'])
# neck
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs) if cfg['neck'] else None
# transformer
if neck is not None:
kwargs = {'input_shape': neck.out_shape}
transformer = create(cfg['transformer'], **kwargs)
# head
kwargs = {
'hidden_dim': transformer.hidden_dim,
'nhead': transformer.nhead,
'input_shape': backbone.out_shape
}
detr_head = create(cfg['detr_head'], **kwargs)
kwargs = {'input_shape': neck.out_shape}
aux_o2m_head = create(cfg['aux_o2m_head'], **kwargs)
return {
'backbone': backbone,
'transformer': transformer,
"detr_head": detr_head,
"neck": neck,
"aux_o2m_head": aux_o2m_head
}
def _forward(self):
# Backbone
body_feats = self.backbone(self.inputs)
# Neck
if self.neck is not None:
body_feats = self.neck(body_feats)
# Transformer
pad_mask = self.inputs.get('pad_mask', None)
out_transformer = self.transformer(body_feats, pad_mask, self.inputs)
# DETR Head
if self.training:
detr_losses = self.detr_head(out_transformer, body_feats,
self.inputs)
detr_losses.update({
'loss': paddle.add_n(
[v for k, v in detr_losses.items() if 'log' not in k])
})
if self.aux_o2m_head is not None:
aux_o2m_losses = self.aux_o2m_head(body_feats, self.inputs)
for k, v in aux_o2m_losses.items():
if k == 'loss':
detr_losses[k] += v
k = k + '_aux_o2m'
detr_losses[k] = v
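# Note: the loop above folds the auxiliary one-to-many head's total 'loss'
# into the overall DETR loss and re-keys every auxiliary term with an
# '_aux_o2m' suffix so the two heads' losses remain distinguishable in logs.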
return detr_losses
else:
preds = self.detr_head(out_transformer, body_feats)
if self.exclude_post_process:
bbox, bbox_num, mask = preds
else:
bbox, bbox_num, mask = self.post_process(
preds, self.inputs['im_shape'], self.inputs['scale_factor'],
self.inputs['image'].shape[2:])
# aux_o2m_outs = self.aux_o2m_head(body_feats)
# bbox, bbox_num, nms_keep_idx = self.aux_o2m_head.post_process(
# aux_o2m_outs, self.inputs['scale_factor'])
output = {'bbox': bbox, 'bbox_num': bbox_num}
if self.with_mask:
output['mask'] = mask
return output
def get_loss(self):
return self._forward()
def get_pred(self):
return self._forward()
================================================
FILE: ppdet/modeling/architectures/s2anet.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['S2ANet']
@register
class S2ANet(BaseArch):
__category__ = 'architecture'
__inject__ = ['head']
def __init__(self, backbone, neck, head):
"""
S2ANet, see https://arxiv.org/pdf/2008.09397.pdf
Args:
backbone (object): backbone instance
neck (object): `FPN` instance
head (object): `Head` instance
"""
super(S2ANet, self).__init__()
self.backbone = backbone
self.neck = neck
self.s2anet_head = head
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = cfg['neck'] and create(cfg['neck'], **kwargs)
out_shape = neck and neck.out_shape or backbone.out_shape
kwargs = {'input_shape': out_shape}
head = create(cfg['head'], **kwargs)
return {'backbone': backbone, 'neck': neck, "head": head}
def _forward(self):
body_feats = self.backbone(self.inputs)
if self.neck is not None:
body_feats = self.neck(body_feats)
if self.training:
loss = self.s2anet_head(body_feats, self.inputs)
return loss
else:
head_outs = self.s2anet_head(body_feats)
# post_process
bboxes, bbox_num = self.s2anet_head.get_bboxes(head_outs)
# rescale the prediction back to origin image
im_shape = self.inputs['im_shape']
scale_factor = self.inputs['scale_factor']
bboxes = self.s2anet_head.get_pred(bboxes, bbox_num, im_shape,
scale_factor)
# output
output = {'bbox': bboxes, 'bbox_num': bbox_num}
return output
def get_loss(self, ):
loss = self._forward()
return loss
def get_pred(self):
output = self._forward()
return output
================================================
FILE: ppdet/modeling/architectures/solov2.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['SOLOv2']
@register
class SOLOv2(BaseArch):
"""
SOLOv2 network, see https://arxiv.org/abs/2003.10152
Args:
backbone (object): an backbone instance
solov2_head (object): an `SOLOv2Head` instance
mask_head (object): an `SOLOv2MaskHead` instance
neck (object): neck of network, such as feature pyramid network instance
"""
__category__ = 'architecture'
def __init__(self, backbone, solov2_head, mask_head, neck=None):
super(SOLOv2, self).__init__()
self.backbone = backbone
self.neck = neck
self.solov2_head = solov2_head
self.mask_head = mask_head
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
kwargs = {'input_shape': neck.out_shape}
solov2_head = create(cfg['solov2_head'], **kwargs)
mask_head = create(cfg['mask_head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
'solov2_head': solov2_head,
'mask_head': mask_head,
}
def model_arch(self):
body_feats = self.backbone(self.inputs)
body_feats = self.neck(body_feats)
self.seg_pred = self.mask_head(body_feats)
self.cate_pred_list, self.kernel_pred_list = self.solov2_head(
body_feats)
def get_loss(self, ):
loss = {}
# get gt_ins_labels, gt_cate_labels, etc.
gt_ins_labels, gt_cate_labels, gt_grid_orders = [], [], []
fg_num = self.inputs['fg_num']
for i in range(len(self.solov2_head.seg_num_grids)):
ins_label = 'ins_label{}'.format(i)
if ins_label in self.inputs:
gt_ins_labels.append(self.inputs[ins_label])
cate_label = 'cate_label{}'.format(i)
if cate_label in self.inputs:
gt_cate_labels.append(self.inputs[cate_label])
grid_order = 'grid_order{}'.format(i)
if grid_order in self.inputs:
gt_grid_orders.append(self.inputs[grid_order])
loss_solov2 = self.solov2_head.get_loss(
self.cate_pred_list, self.kernel_pred_list, self.seg_pred,
gt_ins_labels, gt_cate_labels, gt_grid_orders, fg_num)
loss.update(loss_solov2)
total_loss = paddle.add_n(list(loss.values()))
loss.update({'loss': total_loss})
return loss
def get_pred(self):
seg_masks, cate_labels, cate_scores, bbox_num = self.solov2_head.get_prediction(
self.cate_pred_list, self.kernel_pred_list, self.seg_pred,
self.inputs['im_shape'], self.inputs['scale_factor'])
outs = {
"segm": seg_masks,
"bbox_num": bbox_num,
'cate_label': cate_labels,
'cate_score': cate_scores
}
return outs
================================================
FILE: ppdet/modeling/architectures/sparse_rcnn.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ["SparseRCNN"]
@register
class SparseRCNN(BaseArch):
__category__ = 'architecture'
__inject__ = ["postprocess"]
def __init__(self,
backbone,
neck,
head="SparsercnnHead",
postprocess="SparsePostProcess"):
super(SparseRCNN, self).__init__()
self.backbone = backbone
self.neck = neck
self.head = head
self.postprocess = postprocess
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
kwargs = {'roi_input_shape': neck.out_shape}
head = create(cfg['head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"head": head,
}
def _forward(self):
body_feats = self.backbone(self.inputs)
fpn_feats = self.neck(body_feats)
head_outs = self.head(fpn_feats, self.inputs["img_whwh"])
if not self.training:
bbox_pred, bbox_num = self.postprocess(
head_outs["pred_logits"], head_outs["pred_boxes"],
self.inputs["scale_factor_whwh"], self.inputs["ori_shape"])
return bbox_pred, bbox_num
else:
return head_outs
def get_loss(self):
batch_gt_class = self.inputs["gt_class"]
batch_gt_box = self.inputs["gt_bbox"]
batch_whwh = self.inputs["img_whwh"]
targets = []
for i in range(len(batch_gt_class)):
boxes = batch_gt_box[i]
labels = batch_gt_class[i].squeeze(-1)
img_whwh = batch_whwh[i]
img_whwh_tgt = img_whwh.unsqueeze(0).tile([int(boxes.shape[0]), 1])
targets.append({
"boxes": boxes,
"labels": labels,
"img_whwh": img_whwh,
"img_whwh_tgt": img_whwh_tgt
})
outputs = self._forward()
loss_dict = self.head.get_loss(outputs, targets)
acc = loss_dict["acc"]
loss_dict.pop("acc")
total_loss = sum(loss_dict.values())
loss_dict.update({"loss": total_loss, "acc": acc})
return loss_dict
def get_pred(self):
bbox_pred, bbox_num = self._forward()
output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
return output
================================================
FILE: ppdet/modeling/architectures/ssd.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
import paddle
import paddle.nn.functional as F
__all__ = ['SSD']
@register
class SSD(BaseArch):
"""
Single Shot MultiBox Detector, see https://arxiv.org/abs/1512.02325
Args:
backbone (nn.Layer): backbone instance
ssd_head (nn.Layer): `SSDHead` instance
post_process (object): `BBoxPostProcess` instance
"""
__category__ = 'architecture'
__inject__ = ['post_process']
def __init__(self, backbone, ssd_head, post_process, r34_backbone=False):
super(SSD, self).__init__()
self.backbone = backbone
self.ssd_head = ssd_head
self.post_process = post_process
self.r34_backbone = r34_backbone
if self.r34_backbone:
from ppdet.modeling.backbones.resnet import ResNet
assert isinstance(self.backbone, ResNet) and \
self.backbone.depth == 34, \
"If you set r34_backbone=True, please use ResNet-34 as backbone."
self.backbone.res_layers[2].blocks[0].branch2a.conv._stride = [1, 1]
self.backbone.res_layers[2].blocks[0].short.conv._stride = [1, 1]
@classmethod
def from_config(cls, cfg, *args, **kwargs):
# backbone
backbone = create(cfg['backbone'])
# head
kwargs = {'input_shape': backbone.out_shape}
ssd_head = create(cfg['ssd_head'], **kwargs)
return {
'backbone': backbone,
"ssd_head": ssd_head,
}
def _forward(self):
# Backbone
body_feats = self.backbone(self.inputs)
# SSD Head
if self.training:
return self.ssd_head(body_feats, self.inputs['image'],
self.inputs['gt_bbox'],
self.inputs['gt_class'])
else:
preds, anchors = self.ssd_head(body_feats, self.inputs['image'])
bbox, bbox_num, nms_keep_idx = self.post_process(
preds, anchors, self.inputs['im_shape'],
self.inputs['scale_factor'])
if self.use_extra_data:
extra_data = {} # record the bbox output before nms, such as scores and nms_keep_idx
"""extra_data:{
'scores': predict scores,
'nms_keep_idx': bbox index before nms,
}
"""
preds_logits = preds[1] # [[1xNumBBoxNumClass]]
extra_data['scores'] = F.softmax(paddle.concat(
preds_logits, axis=1)).transpose([0, 2, 1])
extra_data['logits'] = paddle.concat(
preds_logits, axis=1).transpose([0, 2, 1])
extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms
return bbox, bbox_num, extra_data
else:
return bbox, bbox_num
def get_loss(self, ):
return {"loss": self._forward()}
def get_pred(self):
if self.use_extra_data:
bbox_pred, bbox_num, extra_data = self._forward()
output = {
"bbox": bbox_pred,
"bbox_num": bbox_num,
"extra_data": extra_data
}
else:
bbox_pred, bbox_num = self._forward()
output = {
"bbox": bbox_pred,
"bbox_num": bbox_num,
}
return output
================================================
FILE: ppdet/modeling/architectures/tood.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['TOOD']
@register
class TOOD(BaseArch):
"""
TOOD: Task-aligned One-stage Object Detection, see https://arxiv.org/abs/2108.07755
Args:
backbone (nn.Layer): backbone instance
neck (nn.Layer): 'FPN' instance
head (nn.Layer): 'TOODHead' instance
"""
__category__ = 'architecture'
def __init__(self, backbone, neck, head):
super(TOOD, self).__init__()
self.backbone = backbone
self.neck = neck
self.head = head
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
kwargs = {'input_shape': neck.out_shape}
head = create(cfg['head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"head": head,
}
def _forward(self):
body_feats = self.backbone(self.inputs)
fpn_feats = self.neck(body_feats)
head_outs = self.head(fpn_feats)
if not self.training:
bboxes, bbox_num = self.head.post_process(
head_outs, self.inputs['im_shape'], self.inputs['scale_factor'])
return bboxes, bbox_num
else:
loss = self.head.get_loss(head_outs, self.inputs)
return loss
def get_loss(self):
return self._forward()
def get_pred(self):
bbox_pred, bbox_num = self._forward()
output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
return output
================================================
FILE: ppdet/modeling/architectures/ttfnet.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['TTFNet']
@register
class TTFNet(BaseArch):
"""
TTFNet network, see https://arxiv.org/abs/1909.00700
Args:
backbone (object): backbone instance
neck (object): 'TTFFPN' instance
ttf_head (object): 'TTFHead' instance
post_process (object): 'BBoxPostProcess' instance
"""
__category__ = 'architecture'
__inject__ = ['post_process']
def __init__(self,
backbone='DarkNet',
neck='TTFFPN',
ttf_head='TTFHead',
post_process='BBoxPostProcess'):
super(TTFNet, self).__init__()
self.backbone = backbone
self.neck = neck
self.ttf_head = ttf_head
self.post_process = post_process
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
kwargs = {'input_shape': neck.out_shape}
ttf_head = create(cfg['ttf_head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"ttf_head": ttf_head,
}
def _forward(self):
body_feats = self.backbone(self.inputs)
body_feats = self.neck(body_feats)
hm, wh = self.ttf_head(body_feats)
if self.training:
return hm, wh
else:
bbox, bbox_num = self.post_process(hm, wh, self.inputs['im_shape'],
self.inputs['scale_factor'])
return bbox, bbox_num
def get_loss(self, ):
loss = {}
heatmap = self.inputs['ttf_heatmap']
box_target = self.inputs['ttf_box_target']
reg_weight = self.inputs['ttf_reg_weight']
hm, wh = self._forward()
head_loss = self.ttf_head.get_loss(hm, wh, heatmap, box_target,
reg_weight)
loss.update(head_loss)
total_loss = paddle.add_n(list(loss.values()))
loss.update({'loss': total_loss})
return loss
def get_pred(self):
bbox_pred, bbox_num = self._forward()
output = {
"bbox": bbox_pred,
"bbox_num": bbox_num,
}
return output
================================================
FILE: ppdet/modeling/architectures/yolo.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
from ..post_process import JDEBBoxPostProcess
__all__ = ['YOLOv3']
# YOLOv3, PP-YOLO, PP-YOLOv2, PP-YOLOE and PP-YOLOE+ share the same architecture as YOLOv3
# PP-YOLOE and PP-YOLOE+ are recommended to use the PPYOLOE architecture in ppyoloe.py, especially when using distillation or an aux head
@register
class YOLOv3(BaseArch):
__category__ = 'architecture'
__shared__ = ['data_format']
__inject__ = ['post_process']
def __init__(self,
backbone='DarkNet',
neck='YOLOv3FPN',
yolo_head='YOLOv3Head',
post_process='BBoxPostProcess',
data_format='NCHW',
for_mot=False):
"""
YOLOv3 network, see https://arxiv.org/abs/1804.02767
Args:
backbone (nn.Layer): backbone instance
neck (nn.Layer): neck instance
yolo_head (nn.Layer): anchor_head instance
post_process (object): `BBoxPostProcess` instance
data_format (str): data format, NCHW or NHWC
for_mot (bool): whether return other features for multi-object tracking
models, default False in pure object detection models.
"""
super(YOLOv3, self).__init__(data_format=data_format)
self.backbone = backbone
self.neck = neck
self.yolo_head = yolo_head
self.post_process = post_process
self.for_mot = for_mot
self.return_idx = isinstance(post_process, JDEBBoxPostProcess)
@classmethod
def from_config(cls, cfg, *args, **kwargs):
# backbone
backbone = create(cfg['backbone'])
# fpn
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
# head
kwargs = {'input_shape': neck.out_shape}
yolo_head = create(cfg['yolo_head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"yolo_head": yolo_head,
}
def _forward(self):
body_feats = self.backbone(self.inputs)
if self.for_mot:
neck_feats = self.neck(body_feats, self.for_mot)
else:
neck_feats = self.neck(body_feats)
if isinstance(neck_feats, dict):
assert self.for_mot
emb_feats = neck_feats['emb_feats']
neck_feats = neck_feats['yolo_feats']
if self.training:
yolo_losses = self.yolo_head(neck_feats, self.inputs)
if self.for_mot:
return {'det_losses': yolo_losses, 'emb_feats': emb_feats}
else:
return yolo_losses
else:
yolo_head_outs = self.yolo_head(neck_feats)
if self.for_mot:
# the detection part of JDE MOT model
boxes_idx, bbox, bbox_num, nms_keep_idx = self.post_process(
yolo_head_outs, self.yolo_head.mask_anchors)
output = {
'bbox': bbox,
'bbox_num': bbox_num,
'boxes_idx': boxes_idx,
'nms_keep_idx': nms_keep_idx,
'emb_feats': emb_feats,
}
else:
if self.return_idx:
# the detection part of JDE MOT model
_, bbox, bbox_num, nms_keep_idx = self.post_process(
yolo_head_outs, self.yolo_head.mask_anchors)
elif self.post_process is not None:
# anchor based YOLOs: YOLOv3,PP-YOLO,PP-YOLOv2 use mask_anchors
bbox, bbox_num, nms_keep_idx = self.post_process(
yolo_head_outs, self.yolo_head.mask_anchors,
self.inputs['im_shape'], self.inputs['scale_factor'])
else:
# anchor free YOLOs: PP-YOLOE, PP-YOLOE+
bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process(
yolo_head_outs, self.inputs['scale_factor'])
if self.use_extra_data:
extra_data = {} # record the bbox output before nms, such as scores and nms_keep_idx
"""extra_data:{
'scores': predict scores,
'nms_keep_idx': bbox index before nms,
}
"""
extra_data['scores'] = yolo_head_outs[0] # predict scores (probability)
# Todo: get logits output
extra_data['nms_keep_idx'] = nms_keep_idx
# Todo support for mask_anchors yolo
output = {'bbox': bbox, 'bbox_num': bbox_num, 'extra_data': extra_data}
else:
output = {'bbox': bbox, 'bbox_num': bbox_num}
return output
def get_loss(self):
return self._forward()
def get_pred(self):
return self._forward()
================================================
FILE: ppdet/modeling/architectures/yolof.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['YOLOF']
@register
class YOLOF(BaseArch):
__category__ = 'architecture'
def __init__(self,
backbone='ResNet',
neck='DilatedEncoder',
head='YOLOFHead',
for_mot=False):
"""
YOLOF network, see https://arxiv.org/abs/2103.09460
Args:
backbone (nn.Layer): backbone instance
neck (nn.Layer): DilatedEncoder instance
head (nn.Layer): YOLOFHead instance
for_mot (bool): whether return other features for multi-object tracking
models, default False in pure object detection models.
"""
super(YOLOF, self).__init__()
self.backbone = backbone
self.neck = neck
self.head = head
self.for_mot = for_mot
@classmethod
def from_config(cls, cfg, *args, **kwargs):
# backbone
backbone = create(cfg['backbone'])
# fpn
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
# head
kwargs = {'input_shape': neck.out_shape}
head = create(cfg['head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"head": head,
}
def _forward(self):
body_feats = self.backbone(self.inputs)
neck_feats = self.neck(body_feats, self.for_mot)
if self.training:
yolo_losses = self.head(neck_feats, self.inputs)
return yolo_losses
else:
yolo_head_outs = self.head(neck_feats)
bbox, bbox_num = self.head.post_process(yolo_head_outs,
self.inputs['im_shape'],
self.inputs['scale_factor'])
output = {'bbox': bbox, 'bbox_num': bbox_num}
return output
def get_loss(self):
return self._forward()
def get_pred(self):
return self._forward()
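The from_config pattern above threads each module's out_shape into the next module's input_shape before instantiation. A standalone sketch of that wiring idea, using hypothetical Toy* classes in place of the registered backbone/neck/head components:

class ToyBackbone:
    out_shape = [512]                     # channel dims of the exported feature maps

class ToyNeck:
    def __init__(self, input_shape):      # receives backbone.out_shape
        self.out_shape = [input_shape[0] // 4]

class ToyHead:
    def __init__(self, input_shape):      # receives neck.out_shape
        self.in_channels = input_shape[0]

backbone = ToyBackbone()
neck = ToyNeck(input_shape=backbone.out_shape)
head = ToyHead(input_shape=neck.out_shape)
assert head.in_channels == 128            # 512 // 4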
================================================
FILE: ppdet/modeling/architectures/yolox.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
import random
import paddle
import paddle.nn.functional as F
import paddle.distributed as dist
__all__ = ['YOLOX']
@register
class YOLOX(BaseArch):
"""
YOLOX network, see https://arxiv.org/abs/2107.08430
Args:
backbone (nn.Layer): backbone instance
neck (nn.Layer): neck instance
head (nn.Layer): head instance
for_mot (bool): whether used for MOT or not
        input_size (list[int]): initial input size; reset by self._preprocess() during multi-scale training
size_stride (int): stride of the size range
size_range (list[int]): multi-scale range for training
random_interval (int): interval of iter to change self._input_size
"""
__category__ = 'architecture'
def __init__(self,
backbone='CSPDarkNet',
neck='YOLOCSPPAN',
head='YOLOXHead',
for_mot=False,
input_size=[640, 640],
size_stride=32,
size_range=[15, 25],
random_interval=10):
super(YOLOX, self).__init__()
self.backbone = backbone
self.neck = neck
self.head = head
self.for_mot = for_mot
self.input_size = input_size
self._input_size = paddle.to_tensor(input_size)
self.size_stride = size_stride
self.size_range = size_range
self.random_interval = random_interval
self._step = 0
@classmethod
def from_config(cls, cfg, *args, **kwargs):
# backbone
backbone = create(cfg['backbone'])
        # neck
kwargs = {'input_shape': backbone.out_shape}
neck = create(cfg['neck'], **kwargs)
# head
kwargs = {'input_shape': neck.out_shape}
head = create(cfg['head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"head": head,
}
def _forward(self):
if self.training:
self._preprocess()
body_feats = self.backbone(self.inputs)
neck_feats = self.neck(body_feats, self.for_mot)
if self.training:
yolox_losses = self.head(neck_feats, self.inputs)
yolox_losses.update({'size': self._input_size[0]})
return yolox_losses
else:
head_outs = self.head(neck_feats)
bbox, bbox_num = self.head.post_process(
head_outs, self.inputs['im_shape'], self.inputs['scale_factor'])
return {'bbox': bbox, 'bbox_num': bbox_num}
def get_loss(self):
return self._forward()
def get_pred(self):
return self._forward()
def _preprocess(self):
        # YOLOX multi-scale training: resize the image batch (and gt boxes) by interpolation before it enters the network.
self._get_size()
scale_y = self._input_size[0] / self.input_size[0]
scale_x = self._input_size[1] / self.input_size[1]
if scale_x != 1 or scale_y != 1:
self.inputs['image'] = F.interpolate(
self.inputs['image'],
size=self._input_size,
mode='bilinear',
align_corners=False)
gt_bboxes = self.inputs['gt_bbox']
for i in range(len(gt_bboxes)):
if len(gt_bboxes[i]) > 0:
gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x
gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y
self.inputs['gt_bbox'] = gt_bboxes
def _get_size(self):
        # random_interval defaults to 10, i.e. self._input_size is re-sampled every 10 iterations
image_ratio = self.input_size[1] * 1.0 / self.input_size[0]
if self._step % self.random_interval == 0:
size_factor = random.randint(*self.size_range)
size = [
self.size_stride * size_factor,
self.size_stride * int(size_factor * image_ratio)
]
self._input_size = paddle.to_tensor(size)
self._step += 1
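With the defaults above (size_stride=32, size_range=[15, 25]), _get_size samples a new side length from {480, 512, ..., 800} every random_interval iterations. A paddle-free sketch of the same computation; sample_input_size is a hypothetical name:

import random

def sample_input_size(base_size=(640, 640), size_stride=32, size_range=(15, 25)):
    image_ratio = base_size[1] / base_size[0]     # w / h, 1.0 for square inputs
    size_factor = random.randint(*size_range)     # 15..25 inclusive
    h = size_stride * size_factor                 # 32 * 15 = 480 up to 32 * 25 = 800
    w = size_stride * int(size_factor * image_ratio)
    return [h, w]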
================================================
FILE: ppdet/modeling/assigners/__init__.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import utils
from . import task_aligned_assigner
from . import atss_assigner
from . import simota_assigner
from . import max_iou_assigner
from . import fcosr_assigner
from . import rotated_task_aligned_assigner
from . import task_aligned_assigner_cr
from . import uniform_assigner
from .utils import *
from .task_aligned_assigner import *
from .atss_assigner import *
from .simota_assigner import *
from .max_iou_assigner import *
from .fcosr_assigner import *
from .rotated_task_aligned_assigner import *
from .task_aligned_assigner_cr import *
from .uniform_assigner import *
from .hungarian_assigner import *
from .pose_utils import *
================================================
FILE: ppdet/modeling/assigners/atss_assigner.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ..bbox_utils import iou_similarity, batch_iou_similarity
from ..bbox_utils import bbox_center
from .utils import (check_points_inside_bboxes, compute_max_iou_anchor,
compute_max_iou_gt)
__all__ = ['ATSSAssigner']
@register
class ATSSAssigner(nn.Layer):
"""Bridging the Gap Between Anchor-based and Anchor-free Detection
via Adaptive Training Sample Selection
"""
__shared__ = ['num_classes']
def __init__(self,
topk=9,
num_classes=80,
force_gt_matching=False,
eps=1e-9,
sm_use=False):
super(ATSSAssigner, self).__init__()
self.topk = topk
self.num_classes = num_classes
self.force_gt_matching = force_gt_matching
self.eps = eps
self.sm_use = sm_use
def _gather_topk_pyramid(self, gt2anchor_distances, num_anchors_list,
pad_gt_mask):
gt2anchor_distances_list = paddle.split(
gt2anchor_distances, num_anchors_list, axis=-1)
num_anchors_index = np.cumsum(num_anchors_list).tolist()
num_anchors_index = [0, ] + num_anchors_index[:-1]
is_in_topk_list = []
topk_idxs_list = []
for distances, anchors_index in zip(gt2anchor_distances_list,
num_anchors_index):
num_anchors = distances.shape[-1]
_, topk_idxs = paddle.topk(
distances, self.topk, axis=-1, largest=False)
topk_idxs_list.append(topk_idxs + anchors_index)
is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(
axis=-2).astype(gt2anchor_distances.dtype)
is_in_topk_list.append(is_in_topk * pad_gt_mask)
is_in_topk_list = paddle.concat(is_in_topk_list, axis=-1)
topk_idxs_list = paddle.concat(topk_idxs_list, axis=-1)
return is_in_topk_list, topk_idxs_list
@paddle.no_grad()
def forward(self,
anchor_bboxes,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index,
gt_scores=None,
pred_bboxes=None):
r"""This code is based on
https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py
        The assignment is done in the following steps:
        1. compute the iou between all bboxes (bboxes of all pyramid levels) and gt
        2. compute the center distance between all bboxes and gt
        3. on each pyramid level, for each gt, select the k bboxes whose centers
           are closest to the gt center, so we select k*l bboxes in total as
           candidates for each gt
        4. get the corresponding ious for these candidates, and compute their
           mean and std; set mean + std as the iou threshold
        5. select the candidates whose iou is greater than or equal to
           the threshold as positive
        6. limit the positive samples' centers to lie inside the gt
        7. if an anchor box is assigned to multiple gts, the one with the
           highest iou will be selected.
Args:
            anchor_bboxes (Tensor, float32): pre-defined anchors, shape(L, 4),
                "xmin, ymin, xmax, ymax" format
num_anchors_list (List): num of anchors in each level
gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)
pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
bg_index (int): background index
            gt_scores (Tensor|None, float32): Score of gt_bboxes,
                shape(B, n, 1); if None, it will be initialized from the one-hot label
pred_bboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 4)
Returns:
assigned_labels (Tensor): (B, L)
assigned_bboxes (Tensor): (B, L, 4)
assigned_scores (Tensor): (B, L, C), if pred_bboxes is not None, then output ious
"""
assert gt_labels.ndim == gt_bboxes.ndim and \
gt_bboxes.ndim == 3
num_anchors, _ = anchor_bboxes.shape
batch_size, num_max_boxes, _ = gt_bboxes.shape
# negative batch
if num_max_boxes == 0:
assigned_labels = paddle.full(
[batch_size, num_anchors], bg_index, dtype='int32')
assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4])
assigned_scores = paddle.zeros(
[batch_size, num_anchors, self.num_classes])
return assigned_labels, assigned_bboxes, assigned_scores
# 1. compute iou between gt and anchor bbox, [B, n, L]
ious = iou_similarity(gt_bboxes.reshape([-1, 4]), anchor_bboxes)
ious = ious.reshape([batch_size, -1, num_anchors])
# 2. compute center distance between all anchors and gt, [B, n, L]
gt_centers = bbox_center(gt_bboxes.reshape([-1, 4])).unsqueeze(1)
anchor_centers = bbox_center(anchor_bboxes)
gt2anchor_distances = (gt_centers - anchor_centers.unsqueeze(0)) \
.norm(2, axis=-1).reshape([batch_size, -1, num_anchors])
# 3. on each pyramid level, selecting topk closest candidates
# based on the center distance, [B, n, L]
is_in_topk, topk_idxs = self._gather_topk_pyramid(
gt2anchor_distances, num_anchors_list, pad_gt_mask)
        # 4. get the corresponding ious for these candidates, and compute their
        # mean and std; 5. set mean + std as the iou threshold
iou_candidates = ious * is_in_topk
iou_threshold = paddle.index_sample(
iou_candidates.flatten(stop_axis=-2),
topk_idxs.flatten(stop_axis=-2))
iou_threshold = iou_threshold.reshape([batch_size, num_max_boxes, -1])
iou_threshold = iou_threshold.mean(axis=-1, keepdim=True) + \
iou_threshold.std(axis=-1, keepdim=True)
is_in_topk = paddle.where(iou_candidates > iou_threshold, is_in_topk,
paddle.zeros_like(is_in_topk))
# 6. check the positive sample's center in gt, [B, n, L]
if self.sm_use:
is_in_gts = check_points_inside_bboxes(
anchor_centers, gt_bboxes, sm_use=True)
else:
is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes)
# select positive sample, [B, n, L]
mask_positive = is_in_topk * is_in_gts * pad_gt_mask
# 7. if an anchor box is assigned to multiple gts,
# the one with the highest iou will be selected.
mask_positive_sum = mask_positive.sum(axis=-2)
if mask_positive_sum.max() > 1:
mask_multiple_gts = (
mask_positive_sum.unsqueeze(1) > 1).astype('int32').tile(
[1, num_max_boxes, 1]).astype('bool')
if self.sm_use:
is_max_iou = compute_max_iou_anchor(ious * mask_positive)
else:
is_max_iou = compute_max_iou_anchor(ious)
mask_positive = paddle.where(mask_multiple_gts, is_max_iou,
mask_positive)
mask_positive_sum = mask_positive.sum(axis=-2)
        # 8. make sure every gt_bbox matches at least one anchor
if self.force_gt_matching:
is_max_iou = compute_max_iou_gt(ious) * pad_gt_mask
mask_max_iou = (is_max_iou.sum(-2, keepdim=True) == 1).tile(
[1, num_max_boxes, 1])
mask_positive = paddle.where(mask_max_iou, is_max_iou,
mask_positive)
mask_positive_sum = mask_positive.sum(axis=-2)
assigned_gt_index = mask_positive.argmax(axis=-2)
# assigned target
batch_ind = paddle.arange(
end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)
assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype)
assigned_labels = paddle.gather(
gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)
assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
assigned_labels = paddle.where(
mask_positive_sum > 0, assigned_labels,
paddle.full_like(assigned_labels, bg_index))
assigned_bboxes = paddle.gather(
gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0)
assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])
assigned_scores = F.one_hot(assigned_labels, self.num_classes + 1)
ind = list(range(self.num_classes + 1))
ind.remove(bg_index)
assigned_scores = paddle.index_select(
assigned_scores, paddle.to_tensor(ind), axis=-1)
if pred_bboxes is not None:
# assigned iou
ious = batch_iou_similarity(gt_bboxes, pred_bboxes) * mask_positive
ious = ious.max(axis=-2).unsqueeze(-1)
assigned_scores *= ious
elif gt_scores is not None:
gather_scores = paddle.gather(
gt_scores.flatten(), assigned_gt_index.flatten(), axis=0)
gather_scores = gather_scores.reshape([batch_size, num_anchors])
gather_scores = paddle.where(mask_positive_sum > 0, gather_scores,
paddle.zeros_like(gather_scores))
assigned_scores *= gather_scores.unsqueeze(-1)
return assigned_labels, assigned_bboxes, assigned_scores
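A small numeric illustration (made-up IoUs, numpy only) of ATSS steps 4-5 above: the per-gt threshold is the mean plus the standard deviation of the candidate IoUs, so the bar adapts to each gt's candidate set:

import numpy as np

candidate_ious = np.array([0.62, 0.55, 0.48, 0.31, 0.12, 0.05])  # k*l candidates of one gt
threshold = candidate_ious.mean() + candidate_ious.std()
print(threshold.round(3), candidate_ious >= threshold)
# Tightly clustered high IoUs give a low std and keep most candidates;
# a spread-out set raises the threshold and keeps only the strongest.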
================================================
FILE: ppdet/modeling/assigners/clrnet_assigner.py
================================================
import paddle
import paddle.nn.functional as F
from ppdet.modeling.losses.clrnet_line_iou_loss import line_iou
def distance_cost(predictions, targets, img_w):
"""
    Repeat predictions and targets to generate all prediction-target combinations,
    then use the mean absolute distance over valid points as the distance cost.
"""
num_priors = predictions.shape[0]
num_targets = targets.shape[0]
predictions = paddle.repeat_interleave(
predictions, num_targets, axis=0)[..., 6:]
targets = paddle.concat(x=num_priors * [targets])[..., 6:]
invalid_masks = (targets < 0) | (targets >= img_w)
lengths = (~invalid_masks).sum(axis=1)
distances = paddle.abs(x=targets - predictions)
distances[invalid_masks] = 0.0
distances = distances.sum(axis=1) / (lengths.cast("float32") + 1e-09)
distances = distances.reshape([num_priors, num_targets])
return distances
def focal_cost(cls_pred, gt_labels, alpha=0.25, gamma=2, eps=1e-12):
"""
Args:
cls_pred (Tensor): Predicted classification logits, shape
[num_query, num_class].
gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
Returns:
        paddle.Tensor: cls_cost value
"""
cls_pred = F.sigmoid(cls_pred)
neg_cost = -(1 - cls_pred + eps).log() * (1 - alpha) * cls_pred.pow(gamma)
pos_cost = -(cls_pred + eps).log() * alpha * (1 - cls_pred).pow(gamma)
cls_cost = pos_cost.index_select(
gt_labels, axis=1) - neg_cost.index_select(
gt_labels, axis=1)
return cls_cost
def dynamic_k_assign(cost, pair_wise_ious):
"""
    Assign ground truths to priors dynamically.
Args:
cost: the assign cost.
        pair_wise_ious: iou between ground truths and priors.
Returns:
prior_idx: the index of assigned prior.
gt_idx: the corresponding ground truth index.
"""
matching_matrix = paddle.zeros_like(cost)
ious_matrix = pair_wise_ious
ious_matrix[ious_matrix < 0] = 0.0
n_candidate_k = 4
topk_ious, _ = paddle.topk(ious_matrix, n_candidate_k, axis=0)
dynamic_ks = paddle.clip(x=topk_ious.sum(0).cast("int32"), min=1)
num_gt = cost.shape[1]
for gt_idx in range(num_gt):
_, pos_idx = paddle.topk(
x=cost[:, gt_idx], k=dynamic_ks[gt_idx].item(), largest=False)
matching_matrix[pos_idx, gt_idx] = 1.0
del topk_ious, dynamic_ks, pos_idx
matched_gt = matching_matrix.sum(axis=1)
if (matched_gt > 1).sum() > 0:
matched_gt_indices = paddle.nonzero(matched_gt > 1)[:, 0]
cost_argmin = paddle.argmin(
cost.index_select(matched_gt_indices), axis=1)
        # zero the full rows of these priors first; chained indexing
        # (matrix[idx][0]) would only modify a temporary copy
        matching_matrix[matched_gt_indices, :] *= 0.0
matching_matrix[matched_gt_indices, cost_argmin] = 1.0
prior_idx = matching_matrix.sum(axis=1).nonzero()
gt_idx = matching_matrix[prior_idx].argmax(axis=-1)
return prior_idx.flatten(), gt_idx.flatten()
def cdist_paddle(x1, x2, p=2):
assert x1.shape[1] == x2.shape[1]
B, M = x1.shape
# if p == np.inf:
# dist = np.max(np.abs(x1[:, np.newaxis, :] - x2[np.newaxis, :, :]), axis=-1)
if p == 1:
dist = paddle.sum(
paddle.abs(x1.unsqueeze(axis=1) - x2.unsqueeze(axis=0)), axis=-1)
else:
dist = paddle.pow(paddle.sum(paddle.pow(
paddle.abs(x1.unsqueeze(axis=1) - x2.unsqueeze(axis=0)), p),
axis=-1),
1 / p)
return dist
def assign(predictions,
targets,
img_w,
img_h,
distance_cost_weight=3.0,
cls_cost_weight=1.0):
"""
    Computes dynamic matching based on the cost, including the cls cost and the lane similarity cost.
Args:
predictions (Tensor): predictions predicted by each stage, shape: (num_priors, 78)
targets (Tensor): lane targets, shape: (num_targets, 78)
    Returns:
        matched_row_inds (Tensor): indices of matched predictions, shape: (num_targets,)
        matched_col_inds (Tensor): indices of matched targets, shape: (num_targets,)
"""
predictions = predictions.detach().clone()
predictions[:, 3] *= img_w - 1
predictions[:, 6:] *= img_w - 1
targets = targets.detach().clone()
distances_score = distance_cost(predictions, targets, img_w)
distances_score = 1 - distances_score / paddle.max(x=distances_score) + 0.01
cls_score = focal_cost(predictions[:, :2], targets[:, 1].cast('int64'))
num_priors = predictions.shape[0]
num_targets = targets.shape[0]
target_start_xys = targets[:, 2:4]
target_start_xys[..., 0] *= (img_h - 1)
prediction_start_xys = predictions[:, 2:4]
prediction_start_xys[..., 0] *= (img_h - 1)
start_xys_score = cdist_paddle(
prediction_start_xys, target_start_xys,
p=2).reshape([num_priors, num_targets])
start_xys_score = 1 - start_xys_score / paddle.max(x=start_xys_score) + 0.01
target_thetas = targets[:, 4].unsqueeze(axis=-1)
theta_score = cdist_paddle(
predictions[:, 4].unsqueeze(axis=-1), target_thetas,
p=1).reshape([num_priors, num_targets]) * 180
theta_score = 1 - theta_score / paddle.max(x=theta_score) + 0.01
cost = -(distances_score * start_xys_score * theta_score
)**2 * distance_cost_weight + cls_score * cls_cost_weight
iou = line_iou(predictions[..., 6:], targets[..., 6:], img_w, aligned=False)
matched_row_inds, matched_col_inds = dynamic_k_assign(cost, iou)
return matched_row_inds, matched_col_inds
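The dynamic-k rule in dynamic_k_assign above derives each lane target's k from the sum of its top-n_candidate_k IoUs, clipped to at least 1. A numpy sketch of just that rule, with made-up IoU values:

import numpy as np

ious = np.array([[0.7, 0.2],
                 [0.6, 0.1],
                 [0.5, 0.1],
                 [0.4, 0.1],
                 [0.1, 0.0]])                        # [num_priors, num_targets]
topk = np.sort(ious, axis=0)[::-1][:4]               # top-4 IoUs per target
dynamic_ks = np.maximum(topk.sum(axis=0).astype(int), 1)
print(dynamic_ks)  # [2 1]: target 0 gets k=2 (sum 2.2), target 1 is clipped to k=1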
================================================
FILE: ppdet/modeling/assigners/fcosr_assigner.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ppdet.modeling.rbox_utils import box2corners, check_points_in_polys, paddle_gather
__all__ = ['FCOSRAssigner']
EPS = 1e-9
@register
class FCOSRAssigner(nn.Layer):
""" FCOSR Assigner, refer to https://arxiv.org/abs/2111.10780 for details
1. compute normalized gaussian distribution score and refined gaussian distribution score
2. refer to ellipse center sampling, sample points whose normalized gaussian distribution score is greater than threshold
    3. refer to multi-level sampling, assign the ground truth to a feature map that satisfies two conditions:
        i).  the ratio between the short edge of the target and the stride of the feature map is less than 2;
        ii). the long edge of the minimum bounding rectangle of the target is larger than the acceptance range of the feature map
    4. refer to fuzzy sample label assignment, the points satisfying 2 and 3 are assigned to the ground truth according to the gaussian distribution score
"""
__shared__ = ['num_classes']
def __init__(self,
num_classes=80,
factor=12,
threshold=0.23,
boundary=[[-1, 128], [128, 320], [320, 10000]],
score_type='iou'):
super(FCOSRAssigner, self).__init__()
self.num_classes = num_classes
self.factor = factor
self.threshold = threshold
self.boundary = [
paddle.to_tensor(
l, dtype=paddle.float32).reshape([1, 1, 2]) for l in boundary
]
self.score_type = score_type
def get_gaussian_distribution_score(self, points, gt_rboxes, gt_polys):
# projecting points to coordinate system defined by each rbox
# [B, N, 4, 2] -> 4 * [B, N, 1, 2]
a, b, c, d = gt_polys.split(4, axis=2)
# [1, L, 2] -> [1, 1, L, 2]
points = points.unsqueeze(0)
ab = b - a
ad = d - a
# [B, N, 5] -> [B, N, 2], [B, N, 2], [B, N, 1]
xy, wh, angle = gt_rboxes.split([2, 2, 1], axis=-1)
# [B, N, 2] -> [B, N, 1, 2]
xy = xy.unsqueeze(2)
# vector of points to center [B, N, L, 2]
vec = points - xy
        # dot(vec, ab) = |ab| * |vec| * cos(theta)  [B, N, L]
vec_dot_ab = paddle.sum(vec * ab, axis=-1)
        # dot(vec, ad) = |ad| * |vec| * cos(theta)  [B, N, L]
vec_dot_ad = paddle.sum(vec * ad, axis=-1)
# norm_ab [B, N, L]
norm_ab = paddle.sum(ab * ab, axis=-1).sqrt()
# norm_ad [B, N, L]
norm_ad = paddle.sum(ad * ad, axis=-1).sqrt()
# min(h, w), [B, N, 1]
min_edge = paddle.min(wh, axis=-1, keepdim=True)
# delta_x, delta_y [B, N, L]
delta_x = vec_dot_ab.pow(2) / (norm_ab.pow(3) * min_edge + EPS)
delta_y = vec_dot_ad.pow(2) / (norm_ad.pow(3) * min_edge + EPS)
# score [B, N, L]
norm_score = paddle.exp(-0.5 * self.factor * (delta_x + delta_y))
# simplified calculation
sigma = min_edge / self.factor
refined_score = norm_score / (2 * np.pi * sigma + EPS)
return norm_score, refined_score
def get_rotated_inside_mask(self, points, gt_polys, scores):
inside_mask = check_points_in_polys(points, gt_polys)
center_mask = scores >= self.threshold
return (inside_mask & center_mask).cast(paddle.float32)
def get_inside_range_mask(self, points, gt_bboxes, gt_rboxes, stride_tensor,
regress_range):
# [1, L, 2] -> [1, 1, L, 2]
points = points.unsqueeze(0)
# [B, n, 4] -> [B, n, 1, 4]
x1y1, x2y2 = gt_bboxes.unsqueeze(2).split(2, axis=-1)
# [B, n, L, 2]
lt = points - x1y1
rb = x2y2 - points
# [B, n, L, 4]
ltrb = paddle.concat([lt, rb], axis=-1)
# [B, n, L, 4] -> [B, n, L]
inside_mask = paddle.min(ltrb, axis=-1) > EPS
# regress_range [1, L, 2] -> [1, 1, L, 2]
regress_range = regress_range.unsqueeze(0)
# stride_tensor [1, L, 1] -> [1, 1, L]
stride_tensor = stride_tensor.transpose((0, 2, 1))
# fcos range
# [B, n, L, 4] -> [B, n, L]
ltrb_max = paddle.max(ltrb, axis=-1)
# [1, 1, L, 2] -> [1, 1, L]
low, high = regress_range[..., 0], regress_range[..., 1]
# [B, n, L]
regress_mask = (ltrb_max >= low) & (ltrb_max <= high)
# mask for rotated
# [B, n, 1]
min_edge = paddle.min(gt_rboxes[..., 2:4], axis=-1, keepdim=True)
# [B, n , L]
rotated_mask = ((min_edge / stride_tensor) < 2.0) & (ltrb_max > high)
mask = inside_mask & (regress_mask | rotated_mask)
return mask.cast(paddle.float32)
@paddle.no_grad()
def forward(self,
anchor_points,
stride_tensor,
num_anchors_list,
gt_labels,
gt_bboxes,
gt_rboxes,
pad_gt_mask,
bg_index,
pred_rboxes=None):
r"""
Args:
anchor_points (Tensor, float32): pre-defined anchor points, shape(1, L, 2),
"x, y" format
stride_tensor (Tensor, float32): stride tensor, shape (1, L, 1)
num_anchors_list (List): num of anchors in each level
gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)
gt_rboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 5)
pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
bg_index (int): background index
pred_rboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 5)
Returns:
assigned_labels (Tensor): (B, L)
assigned_rboxes (Tensor): (B, L, 5)
assigned_scores (Tensor): (B, L, C), if pred_rboxes is not None, then output ious
"""
_, num_anchors, _ = anchor_points.shape
batch_size, num_max_boxes, _ = gt_rboxes.shape
if num_max_boxes == 0:
assigned_labels = paddle.full(
[batch_size, num_anchors], bg_index, dtype=gt_labels.dtype)
assigned_rboxes = paddle.zeros([batch_size, num_anchors, 5])
assigned_scores = paddle.zeros(
[batch_size, num_anchors, self.num_classes])
return assigned_labels, assigned_rboxes, assigned_scores
# get normalized gaussian distribution score and refined distribution score
gt_polys = box2corners(gt_rboxes)
score, refined_score = self.get_gaussian_distribution_score(
anchor_points, gt_rboxes, gt_polys)
inside_mask = self.get_rotated_inside_mask(anchor_points, gt_polys,
score)
regress_ranges = []
for num, bound in zip(num_anchors_list, self.boundary):
regress_ranges.append(bound.tile((1, num, 1)))
regress_ranges = paddle.concat(regress_ranges, axis=1)
regress_mask = self.get_inside_range_mask(
anchor_points, gt_bboxes, gt_rboxes, stride_tensor, regress_ranges)
# [B, n, L]
mask_positive = inside_mask * regress_mask * pad_gt_mask
refined_score = refined_score * mask_positive - (1. - mask_positive)
argmax_refined_score = refined_score.argmax(axis=-2)
max_refined_score = refined_score.max(axis=-2)
assigned_gt_index = argmax_refined_score
# assigned target
batch_ind = paddle.arange(
end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)
assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype)
assigned_labels = paddle.gather(
gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)
assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
assigned_labels = paddle.where(
max_refined_score > 0, assigned_labels,
paddle.full_like(assigned_labels, bg_index))
assigned_rboxes = paddle.gather(
gt_rboxes.reshape([-1, 5]), assigned_gt_index.flatten(), axis=0)
assigned_rboxes = assigned_rboxes.reshape([batch_size, num_anchors, 5])
assigned_scores = F.one_hot(assigned_labels, self.num_classes + 1)
ind = list(range(self.num_classes + 1))
ind.remove(bg_index)
assigned_scores = paddle.index_select(
assigned_scores, paddle.to_tensor(ind), axis=-1)
if self.score_type == 'gaussian':
selected_scores = paddle_gather(
score, 1, argmax_refined_score.unsqueeze(-2)).squeeze(-2)
assigned_scores = assigned_scores * selected_scores.unsqueeze(-1)
elif self.score_type == 'iou':
assert pred_rboxes is not None, 'If score type is iou, pred_rboxes should not be None'
from ext_op import matched_rbox_iou
b, l = pred_rboxes.shape[:2]
iou_score = matched_rbox_iou(
pred_rboxes.reshape((-1, 5)), assigned_rboxes.reshape(
(-1, 5))).reshape((b, l, 1))
assigned_scores = assigned_scores * iou_score
return assigned_labels, assigned_rboxes, assigned_scores
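A scalar sanity check on get_gaussian_distribution_score above: with score = exp(-0.5 * factor * (delta_x + delta_y)), a point exactly at the rbox center has delta_x = delta_y = 0 and scores 1.0, and the score decays with the normalized squared offsets (the delta values below are made up):

import math

factor = 12.0
for delta_x, delta_y in [(0.0, 0.0), (0.02, 0.01), (0.1, 0.1)]:
    print(delta_x, delta_y, round(math.exp(-0.5 * factor * (delta_x + delta_y)), 4))
# 1.0, then ~0.835, then ~0.301; get_rotated_inside_mask keeps only points
# whose normalized score clears self.threshold (0.23 by default).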
================================================
FILE: ppdet/modeling/assigners/hungarian_assigner.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
try:
from scipy.optimize import linear_sum_assignment
except ImportError:
linear_sum_assignment = None
import paddle
from ppdet.core.workspace import register
__all__ = ['PoseHungarianAssigner', 'PseudoSampler']
class AssignResult:
"""Stores assignments between predicted and truth boxes.
Attributes:
num_gts (int): the number of truth boxes considered when computing this
assignment
        gt_inds (Tensor, int64): for each predicted box, the 1-based
            index of the assigned truth box; 0 means unassigned and -1 means
            ignore.
        max_overlaps (Tensor, float32): the iou between the predicted box and its
            assigned truth box.
        labels (None | Tensor, int64): if specified, for each predicted box
            indicates the category label of the assigned truth box.
"""
def __init__(self, num_gts, gt_inds, max_overlaps, labels=None):
self.num_gts = num_gts
self.gt_inds = gt_inds
self.max_overlaps = max_overlaps
self.labels = labels
# Interface for possible user-defined properties
self._extra_properties = {}
@property
def num_preds(self):
"""int: the number of predictions in this assignment"""
return len(self.gt_inds)
def set_extra_property(self, key, value):
"""Set user-defined new property."""
assert key not in self.info
self._extra_properties[key] = value
def get_extra_property(self, key):
"""Get user-defined property."""
return self._extra_properties.get(key, None)
@property
def info(self):
"""dict: a dictionary of info about the object"""
basic_info = {
'num_gts': self.num_gts,
'num_preds': self.num_preds,
'gt_inds': self.gt_inds,
'max_overlaps': self.max_overlaps,
'labels': self.labels,
}
basic_info.update(self._extra_properties)
return basic_info
@register
class PoseHungarianAssigner:
"""Computes one-to-one matching between predictions and ground truth.
This class computes an assignment between the targets and the predictions
    based on the costs. The costs are a weighted sum of three components:
    classification cost, regression L1 cost and regression OKS cost. The
    targets don't include no_object, so generally there are more
    predictions than targets. After the one-to-one matching, the un-matched
    predictions are treated as background. Thus each query prediction will be assigned
with `0` or a positive integer indicating the ground truth index:
- 0: negative sample, no assigned gt.
- positive integer: positive sample, index (1-based) of assigned gt.
Args:
cls_weight (int | float, optional): The scale factor for classification
cost. Default 1.0.
kpt_weight (int | float, optional): The scale factor for regression
L1 cost. Default 1.0.
oks_weight (int | float, optional): The scale factor for regression
oks cost. Default 1.0.
"""
__inject__ = ['cls_cost', 'kpt_cost', 'oks_cost']
def __init__(self,
cls_cost='ClassificationCost',
kpt_cost='KptL1Cost',
oks_cost='OksCost'):
self.cls_cost = cls_cost
self.kpt_cost = kpt_cost
self.oks_cost = oks_cost
def assign(self,
cls_pred,
kpt_pred,
gt_labels,
gt_keypoints,
gt_areas,
img_meta,
eps=1e-7):
"""Computes one-to-one matching based on the weighted costs.
This method assign each query prediction to a ground truth or
background. The `assigned_gt_inds` with -1 means don't care,
0 means negative sample, and positive number is the index (1-based)
of assigned gt.
The assignment is done in the following steps, the order matters.
1. assign every prediction to -1
2. compute the weighted costs
3. do Hungarian matching on CPU based on the costs
4. assign all to 0 (background) first, then for each matched pair
between predictions and gts, treat this prediction as foreground
and assign the corresponding gt index (plus 1) to it.
Args:
cls_pred (Tensor): Predicted classification logits, shape
[num_query, num_class].
kpt_pred (Tensor): Predicted keypoints with normalized coordinates
(x_{i}, y_{i}), which are all in range [0, 1]. Shape
[num_query, K*2].
gt_labels (Tensor): Label of `gt_keypoints`, shape (num_gt,).
gt_keypoints (Tensor): Ground truth keypoints with unnormalized
coordinates [p^{1}_x, p^{1}_y, p^{1}_v, ..., \
p^{K}_x, p^{K}_y, p^{K}_v]. Shape [num_gt, K*3].
gt_areas (Tensor): Ground truth mask areas, shape (num_gt,).
img_meta (dict): Meta information for current image.
eps (int | float, optional): A value added to the denominator for
numerical stability. Default 1e-7.
Returns:
:obj:`AssignResult`: The assigned result.
"""
num_gts, num_kpts = gt_keypoints.shape[0], kpt_pred.shape[0]
if not gt_keypoints.astype('bool').any():
num_gts = 0
# 1. assign -1 by default
assigned_gt_inds = paddle.full((num_kpts, ), -1, dtype="int64")
assigned_labels = paddle.full((num_kpts, ), -1, dtype="int64")
if num_gts == 0 or num_kpts == 0:
# No ground truth or keypoints, return empty assignment
if num_gts == 0:
# No ground truth, assign all to background
assigned_gt_inds[:] = 0
return AssignResult(
num_gts, assigned_gt_inds, None, labels=assigned_labels)
img_h, img_w, _ = img_meta['img_shape']
factor = paddle.to_tensor(
[img_w, img_h, img_w, img_h], dtype=gt_keypoints.dtype).reshape(
(1, -1))
# 2. compute the weighted costs
# classification cost
cls_cost = self.cls_cost(cls_pred, gt_labels)
# keypoint regression L1 cost
gt_keypoints_reshape = gt_keypoints.reshape((gt_keypoints.shape[0], -1,
3))
valid_kpt_flag = gt_keypoints_reshape[..., -1]
kpt_pred_tmp = kpt_pred.clone().detach().reshape((kpt_pred.shape[0], -1,
2))
normalize_gt_keypoints = gt_keypoints_reshape[
..., :2] / factor[:, :2].unsqueeze(0)
kpt_cost = self.kpt_cost(kpt_pred_tmp, normalize_gt_keypoints,
valid_kpt_flag)
# keypoint OKS cost
kpt_pred_tmp = kpt_pred.clone().detach().reshape((kpt_pred.shape[0], -1,
2))
kpt_pred_tmp = kpt_pred_tmp * factor[:, :2].unsqueeze(0)
oks_cost = self.oks_cost(kpt_pred_tmp, gt_keypoints_reshape[..., :2],
valid_kpt_flag, gt_areas)
# weighted sum of above three costs
cost = cls_cost + kpt_cost + oks_cost
# 3. do Hungarian matching on CPU using linear_sum_assignment
cost = cost.detach().cpu()
if linear_sum_assignment is None:
raise ImportError('Please run "pip install scipy" '
'to install scipy first.')
matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
matched_row_inds = paddle.to_tensor(matched_row_inds)
matched_col_inds = paddle.to_tensor(matched_col_inds)
# 4. assign backgrounds and foregrounds
# assign all indices to backgrounds first
assigned_gt_inds[:] = 0
# assign foregrounds based on matching results
assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
assigned_labels[matched_row_inds] = gt_labels[matched_col_inds][
..., 0].astype("int64")
return AssignResult(
num_gts, assigned_gt_inds, None, labels=assigned_labels)
class SamplingResult:
"""Bbox sampling result.
"""
def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result,
gt_flags):
self.pos_inds = pos_inds
self.neg_inds = neg_inds
if pos_inds.size > 0:
self.pos_bboxes = bboxes[pos_inds]
self.neg_bboxes = bboxes[neg_inds]
self.pos_is_gt = gt_flags[pos_inds]
self.num_gts = gt_bboxes.shape[0]
self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
if gt_bboxes.numel() == 0:
# hack for index error case
assert self.pos_assigned_gt_inds.numel() == 0
self.pos_gt_bboxes = paddle.zeros(
gt_bboxes.shape, dtype=gt_bboxes.dtype).reshape((-1, 4))
else:
if len(gt_bboxes.shape) < 2:
gt_bboxes = gt_bboxes.reshape((-1, 4))
self.pos_gt_bboxes = paddle.index_select(
gt_bboxes,
self.pos_assigned_gt_inds.astype('int64'),
axis=0)
if assign_result.labels is not None:
self.pos_gt_labels = assign_result.labels[pos_inds]
else:
self.pos_gt_labels = None
@property
def bboxes(self):
"""paddle.Tensor: concatenated positive and negative boxes"""
return paddle.concat([self.pos_bboxes, self.neg_bboxes])
def __nice__(self):
data = self.info.copy()
data['pos_bboxes'] = data.pop('pos_bboxes').shape
data['neg_bboxes'] = data.pop('neg_bboxes').shape
parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())]
body = ' ' + ',\n '.join(parts)
return '{\n' + body + '\n}'
@property
def info(self):
"""Returns a dictionary of info about the object."""
return {
'pos_inds': self.pos_inds,
'neg_inds': self.neg_inds,
'pos_bboxes': self.pos_bboxes,
'neg_bboxes': self.neg_bboxes,
'pos_is_gt': self.pos_is_gt,
'num_gts': self.num_gts,
'pos_assigned_gt_inds': self.pos_assigned_gt_inds,
}
@register
class PseudoSampler:
"""A pseudo sampler that does not do sampling actually."""
def __init__(self, **kwargs):
pass
def _sample_pos(self, **kwargs):
"""Sample positive samples."""
raise NotImplementedError
def _sample_neg(self, **kwargs):
"""Sample negative samples."""
raise NotImplementedError
def sample(self, assign_result, bboxes, gt_bboxes, *args, **kwargs):
"""Directly returns the positive and negative indices of samples.
Args:
assign_result (:obj:`AssignResult`): Assigned results
bboxes (paddle.Tensor): Bounding boxes
gt_bboxes (paddle.Tensor): Ground truth boxes
Returns:
:obj:`SamplingResult`: sampler results
"""
pos_inds = paddle.nonzero(
assign_result.gt_inds > 0, as_tuple=False).squeeze(-1)
neg_inds = paddle.nonzero(
assign_result.gt_inds == 0, as_tuple=False).squeeze(-1)
gt_flags = paddle.zeros([bboxes.shape[0]], dtype='int32')
sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,
assign_result, gt_flags)
return sampling_result
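Step 3 of PoseHungarianAssigner.assign delegates the one-to-one matching to scipy's linear_sum_assignment. A minimal standalone example on a made-up 3x2 cost matrix:

import numpy as np
from scipy.optimize import linear_sum_assignment

cost = np.array([[0.9, 0.1],
                 [0.4, 0.8],
                 [0.3, 0.7]])             # [num_preds, num_gts]
rows, cols = linear_sum_assignment(cost)  # minimizes the summed cost
print(list(zip(rows, cols)))              # [(0, 1), (2, 0)]: preds 0 and 2 are matched
# assign() then stores gt index + 1 for matched predictions and 0 (background)
# for everything else.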
================================================
FILE: ppdet/modeling/assigners/max_iou_assigner.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register
from ppdet.modeling.proposal_generator.target import label_box
__all__ = ['MaxIoUAssigner']
@register
class MaxIoUAssigner(object):
"""a standard bbox assigner based on max IoU, use ppdet's label_box
as backend.
Args:
positive_overlap (float): threshold for defining positive samples
        negative_overlap (float): threshold for defining negative samples
allow_low_quality (bool): whether to lower IoU thr if a GT poorly
overlaps with candidate bboxes
"""
def __init__(self,
positive_overlap,
negative_overlap,
allow_low_quality=True):
self.positive_overlap = positive_overlap
self.negative_overlap = negative_overlap
self.allow_low_quality = allow_low_quality
def __call__(self, bboxes, gt_bboxes):
matches, match_labels = label_box(
bboxes,
gt_bboxes,
positive_overlap=self.positive_overlap,
negative_overlap=self.negative_overlap,
allow_low_quality=self.allow_low_quality,
ignore_thresh=-1,
is_crowd=None,
assign_on_cpu=False)
return matches, match_labels
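A numpy-only sketch of the thresholding rule that label_box applies for MaxIoUAssigner: boxes whose best IoU clears positive_overlap become positives, those below negative_overlap become negatives, and the band in between is ignored. label_by_max_iou is a hypothetical helper; the real label_box also handles crowd boxes and low-quality matches:

import numpy as np

def label_by_max_iou(max_ious, positive_overlap=0.5, negative_overlap=0.4):
    labels = np.full_like(max_ious, -1.0)      # -1 = ignored
    labels[max_ious >= positive_overlap] = 1.0  # positive
    labels[max_ious < negative_overlap] = 0.0   # negative
    return labels

print(label_by_max_iou(np.array([0.72, 0.45, 0.10])))  # [ 1. -1.  0.]
# allow_low_quality additionally promotes each gt's best-overlapping box
# to positive even when it misses the threshold; omitted here.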
================================================
FILE: ppdet/modeling/assigners/pose_utils.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
import paddle.nn.functional as F
from ppdet.core.workspace import register
__all__ = ['KptL1Cost', 'OksCost', 'ClassificationCost']
def masked_fill(x, mask, value):
y = paddle.full(x.shape, value, x.dtype)
return paddle.where(mask, y, x)
@register
class KptL1Cost(object):
"""KptL1Cost.
this function based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py
Args:
weight (int | float, optional): loss_weight.
"""
def __init__(self, weight=1.0):
self.weight = weight
def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag):
"""
Args:
kpt_pred (Tensor): Predicted keypoints with normalized coordinates
(x_{i}, y_{i}), which are all in range [0, 1]. Shape
[num_query, K, 2].
gt_keypoints (Tensor): Ground truth keypoints with normalized
coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2].
valid_kpt_flag (Tensor): valid flag of ground truth keypoints.
Shape [num_gt, K].
Returns:
paddle.Tensor: kpt_cost value with weight.
"""
kpt_cost = []
        for i in range(len(gt_keypoints)):
            if gt_keypoints[i].size == 0:
                kpt_cost.append(kpt_pred.sum() * 0)
                continue  # skip the distance computation for empty gts
kpt_pred_tmp = kpt_pred.clone()
valid_flag = valid_kpt_flag[i] > 0
valid_flag_expand = valid_flag.unsqueeze(0).unsqueeze(-1).expand_as(
kpt_pred_tmp)
if not valid_flag_expand.all():
kpt_pred_tmp = masked_fill(kpt_pred_tmp, ~valid_flag_expand, 0)
cost = F.pairwise_distance(
kpt_pred_tmp.reshape((kpt_pred_tmp.shape[0], -1)),
gt_keypoints[i].reshape((-1, )).unsqueeze(0),
p=1,
keepdim=True)
avg_factor = paddle.clip(
valid_flag.astype('float32').sum() * 2, 1.0)
cost = cost / avg_factor
kpt_cost.append(cost)
kpt_cost = paddle.concat(kpt_cost, axis=1)
return kpt_cost * self.weight
@register
class OksCost(object):
"""OksCost.
this function based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py
Args:
num_keypoints (int): number of keypoints
weight (int | float, optional): loss_weight.
"""
def __init__(self, num_keypoints=17, weight=1.0):
self.weight = weight
if num_keypoints == 17:
self.sigmas = np.array(
[
.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,
1.07, .87, .87, .89, .89
],
dtype=np.float32) / 10.0
elif num_keypoints == 14:
self.sigmas = np.array(
[
.79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89,
.89, .79, .79
],
dtype=np.float32) / 10.0
else:
raise ValueError(f'Unsupported keypoints number {num_keypoints}')
def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag, gt_areas):
"""
Args:
kpt_pred (Tensor): Predicted keypoints with unnormalized
coordinates (x_{i}, y_{i}). Shape [num_query, K, 2].
gt_keypoints (Tensor): Ground truth keypoints with unnormalized
coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2].
valid_kpt_flag (Tensor): valid flag of ground truth keypoints.
Shape [num_gt, K].
gt_areas (Tensor): Ground truth mask areas. Shape [num_gt,].
Returns:
paddle.Tensor: oks_cost value with weight.
"""
sigmas = paddle.to_tensor(self.sigmas)
variances = (sigmas * 2)**2
oks_cost = []
assert len(gt_keypoints) == len(gt_areas)
        for i in range(len(gt_keypoints)):
            if gt_keypoints[i].size == 0:
                oks_cost.append(kpt_pred.sum() * 0)
                continue  # skip the OKS computation for empty gts
squared_distance = \
(kpt_pred[:, :, 0] - gt_keypoints[i, :, 0].unsqueeze(0)) ** 2 + \
(kpt_pred[:, :, 1] - gt_keypoints[i, :, 1].unsqueeze(0)) ** 2
vis_flag = (valid_kpt_flag[i] > 0).astype('int')
vis_ind = vis_flag.nonzero(as_tuple=False)[:, 0]
num_vis_kpt = vis_ind.shape[0]
# assert num_vis_kpt > 0
if num_vis_kpt == 0:
oks_cost.append(paddle.zeros((squared_distance.shape[0], 1)))
continue
area = gt_areas[i]
squared_distance0 = squared_distance / (area * variances * 2)
squared_distance0 = paddle.index_select(
squared_distance0, vis_ind, axis=1)
squared_distance1 = paddle.exp(-squared_distance0).sum(axis=1,
keepdim=True)
oks = squared_distance1 / num_vis_kpt
# The 1 is a constant that doesn't change the matching, so omitted.
oks_cost.append(-oks)
oks_cost = paddle.concat(oks_cost, axis=1)
return oks_cost * self.weight
@register
class ClassificationCost:
"""ClsSoftmaxCost.
Args:
weight (int | float, optional): loss_weight
"""
def __init__(self, weight=1.):
self.weight = weight
def __call__(self, cls_pred, gt_labels):
"""
Args:
cls_pred (Tensor): Predicted classification logits, shape
(num_query, num_class).
gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
Returns:
paddle.Tensor: cls_cost value with weight
"""
        # Following the official DETR repo: unlike the loss, where NLL is
        # used, we approximate the cost with 1 - cls_score[gt_label].
        # The 1 is a constant that doesn't change the matching,
        # so it can be omitted.
cls_score = cls_pred.softmax(-1)
cls_cost = -cls_score[:, gt_labels]
return cls_cost * self.weight
@register
class FocalLossCost:
"""FocalLossCost.
Args:
weight (int | float, optional): loss_weight
alpha (int | float, optional): focal_loss alpha
gamma (int | float, optional): focal_loss gamma
eps (float, optional): default 1e-12
binary_input (bool, optional): Whether the input is binary,
default False.
"""
def __init__(self,
weight=1.,
alpha=0.25,
gamma=2,
eps=1e-12,
binary_input=False):
self.weight = weight
self.alpha = alpha
self.gamma = gamma
self.eps = eps
self.binary_input = binary_input
def _focal_loss_cost(self, cls_pred, gt_labels):
"""
Args:
cls_pred (Tensor): Predicted classification logits, shape
(num_query, num_class).
gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
Returns:
paddle.Tensor: cls_cost value with weight
"""
if gt_labels.size == 0:
return cls_pred.sum() * 0
cls_pred = F.sigmoid(cls_pred)
neg_cost = -(1 - cls_pred + self.eps).log() * (
1 - self.alpha) * cls_pred.pow(self.gamma)
pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
1 - cls_pred).pow(self.gamma)
cls_cost = paddle.index_select(
pos_cost, gt_labels, axis=1) - paddle.index_select(
neg_cost, gt_labels, axis=1)
return cls_cost * self.weight
def _mask_focal_loss_cost(self, cls_pred, gt_labels):
"""
Args:
            cls_pred (Tensor): Predicted classification logits
                in shape (num_query, d1, ..., dn), dtype=paddle.float32.
            gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn),
                dtype=paddle.int64. Labels should be binary.
Returns:
Tensor: Focal cost matrix with weight in shape\
(num_query, num_gt).
"""
cls_pred = cls_pred.flatten(1)
        gt_labels = gt_labels.flatten(1).astype('float32')  # paddle tensors use astype, not .float()
n = cls_pred.shape[1]
cls_pred = F.sigmoid(cls_pred)
neg_cost = -(1 - cls_pred + self.eps).log() * (
1 - self.alpha) * cls_pred.pow(self.gamma)
pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
1 - cls_pred).pow(self.gamma)
cls_cost = paddle.einsum('nc,mc->nm', pos_cost, gt_labels) + \
paddle.einsum('nc,mc->nm', neg_cost, (1 - gt_labels))
return cls_cost / n * self.weight
def __call__(self, cls_pred, gt_labels):
"""
Args:
            cls_pred (Tensor): Predicted classification logits.
            gt_labels (Tensor): Labels.
Returns:
Tensor: Focal cost matrix with weight in shape\
(num_query, num_gt).
"""
if self.binary_input:
return self._mask_focal_loss_cost(cls_pred, gt_labels)
else:
return self._focal_loss_cost(cls_pred, gt_labels)
================================================
FILE: ppdet/modeling/assigners/rotated_task_aligned_assigner.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ..rbox_utils import rotated_iou_similarity, check_points_in_rotated_boxes
from .utils import gather_topk_anchors, compute_max_iou_anchor
__all__ = ['RotatedTaskAlignedAssigner']
@register
class RotatedTaskAlignedAssigner(nn.Layer):
"""TOOD: Task-aligned One-stage Object Detection
"""
def __init__(self, topk=13, alpha=1.0, beta=6.0, eps=1e-9):
super(RotatedTaskAlignedAssigner, self).__init__()
self.topk = topk
self.alpha = alpha
self.beta = beta
self.eps = eps
@paddle.no_grad()
def forward(self,
pred_scores,
pred_bboxes,
anchor_points,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index,
gt_scores=None):
r"""This code is based on
https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py
The assignment is done in following steps
1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt
2. select top-k bbox as candidates for each gt
        3. limit the positive samples' centers to lie inside the gt (because an
           anchor-free detector can only predict positive distances)
4. if an anchor box is assigned to multiple gts, the one with the
highest iou will be selected.
Args:
pred_scores (Tensor, float32): predicted class probability, shape(B, L, C)
pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 5)
anchor_points (Tensor, float32): pre-defined anchors, shape(1, L, 2), "cxcy" format
num_anchors_list (List): num of anchors in each level, shape(L)
gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 5)
pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
bg_index (int): background index
gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1)
Returns:
assigned_labels (Tensor): (B, L)
assigned_bboxes (Tensor): (B, L, 5)
assigned_scores (Tensor): (B, L, C)
"""
assert pred_scores.ndim == pred_bboxes.ndim
assert gt_labels.ndim == gt_bboxes.ndim and \
gt_bboxes.ndim == 3
batch_size, num_anchors, num_classes = pred_scores.shape
_, num_max_boxes, _ = gt_bboxes.shape
# negative batch
if num_max_boxes == 0:
assigned_labels = paddle.full(
[batch_size, num_anchors], bg_index, dtype=gt_labels.dtype)
assigned_bboxes = paddle.zeros([batch_size, num_anchors, 5])
assigned_scores = paddle.zeros(
[batch_size, num_anchors, num_classes])
return assigned_labels, assigned_bboxes, assigned_scores
# compute iou between gt and pred bbox, [B, n, L]
ious = rotated_iou_similarity(gt_bboxes, pred_bboxes)
ious = paddle.where(ious > 1 + self.eps, paddle.zeros_like(ious), ious)
ious.stop_gradient = True
# gather pred bboxes class score
pred_scores = pred_scores.transpose([0, 2, 1])
batch_ind = paddle.arange(
end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)
gt_labels_ind = paddle.stack(
[batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)],
axis=-1)
bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind)
# compute alignment metrics, [B, n, L]
alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow(
self.beta)
# check the positive sample's center in gt, [B, n, L]
is_in_gts = check_points_in_rotated_boxes(anchor_points, gt_bboxes)
# select topk largest alignment metrics pred bbox as candidates
# for each gt, [B, n, L]
is_in_topk = gather_topk_anchors(
alignment_metrics * is_in_gts.astype(alignment_metrics.dtype), self.topk, topk_mask=pad_gt_mask)
# select positive sample, [B, n, L]
mask_positive = is_in_topk * is_in_gts.astype(is_in_topk.dtype) * pad_gt_mask
# if an anchor box is assigned to multiple gts,
# the one with the highest iou will be selected, [B, n, L]
mask_positive_sum = mask_positive.sum(axis=-2)
if mask_positive_sum.max() > 1:
mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile(
[1, num_max_boxes, 1])
is_max_iou = compute_max_iou_anchor(ious)
mask_positive = paddle.where(mask_multiple_gts, is_max_iou,
mask_positive)
mask_positive_sum = mask_positive.sum(axis=-2)
assigned_gt_index = mask_positive.argmax(axis=-2)
# assigned target
assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype)
assigned_labels = paddle.gather(
gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)
assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
assigned_labels = paddle.where(
mask_positive_sum > 0, assigned_labels,
paddle.full_like(assigned_labels, bg_index))
assigned_bboxes = paddle.gather(
gt_bboxes.reshape([-1, 5]), assigned_gt_index.flatten(), axis=0)
assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 5])
assigned_scores = F.one_hot(assigned_labels, num_classes + 1)
ind = list(range(num_classes + 1))
ind.remove(bg_index)
assigned_scores = paddle.index_select(
assigned_scores, paddle.to_tensor(ind), axis=-1)
# rescale alignment metrics
alignment_metrics *= mask_positive
max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True)
max_ious_per_instance = (ious * mask_positive).max(axis=-1,
keepdim=True)
alignment_metrics = alignment_metrics / (
max_metrics_per_instance + self.eps) * max_ious_per_instance
alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1)
assigned_scores = assigned_scores * alignment_metrics
assigned_bboxes.stop_gradient = True
assigned_scores.stop_gradient = True
assigned_labels.stop_gradient = True
return assigned_labels, assigned_bboxes, assigned_scores
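A scalar sketch of the task-alignment metric used above, t = s**alpha * u**beta with alpha=1 and beta=6 by default, where s is the predicted class probability and u the rotated IoU (the (s, u) pairs are made up):

alpha, beta = 1.0, 6.0
for s, u in [(0.9, 0.9), (0.9, 0.5), (0.3, 0.9)]:
    print(s, u, round((s ** alpha) * (u ** beta), 4))
# 0.4783, 0.0141, 0.1594: with beta=6 the metric is far more sensitive to
# localization quality (IoU) than to the classification score.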
================================================
FILE: ppdet/modeling/assigners/simota_assigner.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The code is based on:
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/sim_ota_assigner.py
import paddle
import numpy as np
import paddle.nn.functional as F
from ppdet.modeling.losses.varifocal_loss import varifocal_loss
from ppdet.modeling.bbox_utils import batch_bbox_overlaps
from ppdet.core.workspace import register
@register
class SimOTAAssigner(object):
"""Computes matching between predictions and ground truth.
Args:
center_radius (int | float, optional): Ground truth center size
to judge whether a prior is in center. Default 2.5.
candidate_topk (int, optional): The candidate top-k which used to
get top-k ious to calculate dynamic-k. Default 10.
iou_weight (int | float, optional): The scale factor for regression
iou cost. Default 3.0.
cls_weight (int | float, optional): The scale factor for classification
cost. Default 1.0.
num_classes (int): The num_classes of dataset.
        use_vfl (bool): Whether to use varifocal_loss when calculating the cost matrix.
"""
__shared__ = ['num_classes']
def __init__(self,
center_radius=2.5,
candidate_topk=10,
iou_weight=3.0,
cls_weight=1.0,
num_classes=80,
use_vfl=True):
self.center_radius = center_radius
self.candidate_topk = candidate_topk
self.iou_weight = iou_weight
self.cls_weight = cls_weight
self.num_classes = num_classes
self.use_vfl = use_vfl
def get_in_gt_and_in_center_info(self, flatten_center_and_stride,
gt_bboxes):
num_gt = gt_bboxes.shape[0]
flatten_x = flatten_center_and_stride[:, 0].unsqueeze(1).tile(
[1, num_gt])
flatten_y = flatten_center_and_stride[:, 1].unsqueeze(1).tile(
[1, num_gt])
flatten_stride_x = flatten_center_and_stride[:, 2].unsqueeze(1).tile(
[1, num_gt])
flatten_stride_y = flatten_center_and_stride[:, 3].unsqueeze(1).tile(
[1, num_gt])
# is prior centers in gt bboxes, shape: [n_center, n_gt]
l_ = flatten_x - gt_bboxes[:, 0]
t_ = flatten_y - gt_bboxes[:, 1]
r_ = gt_bboxes[:, 2] - flatten_x
b_ = gt_bboxes[:, 3] - flatten_y
deltas = paddle.stack([l_, t_, r_, b_], axis=1)
is_in_gts = deltas.min(axis=1) > 0
is_in_gts_all = is_in_gts.sum(axis=1) > 0
# is prior centers in gt centers
gt_center_xs = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0
gt_center_ys = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0
ct_bound_l = gt_center_xs - self.center_radius * flatten_stride_x
ct_bound_t = gt_center_ys - self.center_radius * flatten_stride_y
ct_bound_r = gt_center_xs + self.center_radius * flatten_stride_x
ct_bound_b = gt_center_ys + self.center_radius * flatten_stride_y
cl_ = flatten_x - ct_bound_l
ct_ = flatten_y - ct_bound_t
cr_ = ct_bound_r - flatten_x
cb_ = ct_bound_b - flatten_y
ct_deltas = paddle.stack([cl_, ct_, cr_, cb_], axis=1)
is_in_cts = ct_deltas.min(axis=1) > 0
is_in_cts_all = is_in_cts.sum(axis=1) > 0
# in any of gts or gt centers, shape: [n_center]
is_in_gts_or_centers_all = paddle.logical_or(is_in_gts_all,
is_in_cts_all)
is_in_gts_or_centers_all_inds = paddle.nonzero(
is_in_gts_or_centers_all).squeeze(1)
# both in gts and gt centers, shape: [num_fg, num_gt]
is_in_gts_and_centers = paddle.logical_and(
paddle.gather(
is_in_gts.cast('int'), is_in_gts_or_centers_all_inds,
axis=0).cast('bool'),
paddle.gather(
is_in_cts.cast('int'), is_in_gts_or_centers_all_inds,
axis=0).cast('bool'))
return is_in_gts_or_centers_all, is_in_gts_or_centers_all_inds, is_in_gts_and_centers
def dynamic_k_matching(self, cost_matrix, pairwise_ious, num_gt):
match_matrix = np.zeros_like(cost_matrix.numpy())
# select candidate topk ious for dynamic-k calculation
topk_ious, _ = paddle.topk(
pairwise_ious,
min(self.candidate_topk, pairwise_ious.shape[0]),
axis=0)
# calculate dynamic k for each gt
dynamic_ks = paddle.clip(topk_ious.sum(0).cast('int'), min=1)
for gt_idx in range(num_gt):
_, pos_idx = paddle.topk(
cost_matrix[:, gt_idx], k=dynamic_ks[gt_idx], largest=False)
match_matrix[:, gt_idx][pos_idx.numpy()] = 1.0
del topk_ious, dynamic_ks, pos_idx
# resolve priors matched to more than one gt
extra_match_gts_mask = match_matrix.sum(1) > 1
if extra_match_gts_mask.sum() > 0:
cost_matrix = cost_matrix.numpy()
cost_argmin = np.argmin(
cost_matrix[extra_match_gts_mask, :], axis=1)
match_matrix[extra_match_gts_mask, :] *= 0.0
match_matrix[extra_match_gts_mask, cost_argmin] = 1.0
# get foreground mask
match_fg_mask_inmatrix = match_matrix.sum(1) > 0
match_gt_inds_to_fg = match_matrix[match_fg_mask_inmatrix, :].argmax(1)
return match_gt_inds_to_fg, match_fg_mask_inmatrix
def get_sample(self, assign_gt_inds, gt_bboxes):
pos_inds = np.unique(np.nonzero(assign_gt_inds > 0)[0])
neg_inds = np.unique(np.nonzero(assign_gt_inds == 0)[0])
pos_assigned_gt_inds = assign_gt_inds[pos_inds] - 1
if gt_bboxes.size == 0:
# hack for index error case
assert pos_assigned_gt_inds.size == 0
pos_gt_bboxes = np.empty_like(gt_bboxes).reshape(-1, 4)
else:
if len(gt_bboxes.shape) < 2:
# np.ndarray.resize mutates in place (and rejects -1); reshape is intended
gt_bboxes = gt_bboxes.reshape(-1, 4)
pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :]
return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds
def __call__(self,
flatten_cls_pred_scores,
flatten_center_and_stride,
flatten_bboxes,
gt_bboxes,
gt_labels,
eps=1e-7):
"""Assign gt to priors using SimOTA.
TODO: add comment.
Returns:
assign_result: The assigned result.
"""
num_gt = gt_bboxes.shape[0]
num_bboxes = flatten_bboxes.shape[0]
if num_gt == 0 or num_bboxes == 0:
# No ground truth or boxes
label = np.ones([num_bboxes], dtype=np.int64) * self.num_classes
label_weight = np.ones([num_bboxes], dtype=np.float32)
bbox_target = np.zeros_like(flatten_center_and_stride)
return 0, label, label_weight, bbox_target
is_in_gts_or_centers_all, is_in_gts_or_centers_all_inds, is_in_boxes_and_center = self.get_in_gt_and_in_center_info(
flatten_center_and_stride, gt_bboxes)
# bboxes and scores to calculate matrix
valid_flatten_bboxes = flatten_bboxes[is_in_gts_or_centers_all_inds]
valid_cls_pred_scores = flatten_cls_pred_scores[
is_in_gts_or_centers_all_inds]
num_valid_bboxes = valid_flatten_bboxes.shape[0]
pairwise_ious = batch_bbox_overlaps(valid_flatten_bboxes,
gt_bboxes) # [num_points,num_gts]
if self.use_vfl:
gt_vfl_labels = gt_labels.squeeze(-1).unsqueeze(0).tile(
[num_valid_bboxes, 1]).reshape([-1])
valid_pred_scores = valid_cls_pred_scores.unsqueeze(1).tile(
[1, num_gt, 1]).reshape([-1, self.num_classes])
vfl_score = np.zeros(valid_pred_scores.shape)
vfl_score[np.arange(0, vfl_score.shape[0]), gt_vfl_labels.numpy(
)] = pairwise_ious.reshape([-1])
vfl_score = paddle.to_tensor(vfl_score)
losses_vfl = varifocal_loss(
valid_pred_scores, vfl_score,
use_sigmoid=False).reshape([num_valid_bboxes, num_gt])
losses_giou = batch_bbox_overlaps(
valid_flatten_bboxes, gt_bboxes, mode='giou')
cost_matrix = (
losses_vfl * self.cls_weight + losses_giou * self.iou_weight +
paddle.logical_not(is_in_boxes_and_center).cast('float32') *
100000000)
else:
iou_cost = -paddle.log(pairwise_ious + eps)
gt_onehot_label = (F.one_hot(
gt_labels.squeeze(-1).cast(paddle.int64),
flatten_cls_pred_scores.shape[-1]).cast('float32').unsqueeze(0)
.tile([num_valid_bboxes, 1, 1]))
valid_pred_scores = valid_cls_pred_scores.unsqueeze(1).tile(
[1, num_gt, 1])
cls_cost = F.binary_cross_entropy(
valid_pred_scores, gt_onehot_label, reduction='none').sum(-1)
cost_matrix = (
cls_cost * self.cls_weight + iou_cost * self.iou_weight +
paddle.logical_not(is_in_boxes_and_center).cast('float32') *
100000000)
match_gt_inds_to_fg, match_fg_mask_inmatrix = \
self.dynamic_k_matching(
cost_matrix, pairwise_ious, num_gt)
# sample and assign results
assigned_gt_inds = np.zeros([num_bboxes], dtype=np.int64)
match_fg_mask_inall = np.zeros_like(assigned_gt_inds)
match_fg_mask_inall[is_in_gts_or_centers_all.numpy(
)] = match_fg_mask_inmatrix
assigned_gt_inds[match_fg_mask_inall.astype(
np.bool_)] = match_gt_inds_to_fg + 1
pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds \
= self.get_sample(assigned_gt_inds, gt_bboxes.numpy())
bbox_target = np.zeros(flatten_bboxes.shape, paddle.common_ops_import.convert_dtype(flatten_bboxes.dtype))
bbox_weight = np.zeros_like(bbox_target)
label = np.ones([num_bboxes], dtype=np.int64) * self.num_classes
label_weight = np.zeros([num_bboxes], dtype=np.float32)
if len(pos_inds) > 0:
gt_labels = gt_labels.numpy()
pos_bbox_targets = pos_gt_bboxes
bbox_target[pos_inds, :] = pos_bbox_targets
bbox_weight[pos_inds, :] = 1.0
if not np.any(gt_labels):
label[pos_inds] = 0
else:
label[pos_inds] = gt_labels.squeeze(-1)[pos_assigned_gt_inds]
label_weight[pos_inds] = 1.0
if len(neg_inds) > 0:
label_weight[neg_inds] = 1.0
pos_num = max(pos_inds.size, 1)
return pos_num, label, label_weight, bbox_target
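# A minimal NumPy-only sketch of the dynamic-k rule implemented above (toy
# numbers, not from the original file): each gt receives
# k = clip(int(sum of its candidate top-k IoUs), min=1) positives, picked as
# its k lowest-cost priors.
if __name__ == '__main__':
    ious = np.array([[0.6, 0.1], [0.5, 0.2], [0.1, 0.7]])  # [3 priors, 2 gts]
    cost = 1.0 - ious  # stand-in for the vfl/giou cost matrix
    topk = min(10, ious.shape[0])
    dynamic_ks = np.clip(np.sort(ious, axis=0)[-topk:].sum(0).astype(int), 1, None)
    match = np.zeros_like(cost)
    for g in range(cost.shape[1]):
        pos = np.argsort(cost[:, g])[:dynamic_ks[g]]
        match[pos, g] = 1.0
    print(dynamic_ks, match)  # k=[1 1]; prior 0 -> gt 0, prior 2 -> gt 1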
================================================
FILE: ppdet/modeling/assigners/task_aligned_assigner.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ..bbox_utils import batch_iou_similarity
from .utils import (gather_topk_anchors, check_points_inside_bboxes,
compute_max_iou_anchor)
__all__ = ['TaskAlignedAssigner']
def is_close_gt(anchor, gt, stride_lst, max_dist=2.0, alpha=2.):
"""Calculate distance ratio of box1 and box2 in batch for larger stride
anchors dist/stride to promote the survive of large distance match
Args:
anchor (Tensor): box with the shape [L, 2]
gt (Tensor): box with the shape [N, M2, 4]
Return:
dist (Tensor): dist ratio between box1 and box2 with the shape [N, M1, M2]
"""
center1 = anchor.unsqueeze(0)
center2 = (gt[..., :2] + gt[..., -2:]) / 2.
center1 = center1.unsqueeze(1) # [1, L, 2] -> [1, 1, L, 2]
center2 = center2.unsqueeze(2) # [N, M, 2] -> [N, M, 1, 2]
stride = paddle.concat([
paddle.full([x], 32 / pow(2, idx)) for idx, x in enumerate(stride_lst)
]).unsqueeze(0).unsqueeze(0)
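# NOTE: the per-level stride here is hard-coded as 32 / 2**idx (32, 16, 8, ...
# in list order), so it only normalizes correctly when stride_lst follows
# that coarsest-first layout.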
dist = paddle.linalg.norm(center1 - center2, p=2, axis=-1) / stride
dist_ratio = dist
dist_ratio[dist < max_dist] = 1.
dist_ratio[dist >= max_dist] = 0.
return dist_ratio
@register
class TaskAlignedAssigner(nn.Layer):
"""TOOD: Task-aligned One-stage Object Detection
"""
def __init__(self,
topk=13,
alpha=1.0,
beta=6.0,
eps=1e-9,
is_close_gt=False):
super(TaskAlignedAssigner, self).__init__()
self.topk = topk
self.alpha = alpha
self.beta = beta
self.eps = eps
self.is_close_gt = is_close_gt
@paddle.no_grad()
def forward(self,
pred_scores,
pred_bboxes,
anchor_points,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index,
gt_segms=None,
gt_scores=None):
r"""This code is based on
https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py
The assignment is done in the following steps:
1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt
2. select top-k bbox as candidates for each gt
3. limit the positive sample's center in gt (because an anchor-free detector
can only predict positive distances)
4. if an anchor box is assigned to multiple gts, the one with the
highest iou will be selected.
Args:
pred_scores (Tensor, float32): predicted class probability, shape(B, L, C)
pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4)
anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format
num_anchors_list (List): num of anchors in each level, shape(L)
gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)
pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
bg_index (int): background index
gt_scores (Tensor|None, float32): Score of gt_bboxes, shape(B, n, 1)
Returns:
assigned_labels (Tensor): (B, L)
assigned_bboxes (Tensor): (B, L, 4)
assigned_scores (Tensor): (B, L, C)
"""
assert pred_scores.ndim == pred_bboxes.ndim
assert gt_labels.ndim == gt_bboxes.ndim and \
gt_bboxes.ndim == 3
batch_size, num_anchors, num_classes = pred_scores.shape
_, num_max_boxes, _ = gt_bboxes.shape
# negative batch
if num_max_boxes == 0:
assigned_labels = paddle.full(
[batch_size, num_anchors], bg_index, dtype='int32')
assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4])
assigned_scores = paddle.zeros(
[batch_size, num_anchors, num_classes])
return assigned_labels, assigned_bboxes, assigned_scores
# compute iou between gt and pred bbox, [B, n, L]
ious = batch_iou_similarity(gt_bboxes, pred_bboxes)
# gather pred bboxes class score
pred_scores = pred_scores.transpose([0, 2, 1])
batch_ind = paddle.arange(
end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)
gt_labels_ind = paddle.stack(
[batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)],
axis=-1)
bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind)
# compute alignment metrics, [B, n, L]
alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow(
self.beta)
# check the positive sample's center in gt, [B, n, L]
if self.is_close_gt:
is_in_gts = is_close_gt(anchor_points, gt_bboxes, num_anchors_list)
else:
is_in_gts = check_points_inside_bboxes(anchor_points, gt_bboxes)
# select topk largest alignment metrics pred bbox as candidates
# for each gt, [B, n, L]
is_in_topk = gather_topk_anchors(
alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask)
# select positive sample, [B, n, L]
mask_positive = is_in_topk * is_in_gts * pad_gt_mask
# if an anchor box is assigned to multiple gts,
# the one with the highest iou will be selected, [B, n, L]
mask_positive_sum = mask_positive.sum(axis=-2)
if mask_positive_sum.max() > 1:
mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile(
[1, num_max_boxes, 1])
is_max_iou = compute_max_iou_anchor(ious)
mask_positive = paddle.where(mask_multiple_gts, is_max_iou,
mask_positive)
mask_positive_sum = mask_positive.sum(axis=-2)
assigned_gt_index = mask_positive.argmax(axis=-2)
# assigned target
assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype)
assigned_labels = paddle.gather(
gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)
assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
assigned_labels = paddle.where(
mask_positive_sum > 0, assigned_labels,
paddle.full_like(assigned_labels, bg_index))
assigned_bboxes = paddle.gather(
gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0)
assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])
assigned_scores = F.one_hot(assigned_labels, num_classes + 1)
ind = list(range(num_classes + 1))
ind.remove(bg_index)
assigned_scores = paddle.index_select(
assigned_scores, paddle.to_tensor(ind), axis=-1)
# rescale alignment metrics
alignment_metrics *= mask_positive
max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True)
max_ious_per_instance = (ious * mask_positive).max(axis=-1,
keepdim=True)
alignment_metrics = alignment_metrics / (
max_metrics_per_instance + self.eps) * max_ious_per_instance
alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1)
assigned_scores = assigned_scores * alignment_metrics
if gt_segms is not None:
return assigned_labels, assigned_bboxes, assigned_scores, assigned_gt_index
else:
return assigned_labels, assigned_bboxes, assigned_scores
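# A minimal usage sketch with fabricated shapes (it assumes paddle and the
# ppdet imports above resolve; B/L/C/n are illustrative only):
if __name__ == '__main__':
    B, L, C, n = 2, 16, 4, 3
    assigner = TaskAlignedAssigner(topk=4)
    pred_scores = paddle.rand([B, L, C])
    xy = paddle.rand([B, L, 2]) * 32
    pred_bboxes = paddle.concat([xy, xy + 1 + paddle.rand([B, L, 2]) * 8], axis=-1)
    gxy = paddle.rand([B, n, 2]) * 32
    gt_bboxes = paddle.concat([gxy, gxy + 1 + paddle.rand([B, n, 2]) * 8], axis=-1)
    labels, bboxes, scores = assigner(
        pred_scores, pred_bboxes, paddle.rand([L, 2]) * 32, [L],
        paddle.randint(0, C, [B, n, 1]), gt_bboxes,
        paddle.ones([B, n, 1]), bg_index=C)
    print(labels.shape, bboxes.shape, scores.shape)  # [2, 16] [2, 16, 4] [2, 16, 4]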
================================================
FILE: ppdet/modeling/assigners/task_aligned_assigner_cr.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ..bbox_utils import batch_iou_similarity
from .utils import (gather_topk_anchors, check_points_inside_bboxes,
compute_max_iou_anchor)
__all__ = ['TaskAlignedAssigner_CR']
@register
class TaskAlignedAssigner_CR(nn.Layer):
"""TOOD: Task-aligned One-stage Object Detection with Center R
"""
def __init__(self,
topk=13,
alpha=1.0,
beta=6.0,
center_radius=None,
eps=1e-9):
super(TaskAlignedAssigner_CR, self).__init__()
self.topk = topk
self.alpha = alpha
self.beta = beta
self.center_radius = center_radius
self.eps = eps
@paddle.no_grad()
def forward(self,
pred_scores,
pred_bboxes,
anchor_points,
stride_tensor,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index,
gt_scores=None):
r"""This code is based on
https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py
The assignment is done in the following steps:
1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt
2. select top-k bbox as candidates for each gt
3. limit the positive sample's center in gt (because an anchor-free detector
can only predict positive distances)
4. if an anchor box is assigned to multiple gts, the one with the
highest iou will be selected.
Args:
pred_scores (Tensor, float32): predicted class probability, shape(B, L, C)
pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4)
anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format
stride_tensor (Tensor, float32): stride of feature map, shape(L, 1)
gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1)
gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4)
pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1)
bg_index (int): background index
gt_scores (Tensor|None, float32): Score of gt_bboxes, shape(B, n, 1)
Returns:
assigned_labels (Tensor): (B, L)
assigned_bboxes (Tensor): (B, L, 4)
assigned_scores (Tensor): (B, L, C)
"""
assert pred_scores.ndim == pred_bboxes.ndim
assert gt_labels.ndim == gt_bboxes.ndim and \
gt_bboxes.ndim == 3
batch_size, num_anchors, num_classes = pred_scores.shape
_, num_max_boxes, _ = gt_bboxes.shape
# negative batch
if num_max_boxes == 0:
assigned_labels = paddle.full(
[batch_size, num_anchors], bg_index, dtype='int32')
assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4])
assigned_scores = paddle.zeros(
[batch_size, num_anchors, num_classes])
return assigned_labels, assigned_bboxes, assigned_scores
# compute iou between gt and pred bbox, [B, n, L]
ious = batch_iou_similarity(gt_bboxes, pred_bboxes)
# gather pred bboxes class score
pred_scores = pred_scores.transpose([0, 2, 1])
batch_ind = paddle.arange(
end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1)
gt_labels_ind = paddle.stack(
[batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)],
axis=-1)
bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind)
# compute alignment metrics, [B, n, L]
alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow(
self.beta) * pad_gt_mask
# select positive sample, [B, n, L]
if self.center_radius is None:
# check the positive sample's center in gt, [B, n, L]
is_in_gts = check_points_inside_bboxes(
anchor_points, gt_bboxes, sm_use=True)
# select topk largest alignment metrics pred bbox as candidates
# for each gt, [B, n, L]
mask_positive = gather_topk_anchors(
alignment_metrics, self.topk, topk_mask=pad_gt_mask) * is_in_gts
else:
is_in_gts, is_in_center = check_points_inside_bboxes(
anchor_points,
gt_bboxes,
stride_tensor * self.center_radius,
sm_use=True)
is_in_gts *= pad_gt_mask
is_in_center *= pad_gt_mask
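# Fallback for tiny gt boxes: when a gt contains no anchor center at all,
# boost the metric of anchors inside its stride-scaled center region so the
# top-k selection below can still recruit positives for it.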
candidate_metrics = paddle.where(
is_in_gts.sum(-1, keepdim=True) == 0,
alignment_metrics + is_in_center,
alignment_metrics)
mask_positive = gather_topk_anchors(
candidate_metrics, self.topk,
topk_mask=pad_gt_mask) * paddle.cast((is_in_center > 0) |
(is_in_gts > 0), 'float32')
# if an anchor box is assigned to multiple gts,
# the one with the highest iou will be selected, [B, n, L]
mask_positive_sum = mask_positive.sum(axis=-2)
if mask_positive_sum.max() > 1:
mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile(
[1, num_max_boxes, 1])
is_max_iou = compute_max_iou_anchor(ious * mask_positive)
mask_positive = paddle.where(mask_multiple_gts, is_max_iou,
mask_positive)
mask_positive_sum = mask_positive.sum(axis=-2)
assigned_gt_index = mask_positive.argmax(axis=-2)
# assigned target
assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype)
assigned_labels = paddle.gather(
gt_labels.flatten(), assigned_gt_index.flatten(), axis=0)
assigned_labels = assigned_labels.reshape([batch_size, num_anchors])
assigned_labels = paddle.where(
mask_positive_sum > 0, assigned_labels,
paddle.full_like(assigned_labels, bg_index))
assigned_bboxes = paddle.gather(
gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0)
assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4])
assigned_scores = F.one_hot(assigned_labels, num_classes + 1)
ind = list(range(num_classes + 1))
ind.remove(bg_index)
assigned_scores = paddle.index_select(
assigned_scores, paddle.to_tensor(ind), axis=-1)
# rescale alignment metrics
alignment_metrics *= mask_positive
max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True)
max_ious_per_instance = (ious * mask_positive).max(axis=-1,
keepdim=True)
alignment_metrics = alignment_metrics / (
max_metrics_per_instance + self.eps) * max_ious_per_instance
alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1)
assigned_scores = assigned_scores * alignment_metrics
return assigned_labels, assigned_bboxes, assigned_scores
================================================
FILE: ppdet/modeling/assigners/uniform_assigner.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ppdet.modeling.bbox_utils import batch_bbox_overlaps
from ppdet.modeling.transformers import bbox_xyxy_to_cxcywh
__all__ = ['UniformAssigner']
def batch_p_dist(x, y, p=2):
"""
calculate pairwise p_dist, the first index of x and y are batch
return [x.shape[0], y.shape[0]]
"""
x = x.unsqueeze(1)
diff = x - y
return paddle.norm(diff, p=p, axis=list(range(2, diff.dim())))
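# e.g. x: [N, d], y: [M, d] -> [N, M]; with p=1 this is the L1 distance
# between the cxcywh box encodings used by the matcher below.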
@register
class UniformAssigner(nn.Layer):
def __init__(self, pos_ignore_thr, neg_ignore_thr, match_times=4):
super(UniformAssigner, self).__init__()
self.pos_ignore_thr = pos_ignore_thr
self.neg_ignore_thr = neg_ignore_thr
self.match_times = match_times
def forward(self, bbox_pred, anchor, gt_bboxes, gt_labels=None):
num_bboxes = bbox_pred.shape[0]
num_gts = gt_bboxes.shape[0]
match_labels = paddle.full([num_bboxes], -1, dtype=paddle.int32)
pred_ious = batch_bbox_overlaps(bbox_pred, gt_bboxes)
pred_max_iou = pred_ious.max(axis=1)
neg_ignore = pred_max_iou > self.neg_ignore_thr
# exclude potential ignored neg samples first, deal with pos samples later
#match_labels: -2(ignore), -1(neg) or >=0(pos_inds)
match_labels = paddle.where(neg_ignore,
paddle.full_like(match_labels, -2),
match_labels)
bbox_pred_c = bbox_xyxy_to_cxcywh(bbox_pred)
anchor_c = bbox_xyxy_to_cxcywh(anchor)
gt_bboxes_c = bbox_xyxy_to_cxcywh(gt_bboxes)
bbox_pred_dist = batch_p_dist(bbox_pred_c, gt_bboxes_c, p=1)
anchor_dist = batch_p_dist(anchor_c, gt_bboxes_c, p=1)
top_pred = bbox_pred_dist.topk(
k=self.match_times, axis=0, largest=False)[1]
top_anchor = anchor_dist.topk(
k=self.match_times, axis=0, largest=False)[1]
tar_pred = paddle.arange(num_gts).expand([self.match_times, num_gts])
tar_anchor = paddle.arange(num_gts).expand([self.match_times, num_gts])
pos_places = paddle.concat([top_pred, top_anchor]).reshape([-1])
pos_inds = paddle.concat([tar_pred, tar_anchor]).reshape([-1])
pos_anchor = anchor[pos_places]
pos_tar_bbox = gt_bboxes[pos_inds]
pos_ious = batch_bbox_overlaps(
pos_anchor, pos_tar_bbox, is_aligned=True)
pos_ignore = pos_ious < self.pos_ignore_thr
pos_inds = paddle.where(pos_ignore,
paddle.full_like(pos_inds, -2), pos_inds)
match_labels[pos_places] = pos_inds
match_labels.stop_gradient = True
pos_keep = ~pos_ignore
if pos_keep.sum() > 0:
pos_places_keep = pos_places[pos_keep]
pos_bbox_pred = bbox_pred[pos_places_keep].reshape([-1, 4])
pos_bbox_tar = pos_tar_bbox[pos_keep].reshape([-1, 4]).detach()
else:
pos_bbox_pred = None
pos_bbox_tar = None
return match_labels, pos_bbox_pred, pos_bbox_tar
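# A calling-convention sketch with fabricated boxes (hypothetical shapes;
# thresholds are illustrative values, not tuned numbers from any config):
if __name__ == '__main__':
    xy = paddle.rand([32, 2]) * 64
    preds = paddle.concat([xy, xy + 1 + paddle.rand([32, 2]) * 16], axis=-1)
    anchors = preds + paddle.rand([32, 4])
    gxy = paddle.rand([3, 2]) * 64
    gts = paddle.concat([gxy, gxy + 1 + paddle.rand([3, 2]) * 16], axis=-1)
    assigner = UniformAssigner(pos_ignore_thr=0.15, neg_ignore_thr=0.7)
    match_labels, pos_pred, pos_tar = assigner(preds, anchors, gts)
    print(match_labels.shape)  # [32]; -2 ignore, -1 negative, >=0 matched gt id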
================================================
FILE: ppdet/modeling/assigners/utils.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn.functional as F
__all__ = [
'pad_gt', 'gather_topk_anchors', 'check_points_inside_bboxes',
'compute_max_iou_anchor', 'compute_max_iou_gt',
'generate_anchors_for_grid_cell'
]
def pad_gt(gt_labels, gt_bboxes, gt_scores=None):
r""" Pad 0 in gt_labels and gt_bboxes.
Args:
gt_labels (Tensor|List[Tensor], int64): Label of gt_bboxes,
shape is [B, n, 1] or [[n_1, 1], [n_2, 1], ...], here n = sum(n_i)
gt_bboxes (Tensor|List[Tensor], float32): Ground truth bboxes,
shape is [B, n, 4] or [[n_1, 4], [n_2, 4], ...], here n = sum(n_i)
gt_scores (Tensor|List[Tensor]|None, float32): Score of gt_bboxes,
shape is [B, n, 1] or [[n_1, 1], [n_2, 1], ...], here n = sum(n_i)
Returns:
pad_gt_labels (Tensor, int64): shape[B, n, 1]
pad_gt_bboxes (Tensor, float32): shape[B, n, 4]
pad_gt_scores (Tensor, float32): shape[B, n, 1]
pad_gt_mask (Tensor, float32): shape[B, n, 1], 1 means bbox, 0 means no bbox
"""
if isinstance(gt_labels, paddle.Tensor) and isinstance(gt_bboxes,
paddle.Tensor):
assert gt_labels.ndim == gt_bboxes.ndim and \
gt_bboxes.ndim == 3
pad_gt_mask = (
gt_bboxes.sum(axis=-1, keepdim=True) > 0).astype(gt_bboxes.dtype)
if gt_scores is None:
gt_scores = pad_gt_mask.clone()
assert gt_labels.ndim == gt_scores.ndim
return gt_labels, gt_bboxes, gt_scores, pad_gt_mask
elif isinstance(gt_labels, list) and isinstance(gt_bboxes, list):
assert len(gt_labels) == len(gt_bboxes), \
'The number of `gt_labels` and `gt_bboxes` is not equal. '
num_max_boxes = max([len(a) for a in gt_bboxes])
batch_size = len(gt_bboxes)
# pad label and bbox
pad_gt_labels = paddle.zeros(
[batch_size, num_max_boxes, 1], dtype=gt_labels[0].dtype)
pad_gt_bboxes = paddle.zeros(
[batch_size, num_max_boxes, 4], dtype=gt_bboxes[0].dtype)
pad_gt_scores = paddle.zeros(
[batch_size, num_max_boxes, 1], dtype=gt_bboxes[0].dtype)
pad_gt_mask = paddle.zeros(
[batch_size, num_max_boxes, 1], dtype=gt_bboxes[0].dtype)
for i, (label, bbox) in enumerate(zip(gt_labels, gt_bboxes)):
if len(label) > 0 and len(bbox) > 0:
pad_gt_labels[i, :len(label)] = label
pad_gt_bboxes[i, :len(bbox)] = bbox
pad_gt_mask[i, :len(bbox)] = 1.
if gt_scores is not None:
pad_gt_scores[i, :len(gt_scores[i])] = gt_scores[i]
if gt_scores is None:
pad_gt_scores = pad_gt_mask.clone()
return pad_gt_labels, pad_gt_bboxes, pad_gt_scores, pad_gt_mask
else:
raise ValueError('The input `gt_labels` or `gt_bboxes` is invalid! ')
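# e.g. gt_bboxes = [Tensor[2, 4], Tensor[3, 4]] pads to shape [2, 3, 4], with
# pad_gt_mask[0, :, 0] == [1, 1, 0] and pad_gt_mask[1, :, 0] == [1, 1, 1].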
def gather_topk_anchors(metrics, topk, largest=True, topk_mask=None, eps=1e-9):
r"""
Args:
metrics (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors
topk (int): The number of top elements to look for along the axis.
largest (bool) : largest is a flag, if set to true,
algorithm will sort by descending order, otherwise sort by
ascending order. Default: True
topk_mask (Tensor, float32): shape[B, n, 1], ignore bbox mask,
Default: None
eps (float): Default: 1e-9
Returns:
is_in_topk (Tensor, float32): shape[B, n, L], value=1. means selected
"""
num_anchors = metrics.shape[-1]
topk_metrics, topk_idxs = paddle.topk(
metrics, topk, axis=-1, largest=largest)
if topk_mask is None:
topk_mask = (
topk_metrics.max(axis=-1, keepdim=True) > eps).astype(metrics.dtype)
is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(
axis=-2).astype(metrics.dtype)
return is_in_topk * topk_mask
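# e.g. metrics [B, n, L] with topk=2 yields a same-shape 0/1 mask in which,
# per (batch, gt) row, the two highest-metric anchors are 1; rows zeroed by
# topk_mask stay all-zero.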
def check_points_inside_bboxes(points,
bboxes,
center_radius_tensor=None,
eps=1e-9,
sm_use=False):
r"""
Args:
points (Tensor, float32): shape[L, 2], "xy" format, L: num_anchors
bboxes (Tensor, float32): shape[B, n, 4], "xmin, ymin, xmax, ymax" format
center_radius_tensor (Tensor, float32): shape [L, 1]. Default: None.
eps (float): Default: 1e-9
Returns:
is_in_bboxes (Tensor, float32): shape[B, n, L], value=1. means selected
"""
points = points.unsqueeze([0, 1])
x, y = points.chunk(2, axis=-1)
xmin, ymin, xmax, ymax = bboxes.unsqueeze(2).chunk(4, axis=-1)
# check whether `points` is in `bboxes`
l = x - xmin
t = y - ymin
r = xmax - x
b = ymax - y
delta_ltrb = paddle.concat([l, t, r, b], axis=-1)
is_in_bboxes = (delta_ltrb.min(axis=-1) > eps)
if center_radius_tensor is not None:
# check whether `points` is in `center_radius`
center_radius_tensor = center_radius_tensor.unsqueeze([0, 1])
cx = (xmin + xmax) * 0.5
cy = (ymin + ymax) * 0.5
l = x - (cx - center_radius_tensor)
t = y - (cy - center_radius_tensor)
r = (cx + center_radius_tensor) - x
b = (cy + center_radius_tensor) - y
delta_ltrb_c = paddle.concat([l, t, r, b], axis=-1)
is_in_center = (delta_ltrb_c.min(axis=-1) > eps)
if sm_use:
return is_in_bboxes.astype(bboxes.dtype), is_in_center.astype(
bboxes.dtype)
else:
return (paddle.logical_and(is_in_bboxes, is_in_center),
paddle.logical_or(is_in_bboxes, is_in_center))
return is_in_bboxes.astype(bboxes.dtype)
def compute_max_iou_anchor(ious):
r"""
For each anchor, find the GT with the largest IOU.
Args:
ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors
Returns:
is_max_iou (Tensor, float32): shape[B, n, L], value=1. means selected
"""
num_max_boxes = ious.shape[-2]
max_iou_index = ious.argmax(axis=-2)
is_max_iou = F.one_hot(max_iou_index, num_max_boxes).transpose([0, 2, 1])
return is_max_iou.astype(ious.dtype)
def compute_max_iou_gt(ious):
r"""
For each GT, find the anchor with the largest IOU.
Args:
ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors
Returns:
is_max_iou (Tensor, float32): shape[B, n, L], value=1. means selected
"""
num_anchors = ious.shape[-1]
max_iou_index = ious.argmax(axis=-1)
is_max_iou = F.one_hot(max_iou_index, num_anchors)
return is_max_iou.astype(ious.dtype)
def generate_anchors_for_grid_cell(feats,
fpn_strides,
grid_cell_size=5.0,
grid_cell_offset=0.5,
dtype='float32'):
r"""
Like ATSS, generate anchors based on grid size.
Args:
feats (List[Tensor]): shape[s, (b, c, h, w)]
fpn_strides (tuple|list): shape[s], stride for each scale feature
grid_cell_size (float): anchor size
grid_cell_offset (float): The range is between 0 and 1.
Returns:
anchors (Tensor): shape[l, 4], "xmin, ymin, xmax, ymax" format.
anchor_points (Tensor): shape[l, 2], "x, y" format.
num_anchors_list (List[int]): shape[s], contains [s_1, s_2, ...].
stride_tensor (Tensor): shape[l, 1], contains the stride for each scale.
"""
assert len(feats) == len(fpn_strides)
anchors = []
anchor_points = []
num_anchors_list = []
stride_tensor = []
for feat, stride in zip(feats, fpn_strides):
_, _, h, w = feat.shape
cell_half_size = grid_cell_size * stride * 0.5
shift_x = (paddle.arange(end=w) + grid_cell_offset) * stride
shift_y = (paddle.arange(end=h) + grid_cell_offset) * stride
shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)
anchor = paddle.stack(
[
shift_x - cell_half_size, shift_y - cell_half_size,
shift_x + cell_half_size, shift_y + cell_half_size
],
axis=-1).astype(dtype)
anchor_point = paddle.stack([shift_x, shift_y], axis=-1).astype(dtype)
anchors.append(anchor.reshape([-1, 4]))
anchor_points.append(anchor_point.reshape([-1, 2]))
num_anchors_list.append(len(anchors[-1]))
stride_tensor.append(
paddle.full(
[num_anchors_list[-1], 1], stride, dtype=dtype))
anchors = paddle.concat(anchors)
anchors.stop_gradient = True
anchor_points = paddle.concat(anchor_points)
anchor_points.stop_gradient = True
stride_tensor = paddle.concat(stride_tensor)
stride_tensor.stop_gradient = True
return anchors, anchor_points, num_anchors_list, stride_tensor
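# A small shape check with fabricated feature maps (the stride list [8, 16]
# is an assumption, matching a typical two-level head):
if __name__ == '__main__':
    feats = [paddle.rand([1, 8, 16, 16]), paddle.rand([1, 8, 8, 8])]
    anchors, points, nums, strides = generate_anchors_for_grid_cell(feats, [8, 16])
    print(anchors.shape, points.shape, nums, strides.shape)
    # [320, 4] [320, 2] [256, 64] [320, 1]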
================================================
FILE: ppdet/modeling/backbones/__init__.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import vgg
from . import resnet
from . import darknet
from . import mobilenet_v1
from . import mobilenet_v3
from . import hrnet
from . import lite_hrnet
from . import blazenet
from . import ghostnet
from . import senet
from . import res2net
from . import dla
from . import shufflenet_v2
from . import swin_transformer
from . import lcnet
from . import hardnet
from . import esnet
from . import cspresnet
from . import csp_darknet
from . import convnext
from . import vision_transformer
from . import mobileone
from . import trans_encoder
from . import focalnet
from . import vit_mae
from . import hgnet_v2
from . import clrnet_resnet
from .vgg import *
from .resnet import *
from .darknet import *
from .mobilenet_v1 import *
from .mobilenet_v3 import *
from .hrnet import *
from .lite_hrnet import *
from .blazenet import *
from .ghostnet import *
from .senet import *
from .res2net import *
from .dla import *
from .shufflenet_v2 import *
from .swin_transformer import *
from .lcnet import *
from .hardnet import *
from .esnet import *
from .cspresnet import *
from .csp_darknet import *
from .convnext import *
from .vision_transformer import *
from .mobileone import *
from .trans_encoder import *
from .focalnet import *
from .vitpose import *
from .vit_mae import *
from .hgnet_v2 import *
from .clrnet_resnet import *
================================================
FILE: ppdet/modeling/backbones/blazenet.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import KaimingNormal
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
__all__ = ['BlazeNet']
def hard_swish(x):
return x * F.relu6(x + 3) / 6.
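# i.e. x * ReLU6(x + 3) / 6, the "hard" approximation of swish from
# MobileNetV3; paddle also provides this as F.hardswish.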
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
num_groups=1,
act='relu',
conv_lr=0.1,
conv_decay=0.,
norm_decay=0.,
norm_type='bn',
name=None):
super(ConvBNLayer, self).__init__()
self.act = act
self._conv = nn.Conv2D(
in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=num_groups,
weight_attr=ParamAttr(
learning_rate=conv_lr, initializer=KaimingNormal()),
bias_attr=False)
if norm_type in ['bn', 'sync_bn']:
self._batch_norm = nn.BatchNorm2D(out_channels)
def forward(self, x):
x = self._conv(x)
x = self._batch_norm(x)
if self.act == "relu":
x = F.relu(x)
elif self.act == "relu6":
x = F.relu6(x)
elif self.act == 'leaky':
x = F.leaky_relu(x)
elif self.act == 'hard_swish':
x = hard_swish(x)
return x
class BlazeBlock(nn.Layer):
def __init__(self,
in_channels,
out_channels1,
out_channels2,
double_channels=None,
stride=1,
use_5x5kernel=True,
act='relu',
name=None):
super(BlazeBlock, self).__init__()
assert stride in [1, 2]
self.use_pool = stride != 1
self.use_double_block = double_channels is not None
self.conv_dw = []
if use_5x5kernel:
self.conv_dw.append(
self.add_sublayer(
name + "1_dw",
ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels1,
kernel_size=5,
stride=stride,
padding=2,
num_groups=out_channels1,
name=name + "1_dw")))
else:
self.conv_dw.append(
self.add_sublayer(
name + "1_dw_1",
ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels1,
kernel_size=3,
stride=1,
padding=1,
num_groups=out_channels1,
name=name + "1_dw_1")))
self.conv_dw.append(
self.add_sublayer(
name + "1_dw_2",
ConvBNLayer(
in_channels=out_channels1,
out_channels=out_channels1,
kernel_size=3,
stride=stride,
padding=1,
num_groups=out_channels1,
name=name + "1_dw_2")))
self.act = act if self.use_double_block else None
self.conv_pw = ConvBNLayer(
in_channels=out_channels1,
out_channels=out_channels2,
kernel_size=1,
stride=1,
padding=0,
act=self.act,
name=name + "1_sep")
if self.use_double_block:
self.conv_dw2 = []
if use_5x5kernel:
self.conv_dw2.append(
self.add_sublayer(
name + "2_dw",
ConvBNLayer(
in_channels=out_channels2,
out_channels=out_channels2,
kernel_size=5,
stride=1,
padding=2,
num_groups=out_channels2,
name=name + "2_dw")))
else:
self.conv_dw2.append(
self.add_sublayer(
name + "2_dw_1",
ConvBNLayer(
in_channels=out_channels2,
out_channels=out_channels2,
kernel_size=3,
stride=1,
padding=1,
num_groups=out_channels2,
name=name + "1_dw_1")))
self.conv_dw2.append(
self.add_sublayer(
name + "2_dw_2",
ConvBNLayer(
in_channels=out_channels2,
out_channels=out_channels2,
kernel_size=3,
stride=1,
padding=1,
num_groups=out_channels2,
name=name + "2_dw_2")))
self.conv_pw2 = ConvBNLayer(
in_channels=out_channels2,
out_channels=double_channels,
kernel_size=1,
stride=1,
padding=0,
name=name + "2_sep")
# shortcut
if self.use_pool:
shortcut_channel = double_channels or out_channels2
self._shortcut = []
self._shortcut.append(
self.add_sublayer(
name + '_shortcut_pool',
nn.MaxPool2D(
kernel_size=stride, stride=stride, ceil_mode=True)))
self._shortcut.append(
self.add_sublayer(
name + '_shortcut_conv',
ConvBNLayer(
in_channels=in_channels,
out_channels=shortcut_channel,
kernel_size=1,
stride=1,
padding=0,
name="shortcut" + name)))
def forward(self, x):
y = x
for conv_dw_block in self.conv_dw:
y = conv_dw_block(y)
y = self.conv_pw(y)
if self.use_double_block:
for conv_dw2_block in self.conv_dw2:
y = conv_dw2_block(y)
y = self.conv_pw2(y)
if self.use_pool:
for shortcut in self._shortcut:
x = shortcut(x)
return F.relu(paddle.add(x, y))
@register
@serializable
class BlazeNet(nn.Layer):
"""
BlazeFace, see https://arxiv.org/abs/1907.05047
Args:
blaze_filters (list): number of filters for each blaze block.
double_blaze_filters (list): number of filters for each double_blaze block.
use_5x5kernel (bool): whether the depth-wise conv uses a 5x5 kernel.
"""
def __init__(
self,
blaze_filters=[[24, 24], [24, 24], [24, 48, 2], [48, 48], [48, 48]],
double_blaze_filters=[[48, 24, 96, 2], [96, 24, 96], [96, 24, 96],
[96, 24, 96, 2], [96, 24, 96], [96, 24, 96]],
use_5x5kernel=True,
act=None):
super(BlazeNet, self).__init__()
conv1_num_filters = blaze_filters[0][0]
self.conv1 = ConvBNLayer(
in_channels=3,
out_channels=conv1_num_filters,
kernel_size=3,
stride=2,
padding=1,
name="conv1")
in_channels = conv1_num_filters
self.blaze_block = []
self._out_channels = []
for k, v in enumerate(blaze_filters):
assert len(v) in [2, 3], \
"blaze_filters {} not in [2, 3]".format(v)
if len(v) == 2:
self.blaze_block.append(
self.add_sublayer(
'blaze_{}'.format(k),
BlazeBlock(
in_channels,
v[0],
v[1],
use_5x5kernel=use_5x5kernel,
act=act,
name='blaze_{}'.format(k))))
elif len(v) == 3:
self.blaze_block.append(
self.add_sublayer(
'blaze_{}'.format(k),
BlazeBlock(
in_channels,
v[0],
v[1],
stride=v[2],
use_5x5kernel=use_5x5kernel,
act=act,
name='blaze_{}'.format(k))))
in_channels = v[1]
for k, v in enumerate(double_blaze_filters):
assert len(v) in [3, 4], \
"double_blaze_filters {} not in [3, 4]".format(v)
if len(v) == 3:
self.blaze_block.append(
self.add_sublayer(
'double_blaze_{}'.format(k),
BlazeBlock(
in_channels,
v[0],
v[1],
double_channels=v[2],
use_5x5kernel=use_5x5kernel,
act=act,
name='double_blaze_{}'.format(k))))
elif len(v) == 4:
self.blaze_block.append(
self.add_sublayer(
'double_blaze_{}'.format(k),
BlazeBlock(
in_channels,
v[0],
v[1],
double_channels=v[2],
stride=v[3],
use_5x5kernel=use_5x5kernel,
act=act,
name='double_blaze_{}'.format(k))))
in_channels = v[2]
self._out_channels.append(in_channels)
def forward(self, inputs):
outs = []
y = self.conv1(inputs['image'])
for block in self.blaze_block:
y = block(y)
outs.append(y)
return [outs[-4], outs[-1]]
@property
def out_shape(self):
return [
ShapeSpec(channels=c)
for c in [self._out_channels[-4], self._out_channels[-1]]
]
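# A smoke-test sketch (the 128x128 input size is an assumption following the
# BlazeFace paper; outputs are the stride-8 and stride-16 feature maps):
if __name__ == '__main__':
    net = BlazeNet()
    feats = net({'image': paddle.rand([1, 3, 128, 128])})
    print([f.shape for f in feats])  # [[1, 96, 16, 16], [1, 96, 8, 8]]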
================================================
FILE: ppdet/modeling/backbones/clrnet_resnet.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
from paddle.utils.download import get_weights_path_from_url
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
__all__ = ['CLRResNet']
model_urls = {
'resnet18':
'https://x2paddle.bj.bcebos.com/vision/models/resnet18-pt.pdparams',
'resnet34':
'https://x2paddle.bj.bcebos.com/vision/models/resnet34-pt.pdparams',
'resnet50':
'https://x2paddle.bj.bcebos.com/vision/models/resnet50-pt.pdparams',
'resnet101':
'https://x2paddle.bj.bcebos.com/vision/models/resnet101-pt.pdparams',
'resnet152':
'https://x2paddle.bj.bcebos.com/vision/models/resnet152-pt.pdparams',
'resnext50_32x4d':
'https://x2paddle.bj.bcebos.com/vision/models/resnext50_32x4d-pt.pdparams',
'resnext101_32x8d':
'https://x2paddle.bj.bcebos.com/vision/models/resnext101_32x8d-pt.pdparams',
'wide_resnet50_2':
'https://x2paddle.bj.bcebos.com/vision/models/wide_resnet50_2-pt.pdparams',
'wide_resnet101_2':
'https://x2paddle.bj.bcebos.com/vision/models/wide_resnet101_2-pt.pdparams',
}
class BasicBlock(nn.Layer):
expansion = 1
def __init__(self,
inplanes,
planes,
stride=1,
downsample=None,
groups=1,
base_width=64,
dilation=1,
norm_layer=None):
super(BasicBlock, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2D
if dilation > 1:
raise NotImplementedError(
"Dilation > 1 not supported in BasicBlock")
self.conv1 = nn.Conv2D(
inplanes, planes, 3, padding=1, stride=stride, bias_attr=False)
self.bn1 = norm_layer(planes)
self.relu = nn.ReLU()
self.conv2 = nn.Conv2D(planes, planes, 3, padding=1, bias_attr=False)
self.bn2 = norm_layer(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class BottleneckBlock(nn.Layer):
expansion = 4
def __init__(self,
inplanes,
planes,
stride=1,
downsample=None,
groups=1,
base_width=64,
dilation=1,
norm_layer=None):
super(BottleneckBlock, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2D
width = int(planes * (base_width / 64.)) * groups
self.conv1 = nn.Conv2D(inplanes, width, 1, bias_attr=False)
self.bn1 = norm_layer(width)
self.conv2 = nn.Conv2D(
width,
width,
3,
padding=dilation,
stride=stride,
groups=groups,
dilation=dilation,
bias_attr=False)
self.bn2 = norm_layer(width)
self.conv3 = nn.Conv2D(
width, planes * self.expansion, 1, bias_attr=False)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU()
self.downsample = downsample
self.stride = stride
def forward(self, x):
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class ResNet(nn.Layer):
"""ResNet model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/abs/1512.03385>`_.
This variant keeps the four residual stages and returns their feature maps
rather than classification logits (no fc layer is defined).
Args:
block (BasicBlock|BottleneckBlock): Block module of model.
depth (int, optional): Layers of ResNet, Default: 50.
width (int, optional): Base width per convolution group for each convolution block, Default: 64.
with_pool (bool, optional): Use adaptive average pool on the last stage or not
(the pooled tensor is computed but not returned). Default: True.
groups (int, optional): Number of groups for each convolution block, Default: 1.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of ResNet model.
Examples:
.. code-block:: python
import paddle
# build ResNet with 18 layers
resnet18 = ResNet(BasicBlock, 18)
# build ResNet with 50 layers
resnet50 = ResNet(BottleneckBlock, 50)
# build Wide ResNet model
wide_resnet50_2 = ResNet(BottleneckBlock, 50, width=64*2)
# build ResNeXt model
resnext50_32x4d = ResNet(BottleneckBlock, 50, width=4, groups=32)
x = paddle.rand([1, 3, 224, 224])
outs = resnet18(x)
print([o.shape for o in outs])
# [[1, 64, 56, 56], [1, 128, 28, 28], [1, 256, 14, 14], [1, 512, 7, 7]]
"""
def __init__(self, block, depth=50, width=64, with_pool=True, groups=1):
super(ResNet, self).__init__()
layer_cfg = {
18: [2, 2, 2, 2],
34: [3, 4, 6, 3],
50: [3, 4, 6, 3],
101: [3, 4, 23, 3],
152: [3, 8, 36, 3]
}
layers = layer_cfg[depth]
self.groups = groups
self.base_width = width
self.with_pool = with_pool
self._norm_layer = nn.BatchNorm2D
self.inplanes = 64
self.dilation = 1
self.conv1 = nn.Conv2D(
3,
self.inplanes,
kernel_size=7,
stride=2,
padding=3,
bias_attr=False)
self.bn1 = self._norm_layer(self.inplanes)
self.relu = nn.ReLU()
self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
if with_pool:
self.avgpool = nn.AdaptiveAvgPool2D((1, 1))
ch_out_list = [64, 128, 256, 512]
block = BottleneckBlock if depth >= 50 else BasicBlock
# expose the block expansion; CLRResNet below reads it via self.model.expansion
self.expansion = block.expansion
self._out_channels = [block.expansion * v for v in ch_out_list]
self._out_strides = [4, 8, 16, 32]
self.return_idx = [0, 1, 2, 3]
def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
norm_layer = self._norm_layer
downsample = None
previous_dilation = self.dilation
if dilate:
self.dilation *= stride
stride = 1
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2D(
self.inplanes,
planes * block.expansion,
1,
stride=stride,
bias_attr=False),
norm_layer(planes * block.expansion), )
layers = []
layers.append(
block(self.inplanes, planes, stride, downsample, self.groups,
self.base_width, previous_dilation, norm_layer))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(
block(
self.inplanes,
planes,
groups=self.groups,
base_width=self.base_width,
norm_layer=norm_layer))
return nn.Sequential(*layers)
@property
def out_shape(self):
return [
ShapeSpec(
channels=self._out_channels[i], stride=self._out_strides[i])
for i in self.return_idx
]
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
out_layers = []
x = self.layer1(x)
out_layers.append(x)
x = self.layer2(x)
out_layers.append(x)
x = self.layer3(x)
out_layers.append(x)
x = self.layer4(x)
out_layers.append(x)
if self.with_pool:
x = self.avgpool(x)
return out_layers
@register
@serializable
class CLRResNet(nn.Layer):
def __init__(self,
resnet='resnet18',
pretrained=True,
out_conv=False,
fea_stride=8,
out_channel=128,
in_channels=[64, 128, 256, 512],
cfg=None):
super(CLRResNet, self).__init__()
self.cfg = cfg
self.in_channels = in_channels
self.model = eval(resnet)(pretrained=pretrained)
self.out = None
if out_conv:
out_channel = 512
for chan in reversed(self.in_channels):
if chan < 0: continue
out_channel = chan
break
self.out = nn.Conv2D(
out_channel * self.model.expansion,
cfg.featuremap_out_channel,
kernel_size=1,
bias_attr=False)
@property
def out_shape(self):
return self.model.out_shape
def forward(self, x):
x = self.model(x)
if self.out:
x[-1] = self.out(x[-1])
return x
def _resnet(arch, Block, depth, pretrained, **kwargs):
model = ResNet(Block, depth, **kwargs)
if pretrained:
assert arch in model_urls, "{} model does not have a pretrained model now, you should set pretrained=False".format(
arch)
weight_path = get_weights_path_from_url(model_urls[arch])
param = paddle.load(weight_path)
model.set_dict(param)
return model
def resnet18(pretrained=False, **kwargs):
"""ResNet 18-layer model from
`"Deep Residual Learning for Image Recognition" `_.
Args:
pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
on ImageNet. Default: False.
**kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of ResNet 18-layer model.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnet18
# build model
model = resnet18()
# build model and load imagenet pretrained weight
# model = resnet18(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print([o.shape for o in out])
# [[1, 64, 56, 56], [1, 128, 28, 28], [1, 256, 14, 14], [1, 512, 7, 7]]
"""
return _resnet('resnet18', BasicBlock, 18, pretrained, **kwargs)
def resnet34(pretrained=False, **kwargs):
"""ResNet 34-layer model from
`"Deep Residual Learning for Image Recognition" `_.
Args:
pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
on ImageNet. Default: False.
**kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of ResNet 34-layer model.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnet34
# build model
model = resnet34()
# build model and load imagenet pretrained weight
# model = resnet34(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print([o.shape for o in out])
# [[1, 64, 56, 56], [1, 128, 28, 28], [1, 256, 14, 14], [1, 512, 7, 7]]
"""
return _resnet('resnet34', BasicBlock, 34, pretrained, **kwargs)
def resnet50(pretrained=False, **kwargs):
"""ResNet 50-layer model from
`"Deep Residual Learning for Image Recognition" `_.
Args:
pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
on ImageNet. Default: False.
**kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of ResNet 50-layer model.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnet50
# build model
model = resnet50()
# build model and load imagenet pretrained weight
# model = resnet50(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print([o.shape for o in out])
# [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]
"""
return _resnet('resnet50', BottleneckBlock, 50, pretrained, **kwargs)
def resnet101(pretrained=False, **kwargs):
"""ResNet 101-layer model from
`"Deep Residual Learning for Image Recognition" `_.
Args:
pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
on ImageNet. Default: False.
**kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of ResNet 101-layer.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnet101
# build model
model = resnet101()
# build model and load imagenet pretrained weight
# model = resnet101(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print([o.shape for o in out])
# [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]
"""
return _resnet('resnet101', BottleneckBlock, 101, pretrained, **kwargs)
def resnet152(pretrained=False, **kwargs):
"""ResNet 152-layer model from
`"Deep Residual Learning for Image Recognition" `_.
Args:
pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
on ImageNet. Default: False.
**kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of ResNet 152-layer model.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnet152
# build model
model = resnet152()
# build model and load imagenet pretrained weight
# model = resnet152(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print([o.shape for o in out])
# [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]
"""
return _resnet('resnet152', BottleneckBlock, 152, pretrained, **kwargs)
def resnext50_32x4d(pretrained=False, **kwargs):
"""ResNeXt-50 32x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" `_.
Args:
pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
on ImageNet. Default: False.
**kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of ResNeXt-50 32x4d model.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext50_32x4d
# build model
model = resnext50_32x4d()
# build model and load imagenet pretrained weight
# model = resnext50_32x4d(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print([o.shape for o in out])
# [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]
"""
kwargs['groups'] = 32
kwargs['width'] = 4
return _resnet('resnext50_32x4d', BottleneckBlock, 50, pretrained, **kwargs)
def resnext50_64x4d(pretrained=False, **kwargs):
"""ResNeXt-50 64x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" `_.
Args:
pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
on ImageNet. Default: False.
**kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of ResNeXt-50 64x4d model.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext50_64x4d
# build model
model = resnext50_64x4d()
# build model and load imagenet pretrained weight
# model = resnext50_64x4d(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print([o.shape for o in out])
# [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]
"""
kwargs['groups'] = 64
kwargs['width'] = 4
return _resnet('resnext50_64x4d', BottleneckBlock, 50, pretrained, **kwargs)
def resnext101_32x4d(pretrained=False, **kwargs):
"""ResNeXt-101 32x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" `_.
Args:
pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
on ImageNet. Default: False.
**kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of ResNeXt-101 32x4d model.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext101_32x4d
# build model
model = resnext101_32x4d()
# build model and load imagenet pretrained weight
# model = resnext101_32x4d(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print([o.shape for o in out])
# [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]
"""
kwargs['groups'] = 32
kwargs['width'] = 4
return _resnet('resnext101_32x4d', BottleneckBlock, 101, pretrained,
**kwargs)
def resnext101_64x4d(pretrained=False, **kwargs):
"""ResNeXt-101 64x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" `_.
Args:
pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
on ImageNet. Default: False.
**kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of ResNeXt-101 64x4d model.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext101_64x4d
# build model
model = resnext101_64x4d()
# build model and load imagenet pretrained weight
# model = resnext101_64x4d(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print([o.shape for o in out])
# [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]
"""
kwargs['groups'] = 64
kwargs['width'] = 4
return _resnet('resnext101_64x4d', BottleneckBlock, 101, pretrained,
**kwargs)
def resnext152_32x4d(pretrained=False, **kwargs):
"""ResNeXt-152 32x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" `_.
Args:
pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
on ImageNet. Default: False.
**kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of ResNeXt-152 32x4d model.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext152_32x4d
# build model
model = resnext152_32x4d()
# build model and load imagenet pretrained weight
# model = resnext152_32x4d(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print([o.shape for o in out])
# [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]
"""
kwargs['groups'] = 32
kwargs['width'] = 4
return _resnet('resnext152_32x4d', BottleneckBlock, 152, pretrained,
**kwargs)
def resnext152_64x4d(pretrained=False, **kwargs):
"""ResNeXt-152 64x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" `_.
Args:
pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
on ImageNet. Default: False.
**kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of ResNeXt-152 64x4d model.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import resnext152_64x4d
# build model
model = resnext152_64x4d()
# build model and load imagenet pretrained weight
# model = resnext152_64x4d(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print(out.shape)
# [1, 1000]
"""
kwargs['groups'] = 64
kwargs['width'] = 4
return _resnet('resnext152_64x4d', BottleneckBlock, 152, pretrained,
**kwargs)
def wide_resnet50_2(pretrained=False, **kwargs):
"""Wide ResNet-50-2 model from
`"Wide Residual Networks" `_.
Args:
pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
on ImageNet. Default: False.
**kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of Wide ResNet-50-2 model.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import wide_resnet50_2
# build model
model = wide_resnet50_2()
# build model and load imagenet pretrained weight
# model = wide_resnet50_2(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print(out.shape)
# [1, 1000]
"""
kwargs['width'] = 64 * 2
return _resnet('wide_resnet50_2', BottleneckBlock, 50, pretrained, **kwargs)
def wide_resnet101_2(pretrained=False, **kwargs):
"""Wide ResNet-101-2 model from
`"Wide Residual Networks" `_.
Args:
pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
on ImageNet. Default: False.
**kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of Wide ResNet-101-2 model.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import wide_resnet101_2
# build model
model = wide_resnet101_2()
# build model and load imagenet pretrained weight
# model = wide_resnet101_2(pretrained=True)
x = paddle.rand([1, 3, 224, 224])
out = model(x)
print(out.shape)
# [1, 1000]
"""
kwargs['width'] = 64 * 2
return _resnet('wide_resnet101_2', BottleneckBlock, 101, pretrained,
**kwargs)
================================================
FILE: ppdet/modeling/backbones/convnext.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Modified from https://github.com/facebookresearch/ConvNeXt
Copyright (c) Meta Platforms, Inc. and affiliates.
All rights reserved.
This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
'''
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Constant
import numpy as np
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
from .transformer_utils import DropPath, trunc_normal_, zeros_
__all__ = ['ConvNeXt']
class Block(nn.Layer):
r""" ConvNeXt Block. There are two equivalent implementations:
(1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
(2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
We use (2) as we find it slightly faster in Paddle
Args:
dim (int): Number of input channels.
drop_path (float): Stochastic depth rate. Default: 0.0
layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
"""
def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
super().__init__()
self.dwconv = nn.Conv2D(
dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv
self.norm = LayerNorm(dim, eps=1e-6)
self.pwconv1 = nn.Linear(
dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers
self.act = nn.GELU()
self.pwconv2 = nn.Linear(4 * dim, dim)
if layer_scale_init_value > 0:
self.gamma = self.create_parameter(
shape=(dim, ),
attr=ParamAttr(initializer=Constant(layer_scale_init_value)))
else:
self.gamma = None
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity(
)
def forward(self, x):
input = x
x = self.dwconv(x)
x = x.transpose([0, 2, 3, 1])
x = self.norm(x)
x = self.pwconv1(x)
x = self.act(x)
x = self.pwconv2(x)
if self.gamma is not None:
x = self.gamma * x
x = x.transpose([0, 3, 1, 2])
x = input + self.drop_path(x)
return x
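# --- Illustrative sketch (not part of the original file) ---
# A ConvNeXt Block keeps both the spatial size and the channel count, so
# blocks can be stacked freely within a stage. A minimal shape check:
def _block_shape_check():
    import paddle
    blk = Block(dim=96, drop_path=0.1)
    blk.eval()
    x = paddle.rand([2, 96, 14, 14])  # (N, C, H, W)
    y = blk(x)
    print(y.shape)  # [2, 96, 14, 14]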
class LayerNorm(nn.Layer):
r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
shape (batch_size, height, width, channels) while channels_first corresponds to inputs
with shape (batch_size, channels, height, width).
"""
def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
super().__init__()
self.weight = self.create_parameter(
shape=(normalized_shape, ),
attr=ParamAttr(initializer=Constant(1.)))
self.bias = self.create_parameter(
shape=(normalized_shape, ),
attr=ParamAttr(initializer=Constant(0.)))
self.eps = eps
self.data_format = data_format
if self.data_format not in ["channels_last", "channels_first"]:
raise NotImplementedError
self.normalized_shape = (normalized_shape, )
def forward(self, x):
if self.data_format == "channels_last":
return F.layer_norm(x, self.normalized_shape, self.weight,
self.bias, self.eps)
elif self.data_format == "channels_first":
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
x = (x - u) / paddle.sqrt(s + self.eps)
x = self.weight[:, None, None] * x + self.bias[:, None, None]
return x
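# --- Illustrative sketch (not part of the original file) ---
# With the default weight=1 / bias=0 parameters, the two data formats agree
# once the tensor layout is transposed accordingly:
def _layernorm_format_check():
    import paddle
    x = paddle.rand([2, 8, 4, 4])  # (N, C, H, W)
    ln_first = LayerNorm(8, data_format="channels_first")
    ln_last = LayerNorm(8, data_format="channels_last")
    y1 = ln_first(x)
    y2 = ln_last(x.transpose([0, 2, 3, 1])).transpose([0, 3, 1, 2])
    print(float((y1 - y2).abs().max()))  # ~0, up to numerical eps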
@register
@serializable
class ConvNeXt(nn.Layer):
r""" ConvNeXt
A Paddle impl of : `A ConvNet for the 2020s` -
https://arxiv.org/pdf/2201.03545.pdf
Args:
in_chans (int): Number of input image channels. Default: 3
depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
dims (tuple(int)): Feature dimension at each stage. Default: [96, 192, 384, 768]
drop_path_rate (float): Stochastic depth rate. Default: 0.
layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
"""
arch_settings = {
'tiny': {
'depths': [3, 3, 9, 3],
'dims': [96, 192, 384, 768]
},
'small': {
'depths': [3, 3, 27, 3],
'dims': [96, 192, 384, 768]
},
'base': {
'depths': [3, 3, 27, 3],
'dims': [128, 256, 512, 1024]
},
'large': {
'depths': [3, 3, 27, 3],
'dims': [192, 384, 768, 1536]
},
'xlarge': {
'depths': [3, 3, 27, 3],
'dims': [256, 512, 1024, 2048]
},
}
def __init__(
self,
arch='tiny',
in_chans=3,
drop_path_rate=0.,
layer_scale_init_value=1e-6,
return_idx=[1, 2, 3],
norm_output=True,
pretrained=None, ):
super().__init__()
depths = self.arch_settings[arch]['depths']
dims = self.arch_settings[arch]['dims']
self.downsample_layers = nn.LayerList(
) # stem and 3 intermediate downsampling conv layers
stem = nn.Sequential(
nn.Conv2D(
in_chans, dims[0], kernel_size=4, stride=4),
LayerNorm(
dims[0], eps=1e-6, data_format="channels_first"))
self.downsample_layers.append(stem)
for i in range(3):
downsample_layer = nn.Sequential(
LayerNorm(
dims[i], eps=1e-6, data_format="channels_first"),
nn.Conv2D(
dims[i], dims[i + 1], kernel_size=2, stride=2), )
self.downsample_layers.append(downsample_layer)
self.stages = nn.LayerList(
) # 4 feature resolution stages, each consisting of multiple residual blocks
dp_rates = [x for x in np.linspace(0, drop_path_rate, sum(depths))]
cur = 0
for i in range(4):
stage = nn.Sequential(* [
Block(
dim=dims[i],
drop_path=dp_rates[cur + j],
layer_scale_init_value=layer_scale_init_value)
for j in range(depths[i])
])
self.stages.append(stage)
cur += depths[i]
self.return_idx = return_idx
self.dims = [dims[i] for i in return_idx] # [::-1]
self.norm_output = norm_output
if norm_output:
self.norms = nn.LayerList([
LayerNorm(
c, eps=1e-6, data_format="channels_first")
for c in self.dims
])
self.apply(self._init_weights)
if pretrained is not None:
if 'http' in pretrained: #URL
path = paddle.utils.download.get_weights_path_from_url(
pretrained)
else: #model in local path
path = pretrained
self.set_state_dict(paddle.load(path))
def _init_weights(self, m):
if isinstance(m, (nn.Conv2D, nn.Linear)):
trunc_normal_(m.weight)
zeros_(m.bias)
def forward_features(self, x):
output = []
for i in range(4):
x = self.downsample_layers[i](x)
x = self.stages[i](x)
output.append(x)
outputs = [output[i] for i in self.return_idx]
if self.norm_output:
outputs = [self.norms[i](out) for i, out in enumerate(outputs)]
return outputs
def forward(self, x):
x = self.forward_features(x['image'])
return x
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self.dims]
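# --- Illustrative usage sketch (not part of the original file) ---
# Build a ConvNeXt-tiny backbone and feed the dict input its forward() expects;
# with return_idx=[1, 2, 3] it yields feature maps at strides 8, 16 and 32.
def _convnext_demo():
    import paddle
    model = ConvNeXt(arch='tiny', return_idx=[1, 2, 3])
    model.eval()
    feats = model({'image': paddle.rand([1, 3, 224, 224])})
    print([tuple(f.shape) for f in feats])
    # [(1, 192, 28, 28), (1, 384, 14, 14), (1, 768, 7, 7)]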
================================================
FILE: ppdet/modeling/backbones/csp_darknet.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, serializable
from ppdet.modeling.initializer import conv_init_
from ..shape_spec import ShapeSpec
__all__ = [
'CSPDarkNet', 'BaseConv', 'DWConv', 'BottleNeck', 'SPPLayer', 'SPPFLayer'
]
class BaseConv(nn.Layer):
def __init__(self,
in_channels,
out_channels,
ksize,
stride,
groups=1,
bias=False,
act="silu"):
super(BaseConv, self).__init__()
self.conv = nn.Conv2D(
in_channels,
out_channels,
kernel_size=ksize,
stride=stride,
padding=(ksize - 1) // 2,
groups=groups,
bias_attr=bias)
self.bn = nn.BatchNorm2D(
out_channels,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
self._init_weights()
def _init_weights(self):
conv_init_(self.conv)
def forward(self, x):
# 'silu' is implemented as x * F.sigmoid(x)
x = self.bn(self.conv(x))
y = x * F.sigmoid(x)
return y
class DWConv(nn.Layer):
"""Depthwise Conv"""
def __init__(self,
in_channels,
out_channels,
ksize,
stride=1,
bias=False,
act="silu"):
super(DWConv, self).__init__()
self.dw_conv = BaseConv(
in_channels,
in_channels,
ksize=ksize,
stride=stride,
groups=in_channels,
bias=bias,
act=act)
self.pw_conv = BaseConv(
in_channels,
out_channels,
ksize=1,
stride=1,
groups=1,
bias=bias,
act=act)
def forward(self, x):
return self.pw_conv(self.dw_conv(x))
class Focus(nn.Layer):
"""Focus width and height information into channel space, used in YOLOX."""
def __init__(self,
in_channels,
out_channels,
ksize=3,
stride=1,
bias=False,
act="silu"):
super(Focus, self).__init__()
self.conv = BaseConv(
in_channels * 4,
out_channels,
ksize=ksize,
stride=stride,
bias=bias,
act=act)
def forward(self, inputs):
# inputs [bs, C, H, W] -> outputs [bs, 4C, H/2, W/2]
top_left = inputs[:, :, 0::2, 0::2]
top_right = inputs[:, :, 0::2, 1::2]
bottom_left = inputs[:, :, 1::2, 0::2]
bottom_right = inputs[:, :, 1::2, 1::2]
outputs = paddle.concat(
[top_left, bottom_left, top_right, bottom_right], 1)
return self.conv(outputs)
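# --- Illustrative sketch (not part of the original file) ---
# Focus is a space-to-depth rearrangement: every 2x2 pixel patch becomes four
# channels, halving H and W before the 3x3 conv runs at full channel width.
def _focus_shape_check():
    import paddle
    focus = Focus(in_channels=3, out_channels=32)
    x = paddle.rand([1, 3, 640, 640])
    print(focus(x).shape)  # [1, 32, 320, 320]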
class BottleNeck(nn.Layer):
def __init__(self,
in_channels,
out_channels,
shortcut=True,
expansion=0.5,
depthwise=False,
bias=False,
act="silu"):
super(BottleNeck, self).__init__()
hidden_channels = int(out_channels * expansion)
Conv = DWConv if depthwise else BaseConv
self.conv1 = BaseConv(
in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
self.conv2 = Conv(
hidden_channels,
out_channels,
ksize=3,
stride=1,
bias=bias,
act=act)
self.add_shortcut = shortcut and in_channels == out_channels
def forward(self, x):
y = self.conv2(self.conv1(x))
if self.add_shortcut:
y = y + x
return y
class SPPLayer(nn.Layer):
"""Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX"""
def __init__(self,
in_channels,
out_channels,
kernel_sizes=(5, 9, 13),
bias=False,
act="silu"):
super(SPPLayer, self).__init__()
hidden_channels = in_channels // 2
self.conv1 = BaseConv(
in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
self.maxpoolings = nn.LayerList([
nn.MaxPool2D(
kernel_size=ks, stride=1, padding=ks // 2)
for ks in kernel_sizes
])
conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
self.conv2 = BaseConv(
conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act)
def forward(self, x):
x = self.conv1(x)
x = paddle.concat([x] + [mp(x) for mp in self.maxpoolings], axis=1)
x = self.conv2(x)
return x
class SPPFLayer(nn.Layer):
""" Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn Jocher,
equivalent to SPP(k=(5, 9, 13))
"""
def __init__(self,
in_channels,
out_channels,
ksize=5,
bias=False,
act='silu'):
super(SPPFLayer, self).__init__()
hidden_channels = in_channels // 2
self.conv1 = BaseConv(
in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
self.maxpooling = nn.MaxPool2D(
kernel_size=ksize, stride=1, padding=ksize // 2)
conv2_channels = hidden_channels * 4
self.conv2 = BaseConv(
conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act)
def forward(self, x):
x = self.conv1(x)
y1 = self.maxpooling(x)
y2 = self.maxpooling(y1)
y3 = self.maxpooling(y2)
concats = paddle.concat([x, y1, y2, y3], axis=1)
out = self.conv2(concats)
return out
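# --- Illustrative sketch (not part of the original file) ---
# Why SPPF(ksize=5) matches SPP(kernel_sizes=(5, 9, 13)): two stacked
# stride-1 k=5 max-pools cover a 9x9 window, and three cover 13x13. For the
# non-negative random input used here, border padding never wins the max,
# so the equality holds exactly:
def _sppf_pool_equivalence():
    import paddle
    import paddle.nn as nn
    x = paddle.rand([1, 4, 16, 16])
    mp5 = nn.MaxPool2D(5, stride=1, padding=2)
    mp9 = nn.MaxPool2D(9, stride=1, padding=4)
    mp13 = nn.MaxPool2D(13, stride=1, padding=6)
    print(bool(paddle.allclose(mp5(mp5(x)), mp9(x))))        # True
    print(bool(paddle.allclose(mp5(mp5(mp5(x))), mp13(x))))  # True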
class CSPLayer(nn.Layer):
"""CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5"""
def __init__(self,
in_channels,
out_channels,
num_blocks=1,
shortcut=True,
expansion=0.5,
depthwise=False,
bias=False,
act="silu"):
super(CSPLayer, self).__init__()
hidden_channels = int(out_channels * expansion)
self.conv1 = BaseConv(
in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
self.conv2 = BaseConv(
in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
self.bottlenecks = nn.Sequential(* [
BottleNeck(
hidden_channels,
hidden_channels,
shortcut=shortcut,
expansion=1.0,
depthwise=depthwise,
bias=bias,
act=act) for _ in range(num_blocks)
])
self.conv3 = BaseConv(
hidden_channels * 2,
out_channels,
ksize=1,
stride=1,
bias=bias,
act=act)
def forward(self, x):
x_1 = self.conv1(x)
x_1 = self.bottlenecks(x_1)
x_2 = self.conv2(x)
x = paddle.concat([x_1, x_2], axis=1)
x = self.conv3(x)
return x
@register
@serializable
class CSPDarkNet(nn.Layer):
"""
CSPDarkNet backbone.
Args:
arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X,
and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5.
depth_mult (float): Depth multiplier, multiply number of blocks in
each CSPLayer, default as 1.0.
width_mult (float): Width multiplier, multiply number of channels in
each layer, default as 1.0.
depthwise (bool): Whether to use depth-wise conv layer.
act (str): Activation function type, default as 'silu'.
return_idx (list): Index of stages whose feature maps are returned.
"""
__shared__ = ['depth_mult', 'width_mult', 'act', 'trt']
# in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf)
# 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5.
arch_settings = {
'X': [[64, 128, 3, True, False], [128, 256, 9, True, False],
[256, 512, 9, True, False], [512, 1024, 3, False, True]],
'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
[256, 512, 9, True, False], [512, 1024, 3, True, True]],
'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
[256, 512, 9, True, False], [512, 768, 3, True, False],
[768, 1024, 3, True, True]],
}
def __init__(self,
arch='X',
depth_mult=1.0,
width_mult=1.0,
depthwise=False,
act='silu',
trt=False,
return_idx=[2, 3, 4]):
super(CSPDarkNet, self).__init__()
self.arch = arch
self.return_idx = return_idx
Conv = DWConv if depthwise else BaseConv
arch_setting = self.arch_settings[arch]
base_channels = int(arch_setting[0][0] * width_mult)
# Note: differences between the latest YOLOv5 and the original YOLOX
# 1. the stem: Conv (in YOLOv5) vs Focus (in YOLOX)
# 2. the pooling layer: SPPF (in YOLOv5) vs SPP (in YOLOX)
# 3. SPPF is put after (YOLOv5), SPP before (YOLOX), the last cspdark block's CSPLayer
# 4. whether the SPP(F) stage's CSPLayer adds a shortcut: True in YOLOv5, False in YOLOX
if arch in ['P5', 'P6']:
# the latest YOLOv5 uses a Conv stem and SPPF (fast, a single kernel size)
self.stem = Conv(
3, base_channels, ksize=6, stride=2, bias=False, act=act)
spp_kernel_sizes = 5
elif arch in ['X']:
# the original YOLOX uses a Focus stem and SPP (three kernel sizes)
self.stem = Focus(
3, base_channels, ksize=3, stride=1, bias=False, act=act)
spp_kernel_sizes = (5, 9, 13)
else:
raise AttributeError("Unsupported arch type: {}".format(arch))
_out_channels = [base_channels]
layers_num = 1
self.csp_dark_blocks = []
for i, (in_channels, out_channels, num_blocks, shortcut,
use_spp) in enumerate(arch_setting):
in_channels = int(in_channels * width_mult)
out_channels = int(out_channels * width_mult)
_out_channels.append(out_channels)
num_blocks = max(round(num_blocks * depth_mult), 1)
stage = []
conv_layer = self.add_sublayer(
'layers{}.stage{}.conv_layer'.format(layers_num, i + 1),
Conv(
in_channels, out_channels, 3, 2, bias=False, act=act))
stage.append(conv_layer)
layers_num += 1
if use_spp and arch in ['X']:
# in YOLOX use SPPLayer
spp_layer = self.add_sublayer(
'layers{}.stage{}.spp_layer'.format(layers_num, i + 1),
SPPLayer(
out_channels,
out_channels,
kernel_sizes=spp_kernel_sizes,
bias=False,
act=act))
stage.append(spp_layer)
layers_num += 1
csp_layer = self.add_sublayer(
'layers{}.stage{}.csp_layer'.format(layers_num, i + 1),
CSPLayer(
out_channels,
out_channels,
num_blocks=num_blocks,
shortcut=shortcut,
depthwise=depthwise,
bias=False,
act=act))
stage.append(csp_layer)
layers_num += 1
if use_spp and arch in ['P5', 'P6']:
# in latest YOLOv5 use SPPFLayer instead of SPPLayer
sppf_layer = self.add_sublayer(
'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1),
SPPFLayer(
out_channels,
out_channels,
ksize=5,
bias=False,
act=act))
stage.append(sppf_layer)
layers_num += 1
self.csp_dark_blocks.append(nn.Sequential(*stage))
self._out_channels = [_out_channels[i] for i in self.return_idx]
self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx]
def forward(self, inputs):
x = inputs['image']
outputs = []
x = self.stem(x)
for i, layer in enumerate(self.csp_dark_blocks):
x = layer(x)
if i + 1 in self.return_idx:
outputs.append(x)
return outputs
@property
def out_shape(self):
return [
ShapeSpec(
channels=c, stride=s)
for c, s in zip(self._out_channels, self.strides)
]
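# --- Illustrative usage sketch (not part of the original file) ---
# The default CSPDarkNet ('X' arch, YOLOX-style) returns the stride 8/16/32 maps:
def _cspdarknet_demo():
    import paddle
    model = CSPDarkNet(arch='X')
    model.eval()
    outs = model({'image': paddle.rand([1, 3, 640, 640])})
    print([tuple(o.shape) for o in outs])
    # [(1, 256, 80, 80), (1, 512, 40, 40), (1, 1024, 20, 20)]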
================================================
FILE: ppdet/modeling/backbones/cspresnet.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Constant
from ppdet.modeling.ops import get_act_fn
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
__all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer']
class ConvBNLayer(nn.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size=3,
stride=1,
groups=1,
padding=0,
act=None):
super(ConvBNLayer, self).__init__()
self.conv = nn.Conv2D(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=filter_size,
stride=stride,
padding=padding,
groups=groups,
bias_attr=False)
self.bn = nn.BatchNorm2D(
ch_out,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
self.act = get_act_fn(act) if act is None or isinstance(act, (
str, dict)) else act
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.act(x)
return x
class RepVggBlock(nn.Layer):
def __init__(self, ch_in, ch_out, act='relu', alpha=False):
super(RepVggBlock, self).__init__()
self.ch_in = ch_in
self.ch_out = ch_out
self.conv1 = ConvBNLayer(
ch_in, ch_out, 3, stride=1, padding=1, act=None)
self.conv2 = ConvBNLayer(
ch_in, ch_out, 1, stride=1, padding=0, act=None)
self.act = get_act_fn(act) if act is None or isinstance(act, (
str, dict)) else act
if alpha:
self.alpha = self.create_parameter(
shape=[1],
attr=ParamAttr(initializer=Constant(value=1.)),
dtype="float32")
else:
self.alpha = None
def forward(self, x):
if hasattr(self, 'conv'):
y = self.conv(x)
else:
if self.alpha:
y = self.conv1(x) + self.alpha * self.conv2(x)
else:
y = self.conv1(x) + self.conv2(x)
y = self.act(y)
return y
def convert_to_deploy(self):
if not hasattr(self, 'conv'):
self.conv = nn.Conv2D(
in_channels=self.ch_in,
out_channels=self.ch_out,
kernel_size=3,
stride=1,
padding=1,
groups=1)
kernel, bias = self.get_equivalent_kernel_bias()
self.conv.weight.set_value(kernel)
self.conv.bias.set_value(bias)
self.__delattr__('conv1')
self.__delattr__('conv2')
def get_equivalent_kernel_bias(self):
kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
if self.alpha:
return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor(
kernel1x1), bias3x3 + self.alpha * bias1x1
else:
return kernel3x3 + self._pad_1x1_to_3x3_tensor(
kernel1x1), bias3x3 + bias1x1
def _pad_1x1_to_3x3_tensor(self, kernel1x1):
if kernel1x1 is None:
return 0
else:
return nn.functional.pad(kernel1x1, [1, 1, 1, 1])
def _fuse_bn_tensor(self, branch):
if branch is None:
return 0, 0
kernel = branch.conv.weight
running_mean = branch.bn._mean
running_var = branch.bn._variance
gamma = branch.bn.weight
beta = branch.bn.bias
eps = branch.bn._epsilon
std = (running_var + eps).sqrt()
t = (gamma / std).reshape((-1, 1, 1, 1))
return kernel * t, beta - running_mean * gamma / std
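# --- Illustrative sketch (not part of the original file) ---
# convert_to_deploy() folds conv1 (3x3+BN) and conv2 (1x1+BN) into a single
# 3x3 conv. In eval mode (BN uses running statistics) the fused block should
# reproduce the two-branch output up to floating-point error:
def _repvgg_reparam_check():
    import paddle
    blk = RepVggBlock(16, 16, act='relu')
    blk.eval()
    x = paddle.rand([1, 16, 32, 32])
    y_two_branch = blk(x)
    blk.convert_to_deploy()
    y_fused = blk(x)
    print(float((y_two_branch - y_fused).abs().max()))  # ~1e-6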
class BasicBlock(nn.Layer):
def __init__(self,
ch_in,
ch_out,
act='relu',
shortcut=True,
use_alpha=False):
super(BasicBlock, self).__init__()
assert ch_in == ch_out
self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act)
self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha)
self.shortcut = shortcut
def forward(self, x):
y = self.conv1(x)
y = self.conv2(y)
if self.shortcut:
return paddle.add(x, y)
else:
return y
class EffectiveSELayer(nn.Layer):
""" Effective Squeeze-Excitation
From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667
"""
def __init__(self, channels, act='hardsigmoid'):
super(EffectiveSELayer, self).__init__()
self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0)
self.act = get_act_fn(act) if act is None or isinstance(act, (
str, dict)) else act
def forward(self, x):
x_se = x.mean((2, 3), keepdim=True)
x_se = self.fc(x_se)
return x * self.act(x_se)
class CSPResStage(nn.Layer):
def __init__(self,
block_fn,
ch_in,
ch_out,
n,
stride,
act='relu',
attn='eca',
use_alpha=False):
super(CSPResStage, self).__init__()
ch_mid = (ch_in + ch_out) // 2
if stride == 2:
self.conv_down = ConvBNLayer(
ch_in, ch_mid, 3, stride=2, padding=1, act=act)
else:
self.conv_down = None
self.conv1 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act)
self.conv2 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act)
self.blocks = nn.Sequential(*[
block_fn(
ch_mid // 2,
ch_mid // 2,
act=act,
shortcut=True,
use_alpha=use_alpha) for i in range(n)
])
if attn:
self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid')
else:
self.attn = None
self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act)
def forward(self, x):
if self.conv_down is not None:
x = self.conv_down(x)
y1 = self.conv1(x)
y2 = self.blocks(self.conv2(x))
y = paddle.concat([y1, y2], axis=1)
if self.attn is not None:
y = self.attn(y)
y = self.conv3(y)
return y
@register
@serializable
class CSPResNet(nn.Layer):
__shared__ = ['width_mult', 'depth_mult', 'trt']
def __init__(self,
layers=[3, 6, 6, 3],
channels=[64, 128, 256, 512, 1024],
act='swish',
return_idx=[1, 2, 3],
depth_wise=False,
use_large_stem=False,
width_mult=1.0,
depth_mult=1.0,
trt=False,
use_checkpoint=False,
use_alpha=False,
**args):
super(CSPResNet, self).__init__()
self.use_checkpoint = use_checkpoint
channels = [max(round(c * width_mult), 1) for c in channels]
layers = [max(round(l * depth_mult), 1) for l in layers]
act = get_act_fn(
act, trt=trt) if act is None or isinstance(act,
(str, dict)) else act
if use_large_stem:
self.stem = nn.Sequential(
('conv1', ConvBNLayer(
3, channels[0] // 2, 3, stride=2, padding=1, act=act)),
('conv2', ConvBNLayer(
channels[0] // 2,
channels[0] // 2,
3,
stride=1,
padding=1,
act=act)), ('conv3', ConvBNLayer(
channels[0] // 2,
channels[0],
3,
stride=1,
padding=1,
act=act)))
else:
self.stem = nn.Sequential(
('conv1', ConvBNLayer(
3, channels[0] // 2, 3, stride=2, padding=1, act=act)),
('conv2', ConvBNLayer(
channels[0] // 2,
channels[0],
3,
stride=1,
padding=1,
act=act)))
n = len(channels) - 1
self.stages = nn.Sequential(*[(str(i), CSPResStage(
BasicBlock,
channels[i],
channels[i + 1],
layers[i],
2,
act=act,
use_alpha=use_alpha)) for i in range(n)])
self._out_channels = channels[1:]
self._out_strides = [4 * 2**i for i in range(n)]
self.return_idx = return_idx
if use_checkpoint:
paddle.seed(0)
def forward(self, inputs):
x = inputs['image']
x = self.stem(x)
outs = []
for idx, stage in enumerate(self.stages):
if self.use_checkpoint and self.training:
x = paddle.distributed.fleet.utils.recompute(
stage, x, **{"preserve_rng_state": True})
else:
x = stage(x)
if idx in self.return_idx:
outs.append(x)
return outs
@property
def out_shape(self):
return [
ShapeSpec(
channels=self._out_channels[i], stride=self._out_strides[i])
for i in self.return_idx
]
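# --- Illustrative usage sketch (not part of the original file) ---
# With the defaults, return_idx=[1, 2, 3] selects the stride 8/16/32 stages:
def _cspresnet_demo():
    import paddle
    model = CSPResNet()
    model.eval()
    outs = model({'image': paddle.rand([1, 3, 640, 640])})
    print([tuple(o.shape) for o in outs])
    # [(1, 256, 80, 80), (1, 512, 40, 40), (1, 1024, 20, 20)]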
================================================
FILE: ppdet/modeling/backbones/darknet.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from ppdet.modeling.ops import batch_norm, mish
from ..shape_spec import ShapeSpec
__all__ = ['DarkNet', 'ConvBNLayer']
class ConvBNLayer(nn.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size=3,
stride=1,
groups=1,
padding=0,
norm_type='bn',
norm_decay=0.,
act="leaky",
freeze_norm=False,
data_format='NCHW',
name=''):
"""
conv + bn + activation layer
Args:
ch_in (int): input channel
ch_out (int): output channel
filter_size (int): filter size, default 3
stride (int): stride, default 1
groups (int): number of groups of conv layer, default 1
padding (int): padding size, default 0
norm_type (str): batch norm type, default bn
norm_decay (float): decay for weight and bias of batch norm layer, default 0.
act (str): activation function type, default 'leaky', which means leaky_relu
freeze_norm (bool): whether to freeze norm, default False
data_format (str): data format, NCHW or NHWC
"""
super(ConvBNLayer, self).__init__()
self.conv = nn.Conv2D(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=filter_size,
stride=stride,
padding=padding,
groups=groups,
data_format=data_format,
bias_attr=False)
self.batch_norm = batch_norm(
ch_out,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format)
self.act = act
def forward(self, inputs):
out = self.conv(inputs)
out = self.batch_norm(out)
if self.act == 'leaky':
out = F.leaky_relu(out, 0.1)
else:
out = getattr(F, self.act)(out)
return out
class DownSample(nn.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size=3,
stride=2,
padding=1,
norm_type='bn',
norm_decay=0.,
freeze_norm=False,
data_format='NCHW'):
"""
downsample layer
Args:
ch_in (int): input channel
ch_out (int): output channel
filter_size (int): filter size, default 3
stride (int): stride, default 2
padding (int): padding size, default 1
norm_type (str): batch norm type, default bn
norm_decay (float): decay for weight and bias of batch norm layer, default 0.
freeze_norm (bool): whether to freeze norm, default False
data_format (str): data format, NCHW or NHWC
"""
super(DownSample, self).__init__()
self.conv_bn_layer = ConvBNLayer(
ch_in=ch_in,
ch_out=ch_out,
filter_size=filter_size,
stride=stride,
padding=padding,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format)
self.ch_out = ch_out
def forward(self, inputs):
out = self.conv_bn_layer(inputs)
return out
class BasicBlock(nn.Layer):
def __init__(self,
ch_in,
ch_out,
norm_type='bn',
norm_decay=0.,
freeze_norm=False,
data_format='NCHW'):
"""
BasicBlock layer of DarkNet
Args:
ch_in (int): input channel
ch_out (int): output channel
norm_type (str): batch norm type, default bn
norm_decay (float): decay for weight and bias of batch norm layer, default 0.
freeze_norm (bool): whether to freeze norm, default False
data_format (str): data format, NCHW or NHWC
"""
super(BasicBlock, self).__init__()
assert ch_in == ch_out and (ch_in % 2) == 0, \
f"ch_in and ch_out should be the same even int, but the input \'ch_in is {ch_in}, \'ch_out is {ch_out}"
# example:
# --------------{conv1} --> {conv2}
# channel route: 10-->5 --> 5-->10
self.conv1 = ConvBNLayer(
ch_in=ch_in,
ch_out=int(ch_out / 2),
filter_size=1,
stride=1,
padding=0,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format)
self.conv2 = ConvBNLayer(
ch_in=int(ch_out / 2),
ch_out=ch_out,
filter_size=3,
stride=1,
padding=1,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format)
def forward(self, inputs):
conv1 = self.conv1(inputs)
conv2 = self.conv2(conv1)
out = paddle.add(x=inputs, y=conv2)
return out
class Blocks(nn.Layer):
def __init__(self,
ch_in,
ch_out,
count,
norm_type='bn',
norm_decay=0.,
freeze_norm=False,
name=None,
data_format='NCHW'):
"""
Blocks layer, which consists of several BasicBlock layers
Args:
ch_in (int): input channel
ch_out (int): output channel
count (int): number of BasicBlock layer
norm_type (str): batch norm type, default bn
norm_decay (float): decay for weight and bias of batch norm layer, default 0.
freeze_norm (bool): whether to freeze norm, default False
name (str): layer name
data_format (str): data format, NCHW or NHWC
"""
super(Blocks, self).__init__()
self.basicblock0 = BasicBlock(
ch_in,
ch_out,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format)
self.res_out_list = []
for i in range(1, count):
block_name = '{}.{}'.format(name, i)
res_out = self.add_sublayer(
block_name,
BasicBlock(
ch_out,
ch_out,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format))
self.res_out_list.append(res_out)
self.ch_out = ch_out
def forward(self, inputs):
y = self.basicblock0(inputs)
for basic_block_i in self.res_out_list:
y = basic_block_i(y)
return y
DarkNet_cfg = {53: ([1, 2, 8, 8, 4])}
@register
@serializable
class DarkNet(nn.Layer):
__shared__ = ['norm_type', 'data_format']
def __init__(self,
depth=53,
freeze_at=-1,
return_idx=[2, 3, 4],
num_stages=5,
norm_type='bn',
norm_decay=0.,
freeze_norm=False,
data_format='NCHW'):
"""
Darknet, see https://pjreddie.com/darknet/yolo/
Args:
depth (int): depth of network
freeze_at (int): freeze the backbone at which stage
return_idx (list): index of stages whose feature maps are returned
num_stages (int): number of stages, default 5
norm_type (str): batch norm type, default bn
norm_decay (float): decay for weight and bias of batch norm layer, default 0.
freeze_norm (bool): whether to freeze norm, default False
data_format (str): data format, NCHW or NHWC
"""
super(DarkNet, self).__init__()
self.depth = depth
self.freeze_at = freeze_at
self.return_idx = return_idx
self.num_stages = num_stages
self.stages = DarkNet_cfg[self.depth][0:num_stages]
self.conv0 = ConvBNLayer(
ch_in=3,
ch_out=32,
filter_size=3,
stride=1,
padding=1,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format)
self.downsample0 = DownSample(
ch_in=32,
ch_out=32 * 2,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format)
self._out_channels = []
self.darknet_conv_block_list = []
self.downsample_list = []
ch_in = [64, 128, 256, 512, 1024]
for i, stage in enumerate(self.stages):
name = 'stage.{}'.format(i)
conv_block = self.add_sublayer(
name,
Blocks(
int(ch_in[i]),
int(ch_in[i]),
stage,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format,
name=name))
self.darknet_conv_block_list.append(conv_block)
if i in return_idx:
self._out_channels.append(int(ch_in[i]))
for i in range(num_stages - 1):
down_name = 'stage.{}.downsample'.format(i)
downsample = self.add_sublayer(
down_name,
DownSample(
ch_in=int(ch_in[i]),
ch_out=int(ch_in[i + 1]),
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format))
self.downsample_list.append(downsample)
def forward(self, inputs):
x = inputs['image']
out = self.conv0(x)
out = self.downsample0(out)
blocks = []
for i, conv_block_i in enumerate(self.darknet_conv_block_list):
out = conv_block_i(out)
if i == self.freeze_at:
out.stop_gradient = True
if i in self.return_idx:
blocks.append(out)
if i < self.num_stages - 1:
out = self.downsample_list[i](out)
return blocks
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self._out_channels]
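# --- Illustrative usage sketch (not part of the original file) ---
# DarkNet-53 with the default return_idx=[2, 3, 4] yields the three feature
# maps consumed by YOLOv3-style heads:
def _darknet_demo():
    import paddle
    model = DarkNet(depth=53)
    model.eval()
    blocks = model({'image': paddle.rand([1, 3, 608, 608])})
    print([tuple(b.shape) for b in blocks])
    # [(1, 256, 76, 76), (1, 512, 38, 38), (1, 1024, 19, 19)]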
================================================
FILE: ppdet/modeling/backbones/dla.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from ppdet.modeling.layers import ConvNormLayer
from ..shape_spec import ShapeSpec
DLA_cfg = {34: ([1, 1, 1, 2, 2, 1], [16, 32, 64, 128, 256, 512]), }
class BasicBlock(nn.Layer):
def __init__(self, ch_in, ch_out, stride=1):
super(BasicBlock, self).__init__()
self.conv1 = ConvNormLayer(
ch_in,
ch_out,
filter_size=3,
stride=stride,
bias_on=False,
norm_decay=None)
self.conv2 = ConvNormLayer(
ch_out,
ch_out,
filter_size=3,
stride=1,
bias_on=False,
norm_decay=None)
def forward(self, inputs, residual=None):
if residual is None:
residual = inputs
out = self.conv1(inputs)
out = F.relu(out)
out = self.conv2(out)
out = paddle.add(x=out, y=residual)
out = F.relu(out)
return out
class Root(nn.Layer):
def __init__(self, ch_in, ch_out, kernel_size, residual):
super(Root, self).__init__()
self.conv = ConvNormLayer(
ch_in,
ch_out,
filter_size=1,
stride=1,
bias_on=False,
norm_decay=None)
self.residual = residual
def forward(self, inputs):
children = inputs
out = self.conv(paddle.concat(inputs, axis=1))
if self.residual:
out = paddle.add(x=out, y=children[0])
out = F.relu(out)
return out
class Tree(nn.Layer):
def __init__(self,
level,
block,
ch_in,
ch_out,
stride=1,
level_root=False,
root_dim=0,
root_kernel_size=1,
root_residual=False):
super(Tree, self).__init__()
if root_dim == 0:
root_dim = 2 * ch_out
if level_root:
root_dim += ch_in
if level == 1:
self.tree1 = block(ch_in, ch_out, stride)
self.tree2 = block(ch_out, ch_out, 1)
else:
self.tree1 = Tree(
level - 1,
block,
ch_in,
ch_out,
stride,
root_dim=0,
root_kernel_size=root_kernel_size,
root_residual=root_residual)
self.tree2 = Tree(
level - 1,
block,
ch_out,
ch_out,
1,
root_dim=root_dim + ch_out,
root_kernel_size=root_kernel_size,
root_residual=root_residual)
if level == 1:
self.root = Root(root_dim, ch_out, root_kernel_size, root_residual)
self.level_root = level_root
self.root_dim = root_dim
self.downsample = None
self.project = None
self.level = level
if stride > 1:
self.downsample = nn.MaxPool2D(stride, stride=stride)
if ch_in != ch_out:
self.project = ConvNormLayer(
ch_in,
ch_out,
filter_size=1,
stride=1,
bias_on=False,
norm_decay=None)
def forward(self, x, residual=None, children=None):
children = [] if children is None else children
bottom = self.downsample(x) if self.downsample else x
residual = self.project(bottom) if self.project else bottom
if self.level_root:
children.append(bottom)
x1 = self.tree1(x, residual)
if self.level == 1:
x2 = self.tree2(x1)
x = self.root([x2, x1] + children)
else:
children.append(x1)
x = self.tree2(x1, children=children)
return x
@register
@serializable
class DLA(nn.Layer):
"""
DLA, see https://arxiv.org/pdf/1707.06484.pdf
Args:
depth (int): DLA depth, only support 34 now.
residual_root (bool): whether to use a residual layer in the root block
pre_img (bool): add pre_img, only used in CenterTrack
pre_hm (bool): add pre_hm, only used in CenterTrack
"""
def __init__(self,
depth=34,
residual_root=False,
pre_img=False,
pre_hm=False):
super(DLA, self).__init__()
assert depth == 34, 'Only support DLA with depth of 34 now.'
if depth == 34:
block = BasicBlock
levels, channels = DLA_cfg[depth]
self.channels = channels
self.num_levels = len(levels)
self.base_layer = nn.Sequential(
ConvNormLayer(
3,
channels[0],
filter_size=7,
stride=1,
bias_on=False,
norm_decay=None),
nn.ReLU())
self.level0 = self._make_conv_level(channels[0], channels[0], levels[0])
self.level1 = self._make_conv_level(
channels[0], channels[1], levels[1], stride=2)
self.level2 = Tree(
levels[2],
block,
channels[1],
channels[2],
2,
level_root=False,
root_residual=residual_root)
self.level3 = Tree(
levels[3],
block,
channels[2],
channels[3],
2,
level_root=True,
root_residual=residual_root)
self.level4 = Tree(
levels[4],
block,
channels[3],
channels[4],
2,
level_root=True,
root_residual=residual_root)
self.level5 = Tree(
levels[5],
block,
channels[4],
channels[5],
2,
level_root=True,
root_residual=residual_root)
if pre_img:
self.pre_img_layer = nn.Sequential(
ConvNormLayer(
3,
channels[0],
filter_size=7,
stride=1,
bias_on=False,
norm_decay=None),
nn.ReLU())
if pre_hm:
self.pre_hm_layer = nn.Sequential(
ConvNormLayer(
1,
channels[0],
filter_size=7,
stride=1,
bias_on=False,
norm_decay=None),
nn.ReLU())
self.pre_img = pre_img
self.pre_hm = pre_hm
def _make_conv_level(self, ch_in, ch_out, conv_num, stride=1):
modules = []
for i in range(conv_num):
modules.extend([
ConvNormLayer(
ch_in,
ch_out,
filter_size=3,
stride=stride if i == 0 else 1,
bias_on=False,
norm_decay=None), nn.ReLU()
])
ch_in = ch_out
return nn.Sequential(*modules)
@property
def out_shape(self):
return [
ShapeSpec(channels=self.channels[i]) for i in range(self.num_levels)
]
def forward(self, inputs):
outs = []
feats = self.base_layer(inputs['image'])
if self.pre_img and 'pre_image' in inputs and inputs[
'pre_image'] is not None:
feats = feats + self.pre_img_layer(inputs['pre_image'])
if self.pre_hm and 'pre_hm' in inputs and inputs['pre_hm'] is not None:
feats = feats + self.pre_hm_layer(inputs['pre_hm'])
for i in range(self.num_levels):
feats = getattr(self, 'level{}'.format(i))(feats)
outs.append(feats)
return outs
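# --- Illustrative usage sketch (not part of the original file) ---
# DLA-34 returns all six levels, at strides 1, 2, 4, 8, 16 and 32:
def _dla_demo():
    import paddle
    model = DLA(depth=34)
    model.eval()
    outs = model({'image': paddle.rand([1, 3, 512, 512])})
    print([tuple(o.shape) for o in outs])
    # [(1, 16, 512, 512), (1, 32, 256, 256), (1, 64, 128, 128),
    #  (1, 128, 64, 64), (1, 256, 32, 32), (1, 512, 16, 16)]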
================================================
FILE: ppdet/modeling/backbones/esnet.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm
from paddle.nn.initializer import KaimingNormal
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, serializable
from numbers import Integral
from ..shape_spec import ShapeSpec
from ppdet.modeling.ops import channel_shuffle
from ppdet.modeling.backbones.shufflenet_v2 import ConvBNLayer
__all__ = ['ESNet']
def make_divisible(v, divisor=16, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
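# Worked examples (not part of the original file): make_divisible rounds to
# the nearest multiple of `divisor`, bumping up whenever rounding would drop
# more than 10% below the requested value.
#   make_divisible(100)     -> int(100 + 8) // 16 * 16 = 96   (96 >= 90, kept)
#   make_divisible(120)     -> int(120 + 8) // 16 * 16 = 128
#   make_divisible(40, 32)  -> int(40 + 16) // 32 * 32 = 32   (32 < 36, so 64)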
class SEModule(nn.Layer):
def __init__(self, channel, reduction=4):
super(SEModule, self).__init__()
self.avg_pool = AdaptiveAvgPool2D(1)
self.conv1 = Conv2D(
in_channels=channel,
out_channels=channel // reduction,
kernel_size=1,
stride=1,
padding=0,
weight_attr=ParamAttr(),
bias_attr=ParamAttr())
self.conv2 = Conv2D(
in_channels=channel // reduction,
out_channels=channel,
kernel_size=1,
stride=1,
padding=0,
weight_attr=ParamAttr(),
bias_attr=ParamAttr())
def forward(self, inputs):
outputs = self.avg_pool(inputs)
outputs = self.conv1(outputs)
outputs = F.relu(outputs)
outputs = self.conv2(outputs)
outputs = F.hardsigmoid(outputs)
return paddle.multiply(x=inputs, y=outputs)
class InvertedResidual(nn.Layer):
def __init__(self,
in_channels,
mid_channels,
out_channels,
stride,
act="relu"):
super(InvertedResidual, self).__init__()
self._conv_pw = ConvBNLayer(
in_channels=in_channels // 2,
out_channels=mid_channels // 2,
kernel_size=1,
stride=1,
padding=0,
groups=1,
act=act)
self._conv_dw = ConvBNLayer(
in_channels=mid_channels // 2,
out_channels=mid_channels // 2,
kernel_size=3,
stride=stride,
padding=1,
groups=mid_channels // 2,
act=None)
self._se = SEModule(mid_channels)
self._conv_linear = ConvBNLayer(
in_channels=mid_channels,
out_channels=out_channels // 2,
kernel_size=1,
stride=1,
padding=0,
groups=1,
act=act)
def forward(self, inputs):
x1, x2 = paddle.split(
inputs,
num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2],
axis=1)
x2 = self._conv_pw(x2)
x3 = self._conv_dw(x2)
x3 = paddle.concat([x2, x3], axis=1)
x3 = self._se(x3)
x3 = self._conv_linear(x3)
out = paddle.concat([x1, x3], axis=1)
return channel_shuffle(out, 2)
class InvertedResidualDS(nn.Layer):
def __init__(self,
in_channels,
mid_channels,
out_channels,
stride,
act="relu"):
super(InvertedResidualDS, self).__init__()
# branch1
self._conv_dw_1 = ConvBNLayer(
in_channels=in_channels,
out_channels=in_channels,
kernel_size=3,
stride=stride,
padding=1,
groups=in_channels,
act=None)
self._conv_linear_1 = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels // 2,
kernel_size=1,
stride=1,
padding=0,
groups=1,
act=act)
# branch2
self._conv_pw_2 = ConvBNLayer(
in_channels=in_channels,
out_channels=mid_channels // 2,
kernel_size=1,
stride=1,
padding=0,
groups=1,
act=act)
self._conv_dw_2 = ConvBNLayer(
in_channels=mid_channels // 2,
out_channels=mid_channels // 2,
kernel_size=3,
stride=stride,
padding=1,
groups=mid_channels // 2,
act=None)
self._se = SEModule(mid_channels // 2)
self._conv_linear_2 = ConvBNLayer(
in_channels=mid_channels // 2,
out_channels=out_channels // 2,
kernel_size=1,
stride=1,
padding=0,
groups=1,
act=act)
self._conv_dw_mv1 = ConvBNLayer(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
stride=1,
padding=1,
groups=out_channels,
act="hard_swish")
self._conv_pw_mv1 = ConvBNLayer(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=1,
stride=1,
padding=0,
groups=1,
act="hard_swish")
def forward(self, inputs):
x1 = self._conv_dw_1(inputs)
x1 = self._conv_linear_1(x1)
x2 = self._conv_pw_2(inputs)
x2 = self._conv_dw_2(x2)
x2 = self._se(x2)
x2 = self._conv_linear_2(x2)
out = paddle.concat([x1, x2], axis=1)
out = self._conv_dw_mv1(out)
out = self._conv_pw_mv1(out)
return out
@register
@serializable
class ESNet(nn.Layer):
def __init__(self,
scale=1.0,
act="hard_swish",
feature_maps=[4, 11, 14],
channel_ratio=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]):
super(ESNet, self).__init__()
self.scale = scale
if isinstance(feature_maps, Integral):
feature_maps = [feature_maps]
self.feature_maps = feature_maps
stage_repeats = [3, 7, 3]
stage_out_channels = [
-1, 24, make_divisible(128 * scale), make_divisible(256 * scale),
make_divisible(512 * scale), 1024
]
self._out_channels = []
self._feature_idx = 0
# 1. conv1
self._conv1 = ConvBNLayer(
in_channels=3,
out_channels=stage_out_channels[1],
kernel_size=3,
stride=2,
padding=1,
act=act)
self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
self._feature_idx += 1
# 2. bottleneck sequences
self._block_list = []
arch_idx = 0
for stage_id, num_repeat in enumerate(stage_repeats):
for i in range(num_repeat):
channels_scales = channel_ratio[arch_idx]
mid_c = make_divisible(
int(stage_out_channels[stage_id + 2] * channels_scales),
divisor=8)
if i == 0:
block = self.add_sublayer(
name=str(stage_id + 2) + '_' + str(i + 1),
sublayer=InvertedResidualDS(
in_channels=stage_out_channels[stage_id + 1],
mid_channels=mid_c,
out_channels=stage_out_channels[stage_id + 2],
stride=2,
act=act))
else:
block = self.add_sublayer(
name=str(stage_id + 2) + '_' + str(i + 1),
sublayer=InvertedResidual(
in_channels=stage_out_channels[stage_id + 2],
mid_channels=mid_c,
out_channels=stage_out_channels[stage_id + 2],
stride=1,
act=act))
self._block_list.append(block)
arch_idx += 1
self._feature_idx += 1
self._update_out_channels(stage_out_channels[stage_id + 2],
self._feature_idx, self.feature_maps)
def _update_out_channels(self, channel, feature_idx, feature_maps):
if feature_idx in feature_maps:
self._out_channels.append(channel)
def forward(self, inputs):
y = self._conv1(inputs['image'])
y = self._max_pool(y)
outs = []
for i, inv in enumerate(self._block_list):
y = inv(y)
if i + 2 in self.feature_maps:
outs.append(y)
return outs
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self._out_channels]
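# --- Illustrative usage sketch (not part of the original file) ---
# With the default feature_maps=[4, 11, 14], ESNet returns the last block of
# each of its three stages, at strides 8, 16 and 32:
def _esnet_demo():
    import paddle
    model = ESNet(scale=1.0)
    model.eval()
    outs = model({'image': paddle.rand([1, 3, 320, 320])})
    print([tuple(o.shape) for o in outs])
    # [(1, 128, 40, 40), (1, 256, 20, 20), (1, 512, 10, 10)]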
================================================
FILE: ppdet/modeling/backbones/focalnet.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on https://github.com/microsoft/FocalNet/blob/main/classification/focalnet.py
"""
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.modeling.shape_spec import ShapeSpec
from ppdet.core.workspace import register, serializable
from .transformer_utils import DropPath, Identity
from .transformer_utils import add_parameter, to_2tuple
from .transformer_utils import ones_, zeros_, trunc_normal_
from .swin_transformer import Mlp
__all__ = ['FocalNet']
MODEL_cfg = {
'focalnet_T_224_1k_srf': dict(
embed_dim=96,
depths=[2, 2, 6, 2],
focal_levels=[2, 2, 2, 2],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.2,
use_conv_embed=False,
use_postln=False,
use_postln_in_modulation=False,
use_layerscale=False,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_srf_pretrained.pdparams',
),
'focalnet_S_224_1k_srf': dict(
embed_dim=96,
depths=[2, 2, 18, 2],
focal_levels=[2, 2, 2, 2],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.3,
use_conv_embed=False,
use_postln=False,
use_postln_in_modulation=False,
use_layerscale=False,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_srf_pretrained.pdparams',
),
'focalnet_B_224_1k_srf': dict(
embed_dim=128,
depths=[2, 2, 18, 2],
focal_levels=[2, 2, 2, 2],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.5,
use_conv_embed=False,
use_postln=False,
use_postln_in_modulation=False,
use_layerscale=False,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_srf_pretrained.pdparams',
),
'focalnet_T_224_1k_lrf': dict(
embed_dim=96,
depths=[2, 2, 6, 2],
focal_levels=[3, 3, 3, 3],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.2,
use_conv_embed=False,
use_postln=False,
use_postln_in_modulation=False,
use_layerscale=False,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_lrf_pretrained.pdparams',
),
'focalnet_S_224_1k_lrf': dict(
embed_dim=96,
depths=[2, 2, 18, 2],
focal_levels=[3, 3, 3, 3],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.3,
use_conv_embed=False,
use_postln=False,
use_postln_in_modulation=False,
use_layerscale=False,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_lrf_pretrained.pdparams',
),
'focalnet_B_224_1k_lrf': dict(
embed_dim=128,
depths=[2, 2, 18, 2],
focal_levels=[3, 3, 3, 3],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.5,
use_conv_embed=False,
use_postln=False,
use_postln_in_modulation=False,
use_layerscale=False,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_lrf_pretrained.pdparams',
),
'focalnet_L_384_22k_fl3': dict(
embed_dim=192,
depths=[2, 2, 18, 2],
focal_levels=[3, 3, 3, 3],
focal_windows=[5, 5, 5, 5],
drop_path_rate=0.5,
use_conv_embed=True,
use_postln=True,
use_postln_in_modulation=False,
use_layerscale=True,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_pretrained.pdparams',
),
'focalnet_L_384_22k_fl4': dict(
embed_dim=192,
depths=[2, 2, 18, 2],
focal_levels=[4, 4, 4, 4],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.5,
use_conv_embed=True,
use_postln=True,
use_postln_in_modulation=False,
use_layerscale=True,
normalize_modulator=True, #
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_fl4_pretrained.pdparams',
),
'focalnet_XL_384_22k_fl3': dict(
embed_dim=256,
depths=[2, 2, 18, 2],
focal_levels=[3, 3, 3, 3],
focal_windows=[5, 5, 5, 5],
drop_path_rate=0.5,
use_conv_embed=True,
use_postln=True,
use_postln_in_modulation=False,
use_layerscale=True,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_pretrained.pdparams',
),
'focalnet_XL_384_22k_fl4': dict(
embed_dim=256,
depths=[2, 2, 18, 2],
focal_levels=[4, 4, 4, 4],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.5,
use_conv_embed=True,
use_postln=True,
use_postln_in_modulation=False,
use_layerscale=True,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_fl4_pretrained.pdparams',
),
'focalnet_H_224_22k_fl3': dict(
embed_dim=352,
depths=[2, 2, 18, 2],
focal_levels=[3, 3, 3, 3],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.5,
use_conv_embed=True,
use_postln=True,
use_postln_in_modulation=True, #
use_layerscale=True,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_pretrained.pdparams',
),
'focalnet_H_224_22k_fl4': dict(
embed_dim=352,
depths=[2, 2, 18, 2],
focal_levels=[4, 4, 4, 4],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.5,
use_conv_embed=True,
use_postln=True,
use_postln_in_modulation=True, #
use_layerscale=True,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_fl4_pretrained.pdparams',
),
}
class FocalModulation(nn.Layer):
"""
Args:
dim (int): Number of input channels.
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
focal_level (int): Number of focal levels
focal_window (int): Focal window size at focal level 1
focal_factor (int): Step to increase the focal window. Default: 2
use_postln_in_modulation (bool): Whether to apply layernorm inside the modulation
normalize_modulator (bool): Whether to normalize the modulator by (focal_level + 1)
"""
def __init__(self,
dim,
proj_drop=0.,
focal_level=2,
focal_window=7,
focal_factor=2,
use_postln_in_modulation=False,
normalize_modulator=False):
super().__init__()
self.dim = dim
# specific args for focalv3
self.focal_level = focal_level
self.focal_window = focal_window
self.focal_factor = focal_factor
self.use_postln_in_modulation = use_postln_in_modulation
self.normalize_modulator = normalize_modulator
self.f = nn.Linear(
dim, 2 * dim + (self.focal_level + 1), bias_attr=True)
self.h = nn.Conv2D(
dim,
dim,
kernel_size=1,
stride=1,
padding=0,
groups=1,
bias_attr=True)
self.act = nn.GELU()
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
self.focal_layers = nn.LayerList()
if self.use_postln_in_modulation:
self.ln = nn.LayerNorm(dim)
for k in range(self.focal_level):
kernel_size = self.focal_factor * k + self.focal_window
self.focal_layers.append(
nn.Sequential(
nn.Conv2D(
dim,
dim,
kernel_size=kernel_size,
stride=1,
groups=dim,
padding=kernel_size // 2,
bias_attr=False),
nn.GELU()))
def forward(self, x):
""" Forward function.
Args:
x: input features with shape of (B, H, W, C)
"""
_, _, _, C = x.shape
x = self.f(x)
x = x.transpose([0, 3, 1, 2])
q, ctx, gates = paddle.split(x, (C, C, self.focal_level + 1), 1)
ctx_all = 0
for l in range(self.focal_level):
ctx = self.focal_layers[l](ctx)
ctx_all = ctx_all + ctx * gates[:, l:l + 1]
ctx_global = self.act(ctx.mean(2, keepdim=True).mean(3, keepdim=True))
ctx_all = ctx_all + ctx_global * gates[:, self.focal_level:]
if self.normalize_modulator:
ctx_all = ctx_all / (self.focal_level + 1)
x_out = q * self.h(ctx_all)
x_out = x_out.transpose([0, 2, 3, 1])
if self.use_postln_in_modulation:
x_out = self.ln(x_out)
x_out = self.proj(x_out)
x_out = self.proj_drop(x_out)
return x_out
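# --- Illustrative sketch (not part of the original file) ---
# FocalModulation consumes channels-last features and preserves their shape;
# the gates split from self.f weight the per-level contexts plus one global
# average context:
def _focal_modulation_demo():
    import paddle
    fm = FocalModulation(dim=96, focal_level=2, focal_window=7)
    x = paddle.rand([1, 14, 14, 96])  # (B, H, W, C)
    print(fm(x).shape)  # [1, 14, 14, 96]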
class FocalModulationBlock(nn.Layer):
""" Focal Modulation Block.
Args:
dim (int): Number of input channels.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
drop (float, optional): Dropout rate. Default: 0.0
drop_path (float, optional): Stochastic depth rate. Default: 0.0
act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
focal_level (int): number of focal levels
focal_window (int): focal kernel size at level 1
use_postln (bool): Whether to use layernorm after modulation. Default: False.
use_postln_in_modulation (bool): Whether to apply layernorm inside the modulation. Default: False.
normalize_modulator (bool): Whether to normalize the modulator
use_layerscale (bool): Whether to use layerscale proposed in CaiT. Default: False
layerscale_value (float): Value for layer scale. Default: 1e-4
"""
def __init__(self,
dim,
mlp_ratio=4.,
drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm,
focal_level=2,
focal_window=9,
use_postln=False,
use_postln_in_modulation=False,
normalize_modulator=False,
use_layerscale=False,
layerscale_value=1e-4):
super().__init__()
self.dim = dim
self.mlp_ratio = mlp_ratio
self.focal_window = focal_window
self.focal_level = focal_level
self.use_postln = use_postln
self.use_layerscale = use_layerscale
self.norm1 = norm_layer(dim)
self.modulation = FocalModulation(
dim,
proj_drop=drop,
focal_level=self.focal_level,
focal_window=self.focal_window,
use_postln_in_modulation=use_postln_in_modulation,
normalize_modulator=normalize_modulator)
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)
self.H = None
self.W = None
self.gamma_1 = 1.0
self.gamma_2 = 1.0
if self.use_layerscale:
self.gamma_1 = add_parameter(self,
layerscale_value * paddle.ones([dim]))
self.gamma_2 = add_parameter(self,
layerscale_value * paddle.ones([dim]))
def forward(self, x):
"""
Args:
x: Input feature, tensor size (B, H*W, C).
"""
B, L, C = x.shape
H, W = self.H, self.W
assert L == H * W, "input feature has wrong size"
shortcut = x
if not self.use_postln:
x = self.norm1(x)
x = x.reshape([-1, H, W, C])
# FM
x = self.modulation(x).reshape([-1, H * W, C])
if self.use_postln:
x = self.norm1(x)
# FFN
x = shortcut + self.drop_path(self.gamma_1 * x)
if self.use_postln:
x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x)))
else:
x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
return x
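# Usage sketch (hedged; sizes are assumptions). The block operates on
# flattened tokens of shape (B, H*W, C) and relies on the caller setting
# .H and .W first, exactly as BasicLayer.forward does below.
#
#   blk = FocalModulationBlock(dim=96, focal_level=2, focal_window=9)
#   blk.H, blk.W = 56, 56
#   out = blk(paddle.randn([2, 56 * 56, 96]))  # [2, 3136, 96]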
class BasicLayer(nn.Layer):
""" A basic focal modulation layer for one stage.
Args:
dim (int): Number of feature channels
depth (int): Depths of this stage.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
drop (float, optional): Dropout rate. Default: 0.0
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
focal_level (int): Number of focal levels
focal_window (int): Focal window size at focal level 1
        use_conv_embed (bool): Whether to use overlapped convolution for patch embedding
        use_layerscale (bool): Whether to use the layerscale proposed in CaiT. Default: False
        layerscale_value (float): Value of layerscale
        use_postln (bool): Whether to use layernorm after modulation. Default: False.
        use_postln_in_modulation (bool): Whether to apply layernorm inside the modulation. Default: False.
        normalize_modulator (bool): Whether to normalize the modulator output by (focal_level + 1)
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
"""
def __init__(self,
dim,
depth,
mlp_ratio=4.,
drop=0.,
drop_path=0.,
norm_layer=nn.LayerNorm,
downsample=None,
focal_level=2,
focal_window=9,
use_conv_embed=False,
use_layerscale=False,
layerscale_value=1e-4,
use_postln=False,
use_postln_in_modulation=False,
normalize_modulator=False,
use_checkpoint=False):
super().__init__()
self.depth = depth
self.use_checkpoint = use_checkpoint
# build blocks
self.blocks = nn.LayerList([
FocalModulationBlock(
dim=dim,
mlp_ratio=mlp_ratio,
drop=drop,
drop_path=drop_path[i]
if isinstance(drop_path, np.ndarray) else drop_path,
act_layer=nn.GELU,
norm_layer=norm_layer,
focal_level=focal_level,
focal_window=focal_window,
use_postln=use_postln,
use_postln_in_modulation=use_postln_in_modulation,
normalize_modulator=normalize_modulator,
use_layerscale=use_layerscale,
layerscale_value=layerscale_value) for i in range(depth)
])
# patch merging layer
if downsample is not None:
self.downsample = downsample(
patch_size=2,
in_chans=dim,
embed_dim=2 * dim,
use_conv_embed=use_conv_embed,
norm_layer=norm_layer,
is_stem=False)
else:
self.downsample = None
def forward(self, x, H, W):
"""
Args:
x: Input feature, tensor size (B, H*W, C).
"""
for blk in self.blocks:
blk.H, blk.W = H, W
x = blk(x)
if self.downsample is not None:
x_reshaped = x.transpose([0, 2, 1]).reshape(
[x.shape[0], x.shape[-1], H, W])
x_down = self.downsample(x_reshaped)
x_down = x_down.flatten(2).transpose([0, 2, 1])
Wh, Ww = (H + 1) // 2, (W + 1) // 2
return x, H, W, x_down, Wh, Ww
else:
return x, H, W, x, H, W
class PatchEmbed(nn.Layer):
""" Image to Patch Embedding
Args:
patch_size (int): Patch token size. Default: 4.
in_chans (int): Number of input image channels. Default: 3.
embed_dim (int): Number of linear projection output channels. Default: 96.
norm_layer (nn.Layer, optional): Normalization layer. Default: None
use_conv_embed (bool): Whether use overlapped convolution for patch embedding. Default: False
        is_stem (bool): Whether this is the stem block.
"""
def __init__(self,
patch_size=4,
in_chans=3,
embed_dim=96,
norm_layer=None,
use_conv_embed=False,
is_stem=False):
super().__init__()
patch_size = to_2tuple(patch_size)
self.patch_size = patch_size
self.in_chans = in_chans
self.embed_dim = embed_dim
if use_conv_embed:
# if we choose to use conv embedding, then we treat the stem and non-stem differently
if is_stem:
kernel_size = 7
padding = 2
stride = 4
else:
kernel_size = 3
padding = 1
stride = 2
self.proj = nn.Conv2D(
in_chans,
embed_dim,
kernel_size=kernel_size,
stride=stride,
padding=padding)
else:
self.proj = nn.Conv2D(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
if norm_layer is not None:
self.norm = norm_layer(embed_dim)
else:
self.norm = None
def forward(self, x):
_, _, H, W = x.shape
if W % self.patch_size[1] != 0:
# for 3D tensor: [pad_left, pad_right]
# for 4D tensor: [pad_left, pad_right, pad_top, pad_bottom]
x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0])
            W += self.patch_size[1] - W % self.patch_size[1]
if H % self.patch_size[0] != 0:
x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]])
            H += self.patch_size[0] - H % self.patch_size[0]
x = self.proj(x)
if self.norm is not None:
_, _, Wh, Ww = x.shape
x = x.flatten(2).transpose([0, 2, 1])
x = self.norm(x)
x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww])
return x
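# Shape sketch (illustrative, assumed sizes). With the default non-conv
# embedding, proj is a stride-4 convolution, so a 224x224 image becomes a
# 56x56 grid of embed_dim-channel features; inputs whose sides are not
# multiples of patch_size are zero-padded on the right/bottom first.
#
#   pe = PatchEmbed(patch_size=4, in_chans=3, embed_dim=96)
#   feat = pe(paddle.randn([2, 3, 224, 224]))  # [2, 96, 56, 56]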
@register
@serializable
class FocalNet(nn.Layer):
""" FocalNet backbone
Args:
arch (str): Architecture of FocalNet
out_indices (Sequence[int]): Output from which stages.
frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
-1 means not freezing any parameters.
patch_size (int | tuple(int)): Patch size. Default: 4.
in_chans (int): Number of input image channels. Default: 3.
embed_dim (int): Number of linear projection output channels. Default: 96.
depths (tuple[int]): Depths of each FocalNet Transformer stage.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
drop_rate (float): Dropout rate.
drop_path_rate (float): Stochastic depth rate. Default: 0.2.
norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
patch_norm (bool): If True, add normalization after patch embedding. Default: True.
focal_levels (Sequence[int]): Number of focal levels at four stages
focal_windows (Sequence[int]): Focal window sizes at first focal level at four stages
        use_conv_embed (bool): Whether to use overlapped convolution for patch embedding
        use_layerscale (bool): Whether to use the layerscale proposed in CaiT. Default: False
        layerscale_value (float): Value of layerscale
        use_postln (bool): Whether to use layernorm after modulation. Default: False.
        use_postln_in_modulation (bool): Whether to apply layernorm inside the modulation. Default: False.
        normalize_modulator (bool): Whether to normalize the modulator output by (focal_level + 1)
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
"""
def __init__(
self,
arch='focalnet_T_224_1k_srf',
out_indices=(0, 1, 2, 3),
frozen_stages=-1,
patch_size=4,
in_chans=3,
embed_dim=96,
depths=[2, 2, 6, 2],
mlp_ratio=4.,
drop_rate=0.,
drop_path_rate=0.2, # 0.5 better for large+ models
norm_layer=nn.LayerNorm,
patch_norm=True,
focal_levels=[2, 2, 2, 2],
focal_windows=[3, 3, 3, 3],
use_conv_embed=False,
use_layerscale=False,
layerscale_value=1e-4,
use_postln=False,
use_postln_in_modulation=False,
normalize_modulator=False,
use_checkpoint=False,
pretrained=None):
super(FocalNet, self).__init__()
assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch)
embed_dim = MODEL_cfg[arch]['embed_dim']
depths = MODEL_cfg[arch]['depths']
drop_path_rate = MODEL_cfg[arch]['drop_path_rate']
focal_levels = MODEL_cfg[arch]['focal_levels']
focal_windows = MODEL_cfg[arch]['focal_windows']
use_conv_embed = MODEL_cfg[arch]['use_conv_embed']
use_layerscale = MODEL_cfg[arch]['use_layerscale']
use_postln = MODEL_cfg[arch]['use_postln']
use_postln_in_modulation = MODEL_cfg[arch]['use_postln_in_modulation']
normalize_modulator = MODEL_cfg[arch]['normalize_modulator']
if pretrained is None:
pretrained = MODEL_cfg[arch]['pretrained']
self.out_indices = out_indices
self.frozen_stages = frozen_stages
self.num_layers = len(depths)
self.patch_norm = patch_norm
# split image into non-overlapping patches
self.patch_embed = PatchEmbed(
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
norm_layer=norm_layer if self.patch_norm else None,
use_conv_embed=use_conv_embed,
is_stem=True)
self.pos_drop = nn.Dropout(p=drop_rate)
# stochastic depth decay rule
dpr = np.linspace(0, drop_path_rate, sum(depths))
# build layers
self.layers = nn.LayerList()
for i_layer in range(self.num_layers):
layer = BasicLayer(
dim=int(embed_dim * 2**i_layer),
depth=depths[i_layer],
mlp_ratio=mlp_ratio,
drop=drop_rate,
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
norm_layer=norm_layer,
downsample=PatchEmbed
if (i_layer < self.num_layers - 1) else None,
focal_level=focal_levels[i_layer],
focal_window=focal_windows[i_layer],
use_conv_embed=use_conv_embed,
use_layerscale=use_layerscale,
layerscale_value=layerscale_value,
use_postln=use_postln,
use_postln_in_modulation=use_postln_in_modulation,
normalize_modulator=normalize_modulator,
use_checkpoint=use_checkpoint)
self.layers.append(layer)
num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
self.num_features = num_features
# add a norm layer for each output
for i_layer in out_indices:
layer = norm_layer(num_features[i_layer])
layer_name = f'norm{i_layer}'
self.add_sublayer(layer_name, layer)
self.apply(self._init_weights)
self._freeze_stages()
if pretrained:
if 'http' in pretrained: #URL
path = paddle.utils.download.get_weights_path_from_url(
pretrained)
else: #model in local path
path = pretrained
self.set_state_dict(paddle.load(path))
def _freeze_stages(self):
if self.frozen_stages >= 0:
self.patch_embed.eval()
for param in self.patch_embed.parameters():
param.stop_gradient = True
if self.frozen_stages >= 2:
self.pos_drop.eval()
for i in range(0, self.frozen_stages - 1):
m = self.layers[i]
m.eval()
for param in m.parameters():
param.stop_gradient = True
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight)
if isinstance(m, nn.Linear) and m.bias is not None:
zeros_(m.bias)
elif isinstance(m, nn.LayerNorm):
zeros_(m.bias)
ones_(m.weight)
def forward(self, x):
x = self.patch_embed(x['image'])
B, _, Wh, Ww = x.shape
x = x.flatten(2).transpose([0, 2, 1])
x = self.pos_drop(x)
outs = []
for i in range(self.num_layers):
layer = self.layers[i]
x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
if i in self.out_indices:
norm_layer = getattr(self, f'norm{i}')
x_out = norm_layer(x_out)
out = x_out.reshape([-1, H, W, self.num_features[i]]).transpose(
(0, 3, 1, 2))
outs.append(out)
return outs
@property
def out_shape(self):
out_strides = [4, 8, 16, 32]
return [
ShapeSpec(
channels=self.num_features[i], stride=out_strides[i])
for i in self.out_indices
]
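# Minimal usage sketch (hedged; arch and input size are assumptions;
# pretrained='' skips the pretrained-weight download). Like the other
# ppdet backbones, forward() takes a dict with an 'image' key and returns
# one NCHW feature map per entry in out_indices, at strides 4/8/16/32
# with num_features = [96, 192, 384, 768] for the tiny variant.
#
#   net = FocalNet(arch='focalnet_T_224_1k_srf', pretrained='')
#   feats = net({'image': paddle.randn([1, 3, 224, 224])})
#   # feats[0].shape -> [1, 96, 56, 56]; feats[1].shape -> [1, 192, 28, 28]; ...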
================================================
FILE: ppdet/modeling/backbones/ghostnet.py
================================================
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
from paddle import ParamAttr
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import AdaptiveAvgPool2D, Linear
from paddle.nn.initializer import Uniform
from ppdet.core.workspace import register, serializable
from numbers import Integral
from ..shape_spec import ShapeSpec
from .mobilenet_v3 import make_divisible, ConvBNLayer
__all__ = ['GhostNet']
class ExtraBlockDW(nn.Layer):
def __init__(self,
in_c,
ch_1,
ch_2,
stride,
lr_mult,
conv_decay=0.,
norm_type='bn',
norm_decay=0.,
freeze_norm=False,
name=None):
super(ExtraBlockDW, self).__init__()
self.pointwise_conv = ConvBNLayer(
in_c=in_c,
out_c=ch_1,
filter_size=1,
stride=1,
padding=0,
act='relu6',
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_extra1")
self.depthwise_conv = ConvBNLayer(
in_c=ch_1,
out_c=ch_2,
filter_size=3,
stride=stride,
            padding=1,
num_groups=int(ch_1),
act='relu6',
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_extra2_dw")
self.normal_conv = ConvBNLayer(
in_c=ch_2,
out_c=ch_2,
filter_size=1,
stride=1,
padding=0,
act='relu6',
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_extra2_sep")
def forward(self, inputs):
x = self.pointwise_conv(inputs)
x = self.depthwise_conv(x)
x = self.normal_conv(x)
return x
class SEBlock(nn.Layer):
def __init__(self, num_channels, lr_mult, reduction_ratio=4, name=None):
super(SEBlock, self).__init__()
self.pool2d_gap = AdaptiveAvgPool2D(1)
self._num_channels = num_channels
stdv = 1.0 / math.sqrt(num_channels * 1.0)
med_ch = num_channels // reduction_ratio
self.squeeze = Linear(
num_channels,
med_ch,
weight_attr=ParamAttr(
learning_rate=lr_mult, initializer=Uniform(-stdv, stdv)),
bias_attr=ParamAttr(learning_rate=lr_mult))
stdv = 1.0 / math.sqrt(med_ch * 1.0)
self.excitation = Linear(
med_ch,
num_channels,
weight_attr=ParamAttr(
learning_rate=lr_mult, initializer=Uniform(-stdv, stdv)),
bias_attr=ParamAttr(learning_rate=lr_mult))
def forward(self, inputs):
pool = self.pool2d_gap(inputs)
pool = paddle.squeeze(pool, axis=[2, 3])
squeeze = self.squeeze(pool)
squeeze = F.relu(squeeze)
excitation = self.excitation(squeeze)
excitation = paddle.clip(x=excitation, min=0, max=1)
excitation = paddle.unsqueeze(excitation, axis=[2, 3])
out = paddle.multiply(inputs, excitation)
return out
class GhostModule(nn.Layer):
def __init__(self,
in_channels,
output_channels,
kernel_size=1,
ratio=2,
dw_size=3,
stride=1,
relu=True,
lr_mult=1.,
conv_decay=0.,
norm_type='bn',
norm_decay=0.,
freeze_norm=False,
name=None):
super(GhostModule, self).__init__()
init_channels = int(math.ceil(output_channels / ratio))
new_channels = int(init_channels * (ratio - 1))
self.primary_conv = ConvBNLayer(
in_c=in_channels,
out_c=init_channels,
filter_size=kernel_size,
stride=stride,
padding=int((kernel_size - 1) // 2),
num_groups=1,
act="relu" if relu else None,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_primary_conv")
self.cheap_operation = ConvBNLayer(
in_c=init_channels,
out_c=new_channels,
filter_size=dw_size,
stride=1,
padding=int((dw_size - 1) // 2),
num_groups=init_channels,
act="relu" if relu else None,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_cheap_operation")
def forward(self, inputs):
x = self.primary_conv(inputs)
y = self.cheap_operation(x)
out = paddle.concat([x, y], axis=1)
return out
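# Worked channel arithmetic (illustrative; the instance name and sizes are
# assumptions). With output_channels=64 and ratio=2, primary_conv emits
# init_channels = ceil(64 / 2) = 32 channels and cheap_operation (a
# depthwise conv) emits new_channels = 32 * (2 - 1) = 32 more, so the
# concat restores 64 channels while only half of them come from a full
# convolution.
#
#   gm = GhostModule(in_channels=16, output_channels=64, name="gm_demo")
#   y = gm(paddle.randn([1, 16, 32, 32]))  # [1, 64, 32, 32]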
class GhostBottleneck(nn.Layer):
def __init__(self,
in_channels,
hidden_dim,
output_channels,
kernel_size,
stride,
use_se,
lr_mult,
conv_decay=0.,
norm_type='bn',
norm_decay=0.,
freeze_norm=False,
return_list=False,
name=None):
super(GhostBottleneck, self).__init__()
self._stride = stride
self._use_se = use_se
self._num_channels = in_channels
self._output_channels = output_channels
self.return_list = return_list
self.ghost_module_1 = GhostModule(
in_channels=in_channels,
output_channels=hidden_dim,
kernel_size=1,
stride=1,
relu=True,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_ghost_module_1")
if stride == 2:
self.depthwise_conv = ConvBNLayer(
in_c=hidden_dim,
out_c=hidden_dim,
filter_size=kernel_size,
stride=stride,
padding=int((kernel_size - 1) // 2),
num_groups=hidden_dim,
act=None,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name +
"_depthwise_depthwise" # looks strange due to an old typo, will be fixed later.
)
if use_se:
self.se_block = SEBlock(hidden_dim, lr_mult, name=name + "_se")
self.ghost_module_2 = GhostModule(
in_channels=hidden_dim,
output_channels=output_channels,
kernel_size=1,
relu=False,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_ghost_module_2")
if stride != 1 or in_channels != output_channels:
self.shortcut_depthwise = ConvBNLayer(
in_c=in_channels,
out_c=in_channels,
filter_size=kernel_size,
stride=stride,
padding=int((kernel_size - 1) // 2),
num_groups=in_channels,
act=None,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name +
"_shortcut_depthwise_depthwise" # looks strange due to an old typo, will be fixed later.
)
self.shortcut_conv = ConvBNLayer(
in_c=in_channels,
out_c=output_channels,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
act=None,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_shortcut_conv")
def forward(self, inputs):
y = self.ghost_module_1(inputs)
x = y
if self._stride == 2:
x = self.depthwise_conv(x)
if self._use_se:
x = self.se_block(x)
x = self.ghost_module_2(x)
if self._stride == 1 and self._num_channels == self._output_channels:
shortcut = inputs
else:
shortcut = self.shortcut_depthwise(inputs)
shortcut = self.shortcut_conv(shortcut)
x = paddle.add(x=x, y=shortcut)
if self.return_list:
return [y, x]
else:
return x
@register
@serializable
class GhostNet(nn.Layer):
__shared__ = ['norm_type']
def __init__(
self,
scale=1.3,
feature_maps=[6, 12, 15],
with_extra_blocks=False,
extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]],
lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
conv_decay=0.,
norm_type='bn',
norm_decay=0.0,
freeze_norm=False):
super(GhostNet, self).__init__()
if isinstance(feature_maps, Integral):
feature_maps = [feature_maps]
if norm_type == 'sync_bn' and freeze_norm:
raise ValueError(
"The norm_type should not be sync_bn when freeze_norm is True")
self.feature_maps = feature_maps
self.with_extra_blocks = with_extra_blocks
self.extra_block_filters = extra_block_filters
inplanes = 16
self.cfgs = [
# k, t, c, SE, s
[3, 16, 16, 0, 1],
[3, 48, 24, 0, 2],
[3, 72, 24, 0, 1],
[5, 72, 40, 1, 2],
[5, 120, 40, 1, 1],
[3, 240, 80, 0, 2],
[3, 200, 80, 0, 1],
[3, 184, 80, 0, 1],
[3, 184, 80, 0, 1],
[3, 480, 112, 1, 1],
[3, 672, 112, 1, 1],
[5, 672, 160, 1, 2], # SSDLite output
[5, 960, 160, 0, 1],
[5, 960, 160, 1, 1],
[5, 960, 160, 0, 1],
[5, 960, 160, 1, 1]
]
self.scale = scale
conv1_out_ch = int(make_divisible(inplanes * self.scale, 4))
self.conv1 = ConvBNLayer(
in_c=3,
out_c=conv1_out_ch,
filter_size=3,
stride=2,
padding=1,
num_groups=1,
act="relu",
lr_mult=1.,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name="conv1")
# build inverted residual blocks
self._out_channels = []
self.ghost_bottleneck_list = []
idx = 0
inplanes = conv1_out_ch
for k, exp_size, c, use_se, s in self.cfgs:
lr_idx = min(idx // 3, len(lr_mult_list) - 1)
lr_mult = lr_mult_list[lr_idx]
            # for SSD/SSDLite, the first head input is the expansion output
            # (ghost_module_1) of the matching GhostBottleneck, hence return_list
return_list = self.with_extra_blocks and idx + 2 in self.feature_maps
ghost_bottleneck = self.add_sublayer(
"_ghostbottleneck_" + str(idx),
sublayer=GhostBottleneck(
in_channels=inplanes,
hidden_dim=int(make_divisible(exp_size * self.scale, 4)),
output_channels=int(make_divisible(c * self.scale, 4)),
kernel_size=k,
stride=s,
use_se=use_se,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
return_list=return_list,
name="_ghostbottleneck_" + str(idx)))
self.ghost_bottleneck_list.append(ghost_bottleneck)
inplanes = int(make_divisible(c * self.scale, 4))
idx += 1
self._update_out_channels(
int(make_divisible(exp_size * self.scale, 4))
if return_list else inplanes, idx + 1, feature_maps)
if self.with_extra_blocks:
self.extra_block_list = []
extra_out_c = int(make_divisible(self.scale * self.cfgs[-1][1], 4))
lr_idx = min(idx // 3, len(lr_mult_list) - 1)
lr_mult = lr_mult_list[lr_idx]
conv_extra = self.add_sublayer(
"conv" + str(idx + 2),
sublayer=ConvBNLayer(
in_c=inplanes,
out_c=extra_out_c,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
act="relu6",
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name="conv" + str(idx + 2)))
self.extra_block_list.append(conv_extra)
idx += 1
self._update_out_channels(extra_out_c, idx + 1, feature_maps)
for j, block_filter in enumerate(self.extra_block_filters):
in_c = extra_out_c if j == 0 else self.extra_block_filters[j -
1][1]
conv_extra = self.add_sublayer(
"conv" + str(idx + 2),
sublayer=ExtraBlockDW(
in_c,
block_filter[0],
block_filter[1],
stride=2,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name='conv' + str(idx + 2)))
self.extra_block_list.append(conv_extra)
idx += 1
self._update_out_channels(block_filter[1], idx + 1,
feature_maps)
def _update_out_channels(self, channel, feature_idx, feature_maps):
if feature_idx in feature_maps:
self._out_channels.append(channel)
def forward(self, inputs):
x = self.conv1(inputs['image'])
outs = []
for idx, ghost_bottleneck in enumerate(self.ghost_bottleneck_list):
x = ghost_bottleneck(x)
if idx + 2 in self.feature_maps:
if isinstance(x, list):
outs.append(x[0])
x = x[1]
else:
outs.append(x)
if not self.with_extra_blocks:
return outs
for i, block in enumerate(self.extra_block_list):
idx = i + len(self.ghost_bottleneck_list)
x = block(x)
if idx + 2 in self.feature_maps:
outs.append(x)
return outs
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self._out_channels]
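# Minimal usage sketch (hedged; input size is an assumption). With the
# default feature_maps=[6, 12, 15], forward() returns the outputs of the
# bottlenecks whose (idx + 2) matches those indices.
#
#   net = GhostNet(scale=1.3, feature_maps=[6, 12, 15])
#   outs = net({'image': paddle.randn([1, 3, 320, 320])})
#   print([o.shape for o in outs])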
================================================
FILE: ppdet/modeling/backbones/hardnet.py
================================================
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
from ppdet.core.workspace import register
from ..shape_spec import ShapeSpec
__all__ = ['HarDNet']
def ConvLayer(in_channels,
out_channels,
kernel_size=3,
stride=1,
bias_attr=False):
layer = nn.Sequential(
('conv', nn.Conv2D(
in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=kernel_size // 2,
groups=1,
bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels)),
('relu', nn.ReLU6()))
return layer
def DWConvLayer(in_channels,
out_channels,
kernel_size=3,
stride=1,
bias_attr=False):
layer = nn.Sequential(
('dwconv', nn.Conv2D(
in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=1,
groups=out_channels,
bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels)))
return layer
def CombConvLayer(in_channels, out_channels, kernel_size=1, stride=1):
layer = nn.Sequential(
('layer1', ConvLayer(
in_channels, out_channels, kernel_size=kernel_size)),
('layer2', DWConvLayer(
out_channels, out_channels, stride=stride)))
return layer
class HarDBlock(nn.Layer):
def __init__(self,
in_channels,
growth_rate,
grmul,
n_layers,
keepBase=False,
residual_out=False,
dwconv=False):
super().__init__()
self.keepBase = keepBase
self.links = []
layers_ = []
self.out_channels = 0
for i in range(n_layers):
outch, inch, link = self.get_link(i + 1, in_channels, growth_rate,
grmul)
self.links.append(link)
if dwconv:
layers_.append(CombConvLayer(inch, outch))
else:
layers_.append(ConvLayer(inch, outch))
if (i % 2 == 0) or (i == n_layers - 1):
self.out_channels += outch
self.layers = nn.LayerList(layers_)
def get_out_ch(self):
return self.out_channels
def get_link(self, layer, base_ch, growth_rate, grmul):
if layer == 0:
return base_ch, 0, []
out_channels = growth_rate
link = []
for i in range(10):
dv = 2**i
if layer % dv == 0:
k = layer - dv
link.append(k)
if i > 0:
out_channels *= grmul
out_channels = int(int(out_channels + 1) / 2) * 2
in_channels = 0
for i in link:
ch, _, _ = self.get_link(i, base_ch, growth_rate, grmul)
in_channels += ch
return out_channels, in_channels, link
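    # Worked example of the log2 connectivity (illustrative, using the first
    # arch-85 block below: base_ch=96, growth_rate=24, grmul=1.7). Layer 4
    # links back to layers 4-1=3, 4-2=2 and 4-4=0, so link=[3, 2, 0]; its
    # out_channels is 24 * 1.7 * 1.7 = 69.36, rounded to the even value 70
    # by the int arithmetic above, and its in_channels is
    # ch(3) + ch(2) + ch(0) = 24 + 40 + 96 = 160.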
def forward(self, x):
layers_ = [x]
for layer in range(len(self.layers)):
link = self.links[layer]
tin = []
for i in link:
tin.append(layers_[i])
if len(tin) > 1:
x = paddle.concat(tin, 1)
else:
x = tin[0]
out = self.layers[layer](x)
layers_.append(out)
t = len(layers_)
out_ = []
for i in range(t):
if (i == 0 and self.keepBase) or (i == t - 1) or (i % 2 == 1):
out_.append(layers_[i])
out = paddle.concat(out_, 1)
return out
@register
class HarDNet(nn.Layer):
def __init__(self, depth_wise=False, return_idx=[1, 3, 8, 13], arch=85):
super(HarDNet, self).__init__()
assert arch in [68, 85], "HarDNet-{} is not supported.".format(arch)
if arch == 85:
first_ch = [48, 96]
second_kernel = 3
ch_list = [192, 256, 320, 480, 720]
grmul = 1.7
gr = [24, 24, 28, 36, 48]
n_layers = [8, 16, 16, 16, 16]
elif arch == 68:
first_ch = [32, 64]
second_kernel = 3
ch_list = [128, 256, 320, 640]
grmul = 1.7
gr = [14, 16, 20, 40]
n_layers = [8, 16, 16, 16]
else:
raise ValueError("HarDNet-{} is not supported.".format(arch))
self.return_idx = return_idx
self._out_channels = [96, 214, 458, 784]
avg_pool = True
if depth_wise:
second_kernel = 1
avg_pool = False
blks = len(n_layers)
self.base = nn.LayerList([])
# First Layer: Standard Conv3x3, Stride=2
self.base.append(
ConvLayer(
in_channels=3,
out_channels=first_ch[0],
kernel_size=3,
stride=2,
bias_attr=False))
# Second Layer
self.base.append(
ConvLayer(
first_ch[0], first_ch[1], kernel_size=second_kernel))
# Avgpooling or DWConv3x3 downsampling
if avg_pool:
self.base.append(nn.AvgPool2D(kernel_size=3, stride=2, padding=1))
else:
self.base.append(DWConvLayer(first_ch[1], first_ch[1], stride=2))
# Build all HarDNet blocks
ch = first_ch[1]
for i in range(blks):
blk = HarDBlock(ch, gr[i], grmul, n_layers[i], dwconv=depth_wise)
ch = blk.out_channels
self.base.append(blk)
if i != blks - 1:
self.base.append(ConvLayer(ch, ch_list[i], kernel_size=1))
ch = ch_list[i]
if i == 0:
self.base.append(
nn.AvgPool2D(
kernel_size=2, stride=2, ceil_mode=True))
elif i != blks - 1 and i != 1 and i != 3:
self.base.append(nn.AvgPool2D(kernel_size=2, stride=2))
def forward(self, inputs):
x = inputs['image']
outs = []
for i, layer in enumerate(self.base):
x = layer(x)
if i in self.return_idx:
outs.append(x)
return outs
@property
def out_shape(self):
return [ShapeSpec(channels=self._out_channels[i]) for i in range(4)]
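# Minimal usage sketch (hedged; input size is an assumption). The default
# return_idx=[1, 3, 8, 13] picks the four stages whose channel counts are
# the hardcoded _out_channels above.
#
#   net = HarDNet(arch=85)
#   outs = net({'image': paddle.randn([1, 3, 512, 512])})
#   # [o.shape[1] for o in outs] -> [96, 214, 458, 784]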
================================================
FILE: ppdet/modeling/backbones/hgnet_v2.py
================================================
# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import KaimingNormal, Constant
from paddle.nn import Conv2D, BatchNorm2D, ReLU, AdaptiveAvgPool2D, MaxPool2D
from paddle.regularizer import L2Decay
from paddle import ParamAttr
import copy
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
__all__ = ['PPHGNetV2']
kaiming_normal_ = KaimingNormal()
zeros_ = Constant(value=0.)
ones_ = Constant(value=1.)
class LearnableAffineBlock(nn.Layer):
def __init__(self,
scale_value=1.0,
bias_value=0.0,
lr_mult=1.0,
lab_lr=0.01):
super().__init__()
self.scale = self.create_parameter(
shape=[1, ],
default_initializer=Constant(value=scale_value),
attr=ParamAttr(learning_rate=lr_mult * lab_lr))
self.add_parameter("scale", self.scale)
self.bias = self.create_parameter(
shape=[1, ],
default_initializer=Constant(value=bias_value),
attr=ParamAttr(learning_rate=lr_mult * lab_lr))
self.add_parameter("bias", self.bias)
def forward(self, x):
return self.scale * x + self.bias
class ConvBNAct(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size=3,
stride=1,
padding=1,
groups=1,
use_act=True,
use_lab=False,
lr_mult=1.0):
super().__init__()
self.use_act = use_act
self.use_lab = use_lab
self.conv = Conv2D(
in_channels,
out_channels,
kernel_size,
stride,
padding=padding
if isinstance(padding, str) else (kernel_size - 1) // 2,
groups=groups,
weight_attr=ParamAttr(learning_rate=lr_mult),
bias_attr=False)
self.bn = BatchNorm2D(
out_channels,
weight_attr=ParamAttr(
regularizer=L2Decay(0.0), learning_rate=lr_mult),
bias_attr=ParamAttr(
regularizer=L2Decay(0.0), learning_rate=lr_mult))
if self.use_act:
self.act = ReLU()
if self.use_lab:
self.lab = LearnableAffineBlock(lr_mult=lr_mult)
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
if self.use_act:
x = self.act(x)
if self.use_lab:
x = self.lab(x)
return x
class LightConvBNAct(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
groups=1,
use_lab=False,
lr_mult=1.0):
super().__init__()
self.conv1 = ConvBNAct(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
use_act=False,
use_lab=use_lab,
lr_mult=lr_mult)
self.conv2 = ConvBNAct(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=kernel_size,
groups=out_channels,
use_act=True,
use_lab=use_lab,
lr_mult=lr_mult)
def forward(self, x):
x = self.conv1(x)
x = self.conv2(x)
return x
class StemBlock(nn.Layer):
def __init__(self,
in_channels,
mid_channels,
out_channels,
use_lab=False,
lr_mult=1.0):
super().__init__()
self.stem1 = ConvBNAct(
in_channels=in_channels,
out_channels=mid_channels,
kernel_size=3,
stride=2,
use_lab=use_lab,
lr_mult=lr_mult)
self.stem2a = ConvBNAct(
in_channels=mid_channels,
out_channels=mid_channels // 2,
kernel_size=2,
stride=1,
padding="SAME",
use_lab=use_lab,
lr_mult=lr_mult)
self.stem2b = ConvBNAct(
in_channels=mid_channels // 2,
out_channels=mid_channels,
kernel_size=2,
stride=1,
padding="SAME",
use_lab=use_lab,
lr_mult=lr_mult)
self.stem3 = ConvBNAct(
in_channels=mid_channels * 2,
out_channels=mid_channels,
kernel_size=3,
stride=2,
use_lab=use_lab,
lr_mult=lr_mult)
self.stem4 = ConvBNAct(
in_channels=mid_channels,
out_channels=out_channels,
kernel_size=1,
stride=1,
use_lab=use_lab,
lr_mult=lr_mult)
self.pool = nn.MaxPool2D(
kernel_size=2, stride=1, ceil_mode=True, padding="SAME")
def forward(self, x):
x = self.stem1(x)
x2 = self.stem2a(x)
x2 = self.stem2b(x2)
x1 = self.pool(x)
x = paddle.concat([x1, x2], 1)
x = self.stem3(x)
x = self.stem4(x)
return x
class HG_Block(nn.Layer):
def __init__(self,
in_channels,
mid_channels,
out_channels,
kernel_size=3,
layer_num=6,
identity=False,
light_block=True,
use_lab=False,
lr_mult=1.0):
super().__init__()
self.identity = identity
self.layers = nn.LayerList()
        block_class = LightConvBNAct if light_block else ConvBNAct
        for i in range(layer_num):
            self.layers.append(
                block_class(
                    in_channels=in_channels if i == 0 else mid_channels,
                    out_channels=mid_channels,
                    stride=1,
                    kernel_size=kernel_size,
                    use_lab=use_lab,
                    lr_mult=lr_mult))
# feature aggregation
total_channels = in_channels + layer_num * mid_channels
self.aggregation_squeeze_conv = ConvBNAct(
in_channels=total_channels,
out_channels=out_channels // 2,
kernel_size=1,
stride=1,
use_lab=use_lab,
lr_mult=lr_mult)
self.aggregation_excitation_conv = ConvBNAct(
in_channels=out_channels // 2,
out_channels=out_channels,
kernel_size=1,
stride=1,
use_lab=use_lab,
lr_mult=lr_mult)
def forward(self, x):
identity = x
output = []
output.append(x)
for layer in self.layers:
x = layer(x)
output.append(x)
x = paddle.concat(output, axis=1)
x = self.aggregation_squeeze_conv(x)
x = self.aggregation_excitation_conv(x)
if self.identity:
x += identity
return x
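# Channel arithmetic sketch (illustrative, using stage1 of the 'L' config
# below: in=48, mid=48, out=128, layer_num=6). The concat sees
# total_channels = 48 + 6 * 48 = 336 channels, which the two 1x1
# aggregation convs squeeze to 128 // 2 = 64 and then expand to 128.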
class HG_Stage(nn.Layer):
def __init__(self,
in_channels,
mid_channels,
out_channels,
block_num,
layer_num=6,
downsample=True,
light_block=True,
kernel_size=3,
use_lab=False,
lr_mult=1.0):
super().__init__()
self.downsample = downsample
if downsample:
self.downsample = ConvBNAct(
in_channels=in_channels,
out_channels=in_channels,
kernel_size=3,
stride=2,
groups=in_channels,
use_act=False,
use_lab=use_lab,
lr_mult=lr_mult)
blocks_list = []
for i in range(block_num):
blocks_list.append(
HG_Block(
in_channels=in_channels if i == 0 else out_channels,
mid_channels=mid_channels,
out_channels=out_channels,
kernel_size=kernel_size,
layer_num=layer_num,
identity=False if i == 0 else True,
light_block=light_block,
use_lab=use_lab,
lr_mult=lr_mult))
self.blocks = nn.Sequential(*blocks_list)
def forward(self, x):
if self.downsample:
x = self.downsample(x)
x = self.blocks(x)
return x
def _freeze_norm(m: nn.BatchNorm2D):
param_attr = ParamAttr(
learning_rate=0., regularizer=L2Decay(0.), trainable=False)
bias_attr = ParamAttr(
learning_rate=0., regularizer=L2Decay(0.), trainable=False)
global_stats = True
norm = nn.BatchNorm2D(
m._num_features,
weight_attr=param_attr,
bias_attr=bias_attr,
use_global_stats=global_stats)
for param in norm.parameters():
param.stop_gradient = True
return norm
def reset_bn(model: nn.Layer, reset_func=_freeze_norm):
if isinstance(model, nn.BatchNorm2D):
model = reset_func(model)
else:
for name, child in model.named_children():
_child = reset_bn(child, reset_func)
if _child is not child:
setattr(model, name, _child)
return model
@register
@serializable
class PPHGNetV2(nn.Layer):
"""
PPHGNetV2
Args:
        arch (str): Architecture variant, one of 'S', 'M', 'L', 'X' or 'H'.
        use_lab (bool): Whether to use LearnableAffineBlock in the network.
        lr_mult_list (list): Learning rate multipliers for the stem and the four stages.
        return_idx (list): Indices of the stages whose outputs are returned.
        freeze_stem_only (bool): Whether to freeze only the stem when freeze_at >= 0.
        freeze_at (int): Freeze the stem (and stages[0..freeze_at] when
            freeze_stem_only is False) if >= 0; -1 disables freezing.
        freeze_norm (bool): Whether to freeze all BatchNorm2D layers.
    Returns:
        model: nn.Layer. A PPHGNetV2 model configured by the args above.
"""
arch_configs = {
'S': {
'stem_channels': [3, 24, 32],
'stage_config': {
# in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
"stage1": [32, 32, 64, 1, False, False, 3, 3],
"stage2": [64, 48, 256, 1, True, False, 3, 3],
"stage3": [256, 96, 512, 2, True, True, 5, 3],
"stage4": [512, 192, 1024, 1, True, True, 5, 3],
}
},
'M': {
'stem_channels': [3, 24, 32],
'stage_config': {
# in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
"stage1": [32, 32, 96, 1, False, False, 3, 4],
"stage2": [96, 64, 384, 1, True, False, 3, 4],
"stage3": [384, 128, 768, 3, True, True, 5, 4],
"stage4": [768, 256, 1536, 1, True, True, 5, 4],
}
},
'L': {
'stem_channels': [3, 32, 48],
'stage_config': {
# in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
"stage1": [48, 48, 128, 1, False, False, 3, 6],
"stage2": [128, 96, 512, 1, True, False, 3, 6],
"stage3": [512, 192, 1024, 3, True, True, 5, 6],
"stage4": [1024, 384, 2048, 1, True, True, 5, 6],
}
},
'X': {
'stem_channels': [3, 32, 64],
'stage_config': {
# in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
"stage1": [64, 64, 128, 1, False, False, 3, 6],
"stage2": [128, 128, 512, 2, True, False, 3, 6],
"stage3": [512, 256, 1024, 5, True, True, 5, 6],
"stage4": [1024, 512, 2048, 2, True, True, 5, 6],
}
},
'H': {
'stem_channels': [3, 48, 96],
'stage_config': {
# in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
"stage1": [96, 96, 192, 2, False, False, 3, 6],
"stage2": [192, 192, 512, 3, True, False, 3, 6],
"stage3": [512, 384, 1024, 6, True, True, 5, 6],
"stage4": [1024, 768, 2048, 3, True, True, 5, 6],
}
}
}
def __init__(self,
arch,
use_lab=False,
lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
return_idx=[1, 2, 3],
freeze_stem_only=True,
freeze_at=0,
freeze_norm=True):
super().__init__()
self.use_lab = use_lab
self.return_idx = return_idx
stem_channels = self.arch_configs[arch]['stem_channels']
stage_config = self.arch_configs[arch]['stage_config']
self._out_strides = [4, 8, 16, 32]
self._out_channels = [stage_config[k][2] for k in stage_config]
# stem
self.stem = StemBlock(
in_channels=stem_channels[0],
mid_channels=stem_channels[1],
out_channels=stem_channels[2],
use_lab=use_lab,
lr_mult=lr_mult_list[0])
# stages
self.stages = nn.LayerList()
for i, k in enumerate(stage_config):
in_channels, mid_channels, out_channels, block_num, downsample, light_block, kernel_size, layer_num = stage_config[
k]
self.stages.append(
HG_Stage(
in_channels,
mid_channels,
out_channels,
block_num,
layer_num,
downsample,
light_block,
kernel_size,
use_lab,
lr_mult=lr_mult_list[i + 1]))
if freeze_at >= 0:
self._freeze_parameters(self.stem)
if not freeze_stem_only:
for i in range(min(freeze_at + 1, len(self.stages))):
self._freeze_parameters(self.stages[i])
if freeze_norm:
reset_bn(self, reset_func=_freeze_norm)
self._init_weights()
def _freeze_parameters(self, m):
for p in m.parameters():
p.stop_gradient = True
def _init_weights(self):
for m in self.sublayers():
if isinstance(m, nn.Conv2D):
kaiming_normal_(m.weight)
elif isinstance(m, (nn.BatchNorm2D)):
ones_(m.weight)
zeros_(m.bias)
elif isinstance(m, nn.Linear):
zeros_(m.bias)
@property
def out_shape(self):
return [
ShapeSpec(
channels=self._out_channels[i], stride=self._out_strides[i])
for i in self.return_idx
]
def forward(self, inputs):
x = inputs['image']
x = self.stem(x)
outs = []
for idx, stage in enumerate(self.stages):
x = stage(x)
if idx in self.return_idx:
outs.append(x)
return outs
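# Minimal usage sketch (hedged; arch and input size are assumptions).
# With the default return_idx=[1, 2, 3], the 'L' variant yields feature
# maps with 512/1024/2048 channels at strides 8/16/32.
#
#   net = PPHGNetV2(arch='L')
#   outs = net({'image': paddle.randn([1, 3, 640, 640])})
#   # [o.shape[1] for o in outs] -> [512, 1024, 2048]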
================================================
FILE: ppdet/modeling/backbones/hrnet.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import AdaptiveAvgPool2D, Linear
from paddle.regularizer import L2Decay
from paddle import ParamAttr
from paddle.nn.initializer import Normal, Uniform
from numbers import Integral
import math
from ppdet.core.workspace import register
from ..shape_spec import ShapeSpec
__all__ = ['HRNet']
class ConvNormLayer(nn.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size,
stride=1,
norm_type='bn',
norm_groups=32,
use_dcn=False,
norm_momentum=0.9,
norm_decay=0.,
freeze_norm=False,
act=None,
name=None):
super(ConvNormLayer, self).__init__()
assert norm_type in ['bn', 'sync_bn', 'gn']
self.act = act
self.conv = nn.Conv2D(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=False)
norm_lr = 0. if freeze_norm else 1.
param_attr = ParamAttr(
learning_rate=norm_lr, regularizer=L2Decay(norm_decay))
bias_attr = ParamAttr(
learning_rate=norm_lr, regularizer=L2Decay(norm_decay))
global_stats = True if freeze_norm else None
if norm_type in ['bn', 'sync_bn']:
self.norm = nn.BatchNorm2D(
ch_out,
momentum=norm_momentum,
weight_attr=param_attr,
bias_attr=bias_attr,
use_global_stats=global_stats)
elif norm_type == 'gn':
self.norm = nn.GroupNorm(
num_groups=norm_groups,
num_channels=ch_out,
weight_attr=param_attr,
bias_attr=bias_attr)
norm_params = self.norm.parameters()
if freeze_norm:
for param in norm_params:
param.stop_gradient = True
def forward(self, inputs):
out = self.conv(inputs)
out = self.norm(out)
if self.act == 'relu':
out = F.relu(out)
return out
class Layer1(nn.Layer):
def __init__(self,
num_channels,
has_se=False,
norm_momentum=0.9,
norm_decay=0.,
freeze_norm=True,
name=None):
super(Layer1, self).__init__()
self.bottleneck_block_list = []
for i in range(4):
bottleneck_block = self.add_sublayer(
"block_{}_{}".format(name, i + 1),
BottleneckBlock(
num_channels=num_channels if i == 0 else 256,
num_filters=64,
has_se=has_se,
stride=1,
downsample=True if i == 0 else False,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + '_' + str(i + 1)))
self.bottleneck_block_list.append(bottleneck_block)
def forward(self, input):
conv = input
for block_func in self.bottleneck_block_list:
conv = block_func(conv)
return conv
class TransitionLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
norm_momentum=0.9,
norm_decay=0.,
freeze_norm=True,
name=None):
super(TransitionLayer, self).__init__()
num_in = len(in_channels)
num_out = len(out_channels)
out = []
self.conv_bn_func_list = []
for i in range(num_out):
residual = None
if i < num_in:
if in_channels[i] != out_channels[i]:
residual = self.add_sublayer(
"transition_{}_layer_{}".format(name, i + 1),
ConvNormLayer(
ch_in=in_channels[i],
ch_out=out_channels[i],
filter_size=3,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
act='relu',
name=name + '_layer_' + str(i + 1)))
else:
residual = self.add_sublayer(
"transition_{}_layer_{}".format(name, i + 1),
ConvNormLayer(
ch_in=in_channels[-1],
ch_out=out_channels[i],
filter_size=3,
stride=2,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
act='relu',
name=name + '_layer_' + str(i + 1)))
self.conv_bn_func_list.append(residual)
def forward(self, input):
outs = []
for idx, conv_bn_func in enumerate(self.conv_bn_func_list):
if conv_bn_func is None:
outs.append(input[idx])
else:
if idx < len(input):
outs.append(conv_bn_func(input[idx]))
else:
outs.append(conv_bn_func(input[-1]))
return outs
class Branches(nn.Layer):
def __init__(self,
block_num,
in_channels,
out_channels,
has_se=False,
norm_momentum=0.9,
norm_decay=0.,
freeze_norm=True,
name=None):
super(Branches, self).__init__()
self.basic_block_list = []
for i in range(len(out_channels)):
self.basic_block_list.append([])
for j in range(block_num):
in_ch = in_channels[i] if j == 0 else out_channels[i]
basic_block_func = self.add_sublayer(
"bb_{}_branch_layer_{}_{}".format(name, i + 1, j + 1),
BasicBlock(
num_channels=in_ch,
num_filters=out_channels[i],
has_se=has_se,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + '_branch_layer_' + str(i + 1) + '_' +
str(j + 1)))
self.basic_block_list[i].append(basic_block_func)
def forward(self, inputs):
outs = []
for idx, input in enumerate(inputs):
conv = input
basic_block_list = self.basic_block_list[idx]
for basic_block_func in basic_block_list:
conv = basic_block_func(conv)
outs.append(conv)
return outs
class BottleneckBlock(nn.Layer):
def __init__(self,
num_channels,
num_filters,
has_se,
stride=1,
downsample=False,
norm_momentum=0.9,
norm_decay=0.,
freeze_norm=True,
name=None):
super(BottleneckBlock, self).__init__()
self.has_se = has_se
self.downsample = downsample
self.conv1 = ConvNormLayer(
ch_in=num_channels,
ch_out=num_filters,
filter_size=1,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
act="relu",
name=name + "_conv1")
self.conv2 = ConvNormLayer(
ch_in=num_filters,
ch_out=num_filters,
filter_size=3,
stride=stride,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
act="relu",
name=name + "_conv2")
self.conv3 = ConvNormLayer(
ch_in=num_filters,
ch_out=num_filters * 4,
filter_size=1,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
act=None,
name=name + "_conv3")
if self.downsample:
self.conv_down = ConvNormLayer(
ch_in=num_channels,
ch_out=num_filters * 4,
filter_size=1,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
act=None,
name=name + "_downsample")
if self.has_se:
self.se = SELayer(
num_channels=num_filters * 4,
num_filters=num_filters * 4,
reduction_ratio=16,
name='fc' + name)
def forward(self, input):
residual = input
conv1 = self.conv1(input)
conv2 = self.conv2(conv1)
conv3 = self.conv3(conv2)
if self.downsample:
residual = self.conv_down(input)
if self.has_se:
conv3 = self.se(conv3)
y = paddle.add(x=residual, y=conv3)
y = F.relu(y)
return y
class BasicBlock(nn.Layer):
def __init__(self,
num_channels,
num_filters,
stride=1,
has_se=False,
downsample=False,
norm_momentum=0.9,
norm_decay=0.,
freeze_norm=True,
name=None):
super(BasicBlock, self).__init__()
self.has_se = has_se
self.downsample = downsample
self.conv1 = ConvNormLayer(
ch_in=num_channels,
ch_out=num_filters,
filter_size=3,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
stride=stride,
act="relu",
name=name + "_conv1")
self.conv2 = ConvNormLayer(
ch_in=num_filters,
ch_out=num_filters,
filter_size=3,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
stride=1,
act=None,
name=name + "_conv2")
if self.downsample:
self.conv_down = ConvNormLayer(
ch_in=num_channels,
ch_out=num_filters * 4,
filter_size=1,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
act=None,
name=name + "_downsample")
if self.has_se:
self.se = SELayer(
num_channels=num_filters,
num_filters=num_filters,
reduction_ratio=16,
name='fc' + name)
def forward(self, input):
residual = input
conv1 = self.conv1(input)
conv2 = self.conv2(conv1)
if self.downsample:
residual = self.conv_down(input)
if self.has_se:
conv2 = self.se(conv2)
y = paddle.add(x=residual, y=conv2)
y = F.relu(y)
return y
class SELayer(nn.Layer):
def __init__(self, num_channels, num_filters, reduction_ratio, name=None):
super(SELayer, self).__init__()
self.pool2d_gap = AdaptiveAvgPool2D(1)
self._num_channels = num_channels
med_ch = int(num_channels / reduction_ratio)
stdv = 1.0 / math.sqrt(num_channels * 1.0)
self.squeeze = Linear(
num_channels,
med_ch,
weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)))
stdv = 1.0 / math.sqrt(med_ch * 1.0)
self.excitation = Linear(
med_ch,
num_filters,
weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)))
def forward(self, input):
pool = self.pool2d_gap(input)
pool = paddle.squeeze(pool, axis=[2, 3])
squeeze = self.squeeze(pool)
squeeze = F.relu(squeeze)
excitation = self.excitation(squeeze)
excitation = F.sigmoid(excitation)
excitation = paddle.unsqueeze(excitation, axis=[2, 3])
out = input * excitation
return out
class Stage(nn.Layer):
def __init__(self,
num_channels,
num_modules,
num_filters,
has_se=False,
norm_momentum=0.9,
norm_decay=0.,
freeze_norm=True,
multi_scale_output=True,
name=None):
super(Stage, self).__init__()
self._num_modules = num_modules
self.stage_func_list = []
for i in range(num_modules):
if i == num_modules - 1 and not multi_scale_output:
stage_func = self.add_sublayer(
"stage_{}_{}".format(name, i + 1),
HighResolutionModule(
num_channels=num_channels,
num_filters=num_filters,
has_se=has_se,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
multi_scale_output=False,
name=name + '_' + str(i + 1)))
else:
stage_func = self.add_sublayer(
"stage_{}_{}".format(name, i + 1),
HighResolutionModule(
num_channels=num_channels,
num_filters=num_filters,
has_se=has_se,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + '_' + str(i + 1)))
self.stage_func_list.append(stage_func)
def forward(self, input):
out = input
for idx in range(self._num_modules):
out = self.stage_func_list[idx](out)
return out
class HighResolutionModule(nn.Layer):
def __init__(self,
num_channels,
num_filters,
has_se=False,
multi_scale_output=True,
norm_momentum=0.9,
norm_decay=0.,
freeze_norm=True,
name=None):
super(HighResolutionModule, self).__init__()
self.branches_func = Branches(
block_num=4,
in_channels=num_channels,
out_channels=num_filters,
has_se=has_se,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name)
self.fuse_func = FuseLayers(
in_channels=num_filters,
out_channels=num_filters,
multi_scale_output=multi_scale_output,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name)
def forward(self, input):
out = self.branches_func(input)
out = self.fuse_func(out)
return out
class FuseLayers(nn.Layer):
def __init__(self,
in_channels,
out_channels,
multi_scale_output=True,
norm_momentum=0.9,
norm_decay=0.,
freeze_norm=True,
name=None):
super(FuseLayers, self).__init__()
self._actual_ch = len(in_channels) if multi_scale_output else 1
self._in_channels = in_channels
self.residual_func_list = []
for i in range(self._actual_ch):
for j in range(len(in_channels)):
residual_func = None
if j > i:
residual_func = self.add_sublayer(
"residual_{}_layer_{}_{}".format(name, i + 1, j + 1),
ConvNormLayer(
ch_in=in_channels[j],
ch_out=out_channels[i],
filter_size=1,
stride=1,
act=None,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + '_layer_' + str(i + 1) + '_' +
str(j + 1)))
self.residual_func_list.append(residual_func)
elif j < i:
pre_num_filters = in_channels[j]
for k in range(i - j):
if k == i - j - 1:
residual_func = self.add_sublayer(
"residual_{}_layer_{}_{}_{}".format(
name, i + 1, j + 1, k + 1),
ConvNormLayer(
ch_in=pre_num_filters,
ch_out=out_channels[i],
filter_size=3,
stride=2,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
act=None,
name=name + '_layer_' + str(i + 1) + '_' +
str(j + 1) + '_' + str(k + 1)))
pre_num_filters = out_channels[i]
else:
residual_func = self.add_sublayer(
"residual_{}_layer_{}_{}_{}".format(
name, i + 1, j + 1, k + 1),
ConvNormLayer(
ch_in=pre_num_filters,
ch_out=out_channels[j],
filter_size=3,
stride=2,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
act="relu",
name=name + '_layer_' + str(i + 1) + '_' +
str(j + 1) + '_' + str(k + 1)))
pre_num_filters = out_channels[j]
self.residual_func_list.append(residual_func)
def forward(self, input):
outs = []
residual_func_idx = 0
for i in range(self._actual_ch):
residual = input[i]
for j in range(len(self._in_channels)):
if j > i:
y = self.residual_func_list[residual_func_idx](input[j])
residual_func_idx += 1
y = F.interpolate(y, scale_factor=2**(j - i))
residual = paddle.add(x=residual, y=y)
elif j < i:
y = input[j]
for k in range(i - j):
y = self.residual_func_list[residual_func_idx](y)
residual_func_idx += 1
residual = paddle.add(x=residual, y=y)
residual = F.relu(residual)
outs.append(residual)
return outs
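# Fusion sketch (descriptive comment, not original code): for output
# branch i, every lower-resolution input j > i goes through a 1x1 conv
# and is upsampled by 2**(j - i), while every higher-resolution input
# j < i is pushed through (i - j) stride-2 3x3 convs; the results are
# summed with input[i] and passed through ReLU.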
@register
class HRNet(nn.Layer):
"""
HRNet, see https://arxiv.org/abs/1908.07919
Args:
width (int): the width of HRNet
has_se (bool): whether to add SE block for each stage
freeze_at (int): the stage to freeze
freeze_norm (bool): whether to freeze norm in HRNet
norm_momentum (float): momentum of BatchNorm
norm_decay (float): weight decay for normalization layer weights
return_idx (List): the stage to return
        upsample (bool): whether to upsample and concat the backbone feats
        downsample (bool): whether to append the classification-style head that
            fuses the four branches into a single 2048-channel output
"""
def __init__(self,
width=18,
has_se=False,
freeze_at=0,
freeze_norm=True,
norm_momentum=0.9,
norm_decay=0.,
return_idx=[0, 1, 2, 3],
upsample=False,
downsample=False):
super(HRNet, self).__init__()
self.width = width
self.has_se = has_se
if isinstance(return_idx, Integral):
return_idx = [return_idx]
assert len(return_idx) > 0, "need one or more return index"
self.freeze_at = freeze_at
self.return_idx = return_idx
self.upsample = upsample
self.downsample = downsample
self.channels = {
18: [[18, 36], [18, 36, 72], [18, 36, 72, 144]],
30: [[30, 60], [30, 60, 120], [30, 60, 120, 240]],
32: [[32, 64], [32, 64, 128], [32, 64, 128, 256]],
40: [[40, 80], [40, 80, 160], [40, 80, 160, 320]],
44: [[44, 88], [44, 88, 176], [44, 88, 176, 352]],
48: [[48, 96], [48, 96, 192], [48, 96, 192, 384]],
60: [[60, 120], [60, 120, 240], [60, 120, 240, 480]],
64: [[64, 128], [64, 128, 256], [64, 128, 256, 512]]
}
channels_2, channels_3, channels_4 = self.channels[width]
num_modules_2, num_modules_3, num_modules_4 = 1, 4, 3
self._out_channels = [sum(channels_4)] if self.upsample else channels_4
self._out_strides = [4] if self.upsample else [4, 8, 16, 32]
self.conv_layer1_1 = ConvNormLayer(
ch_in=3,
ch_out=64,
filter_size=3,
stride=2,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
act='relu',
name="layer1_1")
self.conv_layer1_2 = ConvNormLayer(
ch_in=64,
ch_out=64,
filter_size=3,
stride=2,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
act='relu',
name="layer1_2")
self.la1 = Layer1(
num_channels=64,
has_se=has_se,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name="layer2")
self.tr1 = TransitionLayer(
in_channels=[256],
out_channels=channels_2,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name="tr1")
self.st2 = Stage(
num_channels=channels_2,
num_modules=num_modules_2,
num_filters=channels_2,
has_se=self.has_se,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name="st2")
self.tr2 = TransitionLayer(
in_channels=channels_2,
out_channels=channels_3,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name="tr2")
self.st3 = Stage(
num_channels=channels_3,
num_modules=num_modules_3,
num_filters=channels_3,
has_se=self.has_se,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name="st3")
self.tr3 = TransitionLayer(
in_channels=channels_3,
out_channels=channels_4,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name="tr3")
self.st4 = Stage(
num_channels=channels_4,
num_modules=num_modules_4,
num_filters=channels_4,
has_se=self.has_se,
norm_momentum=norm_momentum,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
multi_scale_output=len(return_idx) > 1,
name="st4")
if self.downsample:
self.incre_modules, self.downsamp_modules, \
self.final_layer = self._make_head(channels_4, norm_momentum=norm_momentum, has_se=self.has_se)
def _make_layer(self,
block,
inplanes,
planes,
blocks,
stride=1,
norm_momentum=0.9,
has_se=False,
name=None):
downsample = None
if stride != 1 or inplanes != planes * 4:
downsample = True
layers = []
layers.append(
block(
inplanes,
planes,
has_se,
stride,
downsample,
norm_momentum=norm_momentum,
freeze_norm=False,
name=name + "_s0"))
inplanes = planes * 4
for i in range(1, blocks):
layers.append(
block(
inplanes,
planes,
has_se,
norm_momentum=norm_momentum,
freeze_norm=False,
name=name + "_s" + str(i)))
return nn.Sequential(*layers)
def _make_head(self, pre_stage_channels, norm_momentum=0.9, has_se=False):
head_block = BottleneckBlock
head_channels = [32, 64, 128, 256]
# Increasing the #channels on each resolution
# from C, 2C, 4C, 8C to 128, 256, 512, 1024
incre_modules = []
for i, channels in enumerate(pre_stage_channels):
incre_module = self._make_layer(
head_block,
channels,
head_channels[i],
1,
stride=1,
norm_momentum=norm_momentum,
has_se=has_se,
name='incre' + str(i))
incre_modules.append(incre_module)
incre_modules = nn.LayerList(incre_modules)
# downsampling modules
downsamp_modules = []
for i in range(len(pre_stage_channels) - 1):
in_channels = head_channels[i] * 4
out_channels = head_channels[i + 1] * 4
downsamp_module = nn.Sequential(
nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
stride=2,
padding=1),
nn.BatchNorm2D(
out_channels, momentum=norm_momentum),
nn.ReLU())
downsamp_modules.append(downsamp_module)
downsamp_modules = nn.LayerList(downsamp_modules)
final_layer = nn.Sequential(
nn.Conv2D(
in_channels=head_channels[3] * 4,
out_channels=2048,
kernel_size=1,
stride=1,
padding=0),
nn.BatchNorm2D(
2048, momentum=norm_momentum),
nn.ReLU())
return incre_modules, downsamp_modules, final_layer
def forward(self, inputs):
x = inputs['image']
conv1 = self.conv_layer1_1(x)
conv2 = self.conv_layer1_2(conv1)
la1 = self.la1(conv2)
tr1 = self.tr1([la1])
st2 = self.st2(tr1)
tr2 = self.tr2(st2)
st3 = self.st3(tr2)
tr3 = self.tr3(st3)
st4 = self.st4(tr3)
if self.upsample:
# Upsampling
x0_h, x0_w = st4[0].shape[2:4]
x1 = F.upsample(st4[1], size=(x0_h, x0_w), mode='bilinear')
x2 = F.upsample(st4[2], size=(x0_h, x0_w), mode='bilinear')
x3 = F.upsample(st4[3], size=(x0_h, x0_w), mode='bilinear')
x = paddle.concat([st4[0], x1, x2, x3], 1)
return x
if self.downsample:
y = self.incre_modules[0](st4[0])
for i in range(len(self.downsamp_modules)):
y = self.incre_modules[i+1](st4[i+1]) + \
self.downsamp_modules[i](y)
y = self.final_layer(y)
return y
res = []
for i, layer in enumerate(st4):
if i == self.freeze_at:
layer.stop_gradient = True
if i in self.return_idx:
res.append(layer)
return res
@property
def out_shape(self):
if self.upsample:
self.return_idx = [0]
return [
ShapeSpec(
channels=self._out_channels[i], stride=self._out_strides[i])
for i in self.return_idx
]
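# Worked example (an illustrative addition, not part of the upstream file):
# `width` indexes the channel table above. For width=18 the stage-4 branches
# carry [18, 36, 72, 144] channels. With `upsample=True` all four maps are
# resized to the stride-4 resolution and concatenated, so the single output
# has sum(channels_4) channels, matching `_out_channels = [sum(channels_4)]`;
# with `downsample=True` the head instead funnels everything into a single
# 2048-channel, stride-32 map via `_make_head`.
if __name__ == "__main__":
    channels_4 = [18, 36, 72, 144]  # the width=18 row of the table above
    print("upsample output channels:", sum(channels_4))  # 270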
================================================
FILE: ppdet/modeling/backbones/lcnet.py
================================================
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.nn import Conv2D
from paddle.regularizer import L2Decay
from paddle.nn.initializer import KaimingNormal
from ppdet.core.workspace import register, serializable
from numbers import Integral
from ..shape_spec import ShapeSpec
__all__ = ['LCNet']
NET_CONFIG = {
"blocks2":
#k, in_c, out_c, s, use_se
[[3, 16, 32, 1, False], ],
"blocks3": [
[3, 32, 64, 2, False],
[3, 64, 64, 1, False],
],
"blocks4": [
[3, 64, 128, 2, False],
[3, 128, 128, 1, False],
],
"blocks5": [
[3, 128, 256, 2, False],
[5, 256, 256, 1, False],
[5, 256, 256, 1, False],
[5, 256, 256, 1, False],
[5, 256, 256, 1, False],
[5, 256, 256, 1, False],
],
"blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]]
}
def make_divisible(v, divisor=8, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
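# Quick numeric check (an illustrative addition): make_divisible rounds a
# scaled channel count to the nearest multiple of `divisor` (ties round up)
# and never returns less than 90% of the requested value.
if __name__ == "__main__":
    assert make_divisible(4.0) == 8      # clamped up to min_value
    assert make_divisible(12) == 16      # 12 + 4 -> 16, ties round up
    assert make_divisible(89.25) == 88   # nearest multiple of 8, above 90%
    print("make_divisible checks passed")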
class ConvBNLayer(nn.Layer):
def __init__(self,
num_channels,
filter_size,
num_filters,
stride,
num_groups=1,
act='hard_swish'):
super().__init__()
self.conv = Conv2D(
in_channels=num_channels,
out_channels=num_filters,
kernel_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=num_groups,
weight_attr=ParamAttr(initializer=KaimingNormal()),
bias_attr=False)
self.bn = nn.BatchNorm2D(
num_filters,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
if act == 'hard_swish':
self.act = nn.Hardswish()
elif act == 'relu6':
self.act = nn.ReLU6()
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.act(x)
return x
class DepthwiseSeparable(nn.Layer):
def __init__(self,
num_channels,
num_filters,
stride,
dw_size=3,
use_se=False,
act='hard_swish'):
super().__init__()
self.use_se = use_se
self.dw_conv = ConvBNLayer(
num_channels=num_channels,
num_filters=num_channels,
filter_size=dw_size,
stride=stride,
num_groups=num_channels,
act=act)
if use_se:
self.se = SEModule(num_channels)
self.pw_conv = ConvBNLayer(
num_channels=num_channels,
filter_size=1,
num_filters=num_filters,
stride=1,
act=act)
def forward(self, x):
x = self.dw_conv(x)
if self.use_se:
x = self.se(x)
x = self.pw_conv(x)
return x
class AdaptiveAvgPool2D(nn.AdaptiveAvgPool2D):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if paddle.device.get_device().startswith("npu"):
self.device = "npu"
else:
self.device = None
if isinstance(self._output_size, int) and self._output_size == 1:
self._gap = True
elif isinstance(self._output_size, tuple) and self._output_size[
0] == 1 and self._output_size[1] == 1:
self._gap = True
else:
self._gap = False
def forward(self, x):
if self.device == "npu" and self._gap:
# Global Average Pooling
N, C, _, _ = x.shape
x_mean = paddle.mean(x, axis=[2, 3])
x_mean = paddle.reshape(x_mean, [N, C, 1, 1])
return x_mean
else:
return super(AdaptiveAvgPool2D, self).forward(x)
class SEModule(nn.Layer):
def __init__(self, channel, reduction=4):
super().__init__()
self.avg_pool = AdaptiveAvgPool2D(1)
self.conv1 = Conv2D(
in_channels=channel,
out_channels=channel // reduction,
kernel_size=1,
stride=1,
padding=0)
self.relu = nn.ReLU()
self.conv2 = Conv2D(
in_channels=channel // reduction,
out_channels=channel,
kernel_size=1,
stride=1,
padding=0)
self.hardsigmoid = nn.Hardsigmoid()
def forward(self, x):
identity = x
x = self.avg_pool(x)
x = self.conv1(x)
x = self.relu(x)
x = self.conv2(x)
x = self.hardsigmoid(x)
x = paddle.multiply(x=identity, y=x)
return x
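# Usage sketch (an illustrative addition): SEModule is a squeeze-and-excite
# gate -- global average pool, two 1x1 convs, hard-sigmoid -- that rescales
# channels without changing the tensor shape. The input sizes below are
# arbitrary assumptions; runnable via `python -m ppdet.modeling.backbones.lcnet`.
if __name__ == "__main__":
    se = SEModule(channel=64)
    y = se(paddle.rand([2, 64, 10, 10]))
    print(y.shape)  # [2, 64, 10, 10]: per-channel reweighting only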
@register
@serializable
class LCNet(nn.Layer):
def __init__(self, scale=1.0, feature_maps=[3, 4, 5], act='hard_swish'):
super().__init__()
self.scale = scale
self.feature_maps = feature_maps
out_channels = []
self.conv1 = ConvBNLayer(
num_channels=3,
filter_size=3,
num_filters=make_divisible(16 * scale),
stride=2,
act=act)
        self.blocks2 = nn.Sequential(*[
DepthwiseSeparable(
num_channels=make_divisible(in_c * scale),
num_filters=make_divisible(out_c * scale),
dw_size=k,
stride=s,
use_se=se,
act=act)
for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"])
])
        self.blocks3 = nn.Sequential(*[
DepthwiseSeparable(
num_channels=make_divisible(in_c * scale),
num_filters=make_divisible(out_c * scale),
dw_size=k,
stride=s,
use_se=se,
act=act)
for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks3"])
])
out_channels.append(
make_divisible(NET_CONFIG["blocks3"][-1][2] * scale))
        self.blocks4 = nn.Sequential(*[
DepthwiseSeparable(
num_channels=make_divisible(in_c * scale),
num_filters=make_divisible(out_c * scale),
dw_size=k,
stride=s,
use_se=se,
act=act)
for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks4"])
])
out_channels.append(
make_divisible(NET_CONFIG["blocks4"][-1][2] * scale))
        self.blocks5 = nn.Sequential(*[
DepthwiseSeparable(
num_channels=make_divisible(in_c * scale),
num_filters=make_divisible(out_c * scale),
dw_size=k,
stride=s,
use_se=se,
act=act)
for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks5"])
])
out_channels.append(
make_divisible(NET_CONFIG["blocks5"][-1][2] * scale))
        self.blocks6 = nn.Sequential(*[
DepthwiseSeparable(
num_channels=make_divisible(in_c * scale),
num_filters=make_divisible(out_c * scale),
dw_size=k,
stride=s,
use_se=se,
act=act)
for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks6"])
])
out_channels.append(
make_divisible(NET_CONFIG["blocks6"][-1][2] * scale))
self._out_channels = [
ch for idx, ch in enumerate(out_channels) if idx + 2 in feature_maps
]
def forward(self, inputs):
x = inputs['image']
outs = []
x = self.conv1(x)
x = self.blocks2(x)
x = self.blocks3(x)
outs.append(x)
x = self.blocks4(x)
outs.append(x)
x = self.blocks5(x)
outs.append(x)
x = self.blocks6(x)
outs.append(x)
outs = [o for i, o in enumerate(outs) if i + 2 in self.feature_maps]
return outs
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self._out_channels]
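# Usage sketch (an illustrative addition; the input dict mirrors how ppdet
# feeds backbones, and the 320x320 size is an arbitrary assumption):
if __name__ == "__main__":
    model = LCNet(scale=1.0, feature_maps=[3, 4, 5])
    feats = model({'image': paddle.rand([1, 3, 320, 320])})
    # feature_maps=[3, 4, 5] selects strides 8/16/32, so expect
    # [1, 128, 40, 40], [1, 256, 20, 20], [1, 512, 10, 10] at scale=1.0.
    print([f.shape for f in feats])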
================================================
FILE: ppdet/modeling/backbones/lite_hrnet.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on
https://github.com/HRNet/Lite-HRNet/blob/hrnet/models/backbones/litehrnet.py
"""
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from numbers import Integral
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Normal, Constant
from ppdet.core.workspace import register
from ppdet.modeling.shape_spec import ShapeSpec
from ppdet.modeling.ops import channel_shuffle
from .. import layers as L
__all__ = ['LiteHRNet']
class ConvNormLayer(nn.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size,
stride=1,
groups=1,
norm_type=None,
norm_groups=32,
norm_decay=0.,
freeze_norm=False,
act=None):
super(ConvNormLayer, self).__init__()
self.act = act
norm_lr = 0. if freeze_norm else 1.
if norm_type is not None:
assert norm_type in ['bn', 'sync_bn', 'gn'], \
"norm_type should be one of ['bn', 'sync_bn', 'gn'], but got {}".format(norm_type)
param_attr = ParamAttr(
initializer=Constant(1.0),
learning_rate=norm_lr,
regularizer=L2Decay(norm_decay), )
bias_attr = ParamAttr(
learning_rate=norm_lr, regularizer=L2Decay(norm_decay))
global_stats = True if freeze_norm else None
if norm_type in ['bn', 'sync_bn']:
self.norm = nn.BatchNorm2D(
ch_out,
weight_attr=param_attr,
bias_attr=bias_attr,
use_global_stats=global_stats, )
elif norm_type == 'gn':
self.norm = nn.GroupNorm(
num_groups=norm_groups,
num_channels=ch_out,
weight_attr=param_attr,
bias_attr=bias_attr)
norm_params = self.norm.parameters()
if freeze_norm:
for param in norm_params:
param.stop_gradient = True
conv_bias_attr = False
else:
conv_bias_attr = True
self.norm = None
self.conv = nn.Conv2D(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.001)),
bias_attr=conv_bias_attr)
def forward(self, inputs):
out = self.conv(inputs)
if self.norm is not None:
out = self.norm(out)
if self.act == 'relu':
out = F.relu(out)
elif self.act == 'sigmoid':
out = F.sigmoid(out)
return out
class DepthWiseSeparableConvNormLayer(nn.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size,
stride=1,
dw_norm_type=None,
pw_norm_type=None,
norm_decay=0.,
freeze_norm=False,
dw_act=None,
pw_act=None):
super(DepthWiseSeparableConvNormLayer, self).__init__()
self.depthwise_conv = ConvNormLayer(
ch_in=ch_in,
ch_out=ch_in,
filter_size=filter_size,
stride=stride,
groups=ch_in,
norm_type=dw_norm_type,
act=dw_act,
norm_decay=norm_decay,
freeze_norm=freeze_norm, )
self.pointwise_conv = ConvNormLayer(
ch_in=ch_in,
ch_out=ch_out,
filter_size=1,
stride=1,
norm_type=pw_norm_type,
act=pw_act,
norm_decay=norm_decay,
freeze_norm=freeze_norm, )
def forward(self, x):
x = self.depthwise_conv(x)
x = self.pointwise_conv(x)
return x
class CrossResolutionWeightingModule(nn.Layer):
def __init__(self,
channels,
ratio=16,
norm_type='bn',
freeze_norm=False,
norm_decay=0.):
super(CrossResolutionWeightingModule, self).__init__()
self.channels = channels
total_channel = sum(channels)
self.conv1 = ConvNormLayer(
ch_in=total_channel,
ch_out=total_channel // ratio,
filter_size=1,
stride=1,
norm_type=norm_type,
act='relu',
freeze_norm=freeze_norm,
norm_decay=norm_decay)
self.conv2 = ConvNormLayer(
ch_in=total_channel // ratio,
ch_out=total_channel,
filter_size=1,
stride=1,
norm_type=norm_type,
act='sigmoid',
freeze_norm=freeze_norm,
norm_decay=norm_decay)
def forward(self, x):
mini_size = x[-1].shape[-2:]
out = [F.adaptive_avg_pool2d(s, mini_size) for s in x[:-1]] + [x[-1]]
out = paddle.concat(out, 1)
out = self.conv1(out)
out = self.conv2(out)
out = paddle.split(out, self.channels, 1)
out = [
s * F.interpolate(
a, s.shape[-2:], mode='nearest') for s, a in zip(x, out)
]
return out
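# Shape walk-through (an illustrative addition; sizes are arbitrary
# assumptions): with branches of 20 and 40 channels, the higher-resolution map
# is average-pooled down to the lowest resolution, both are concatenated
# (60 channels), squeezed to 60 // ratio channels, expanded back through a
# sigmoid, split per branch, and upsampled so each branch is gated at its own
# resolution. Shapes are preserved throughout.
if __name__ == "__main__":
    crw = CrossResolutionWeightingModule([20, 40], ratio=4)
    x = [paddle.rand([2, 20, 32, 32]), paddle.rand([2, 40, 16, 16])]
    print([t.shape for t in crw(x)])  # [[2, 20, 32, 32], [2, 40, 16, 16]]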
class SpatialWeightingModule(nn.Layer):
def __init__(self, in_channel, ratio=16, freeze_norm=False, norm_decay=0.):
super(SpatialWeightingModule, self).__init__()
self.global_avgpooling = nn.AdaptiveAvgPool2D(1)
self.conv1 = ConvNormLayer(
ch_in=in_channel,
ch_out=in_channel // ratio,
filter_size=1,
stride=1,
act='relu',
freeze_norm=freeze_norm,
norm_decay=norm_decay)
self.conv2 = ConvNormLayer(
ch_in=in_channel // ratio,
ch_out=in_channel,
filter_size=1,
stride=1,
act='sigmoid',
freeze_norm=freeze_norm,
norm_decay=norm_decay)
def forward(self, x):
out = self.global_avgpooling(x)
out = self.conv1(out)
out = self.conv2(out)
return x * out
class ConditionalChannelWeightingBlock(nn.Layer):
def __init__(self,
in_channels,
stride,
reduce_ratio,
norm_type='bn',
freeze_norm=False,
norm_decay=0.):
super(ConditionalChannelWeightingBlock, self).__init__()
assert stride in [1, 2]
branch_channels = [channel // 2 for channel in in_channels]
self.cross_resolution_weighting = CrossResolutionWeightingModule(
branch_channels,
ratio=reduce_ratio,
norm_type=norm_type,
freeze_norm=freeze_norm,
norm_decay=norm_decay)
self.depthwise_convs = nn.LayerList([
ConvNormLayer(
channel,
channel,
filter_size=3,
stride=stride,
groups=channel,
norm_type=norm_type,
freeze_norm=freeze_norm,
norm_decay=norm_decay) for channel in branch_channels
])
self.spatial_weighting = nn.LayerList([
SpatialWeightingModule(
channel,
ratio=4,
freeze_norm=freeze_norm,
norm_decay=norm_decay) for channel in branch_channels
])
def forward(self, x):
x = [s.chunk(2, axis=1) for s in x]
x1 = [s[0] for s in x]
x2 = [s[1] for s in x]
x2 = self.cross_resolution_weighting(x2)
x2 = [dw(s) for s, dw in zip(x2, self.depthwise_convs)]
x2 = [sw(s) for s, sw in zip(x2, self.spatial_weighting)]
out = [paddle.concat([s1, s2], axis=1) for s1, s2 in zip(x1, x2)]
out = [channel_shuffle(s, groups=2) for s in out]
return out
class ShuffleUnit(nn.Layer):
def __init__(self,
in_channel,
out_channel,
stride,
norm_type='bn',
freeze_norm=False,
norm_decay=0.):
super(ShuffleUnit, self).__init__()
branch_channel = out_channel // 2
self.stride = stride
if self.stride == 1:
assert in_channel == branch_channel * 2, \
"when stride=1, in_channel {} should equal to branch_channel*2 {}".format(in_channel, branch_channel * 2)
if stride > 1:
self.branch1 = nn.Sequential(
ConvNormLayer(
ch_in=in_channel,
ch_out=in_channel,
filter_size=3,
stride=self.stride,
groups=in_channel,
norm_type=norm_type,
freeze_norm=freeze_norm,
norm_decay=norm_decay),
ConvNormLayer(
ch_in=in_channel,
ch_out=branch_channel,
filter_size=1,
stride=1,
norm_type=norm_type,
act='relu',
freeze_norm=freeze_norm,
norm_decay=norm_decay), )
self.branch2 = nn.Sequential(
ConvNormLayer(
ch_in=branch_channel if stride == 1 else in_channel,
ch_out=branch_channel,
filter_size=1,
stride=1,
norm_type=norm_type,
act='relu',
freeze_norm=freeze_norm,
norm_decay=norm_decay),
ConvNormLayer(
ch_in=branch_channel,
ch_out=branch_channel,
filter_size=3,
stride=self.stride,
groups=branch_channel,
norm_type=norm_type,
freeze_norm=freeze_norm,
norm_decay=norm_decay),
ConvNormLayer(
ch_in=branch_channel,
ch_out=branch_channel,
filter_size=1,
stride=1,
norm_type=norm_type,
act='relu',
freeze_norm=freeze_norm,
norm_decay=norm_decay), )
def forward(self, x):
if self.stride > 1:
x1 = self.branch1(x)
x2 = self.branch2(x)
else:
x1, x2 = x.chunk(2, axis=1)
x2 = self.branch2(x2)
out = paddle.concat([x1, x2], axis=1)
out = channel_shuffle(out, groups=2)
return out
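# Shape sketch (an illustrative addition; sizes are arbitrary assumptions):
# with stride=2 both branches see the full input and the unit halves the
# spatial size; with stride=1 the input is channel-split, only one half is
# transformed, and channel_shuffle re-interleaves the halves.
if __name__ == "__main__":
    unit = ShuffleUnit(in_channel=32, out_channel=64, stride=2)
    print(unit(paddle.rand([1, 32, 16, 16])).shape)  # [1, 64, 8, 8]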
class IterativeHead(nn.Layer):
def __init__(self,
in_channels,
norm_type='bn',
freeze_norm=False,
norm_decay=0.):
super(IterativeHead, self).__init__()
num_branches = len(in_channels)
self.in_channels = in_channels[::-1]
projects = []
for i in range(num_branches):
if i != num_branches - 1:
projects.append(
DepthWiseSeparableConvNormLayer(
ch_in=self.in_channels[i],
ch_out=self.in_channels[i + 1],
filter_size=3,
stride=1,
dw_act=None,
pw_act='relu',
dw_norm_type=norm_type,
pw_norm_type=norm_type,
freeze_norm=freeze_norm,
norm_decay=norm_decay))
else:
projects.append(
DepthWiseSeparableConvNormLayer(
ch_in=self.in_channels[i],
ch_out=self.in_channels[i],
filter_size=3,
stride=1,
dw_act=None,
pw_act='relu',
dw_norm_type=norm_type,
pw_norm_type=norm_type,
freeze_norm=freeze_norm,
norm_decay=norm_decay))
self.projects = nn.LayerList(projects)
def forward(self, x):
x = x[::-1]
y = []
last_x = None
for i, s in enumerate(x):
if last_x is not None:
last_x = F.interpolate(
last_x,
size=s.shape[-2:],
mode='bilinear',
align_corners=True)
s = s + last_x
s = self.projects[i](s)
y.append(s)
last_x = s
return y[::-1]
class Stem(nn.Layer):
def __init__(self,
in_channel,
stem_channel,
out_channel,
expand_ratio,
norm_type='bn',
freeze_norm=False,
norm_decay=0.):
super(Stem, self).__init__()
self.conv1 = ConvNormLayer(
in_channel,
stem_channel,
filter_size=3,
stride=2,
norm_type=norm_type,
act='relu',
freeze_norm=freeze_norm,
norm_decay=norm_decay)
mid_channel = int(round(stem_channel * expand_ratio))
branch_channel = stem_channel // 2
if stem_channel == out_channel:
inc_channel = out_channel - branch_channel
else:
inc_channel = out_channel - stem_channel
self.branch1 = nn.Sequential(
ConvNormLayer(
ch_in=branch_channel,
ch_out=branch_channel,
filter_size=3,
stride=2,
groups=branch_channel,
norm_type=norm_type,
freeze_norm=freeze_norm,
norm_decay=norm_decay),
ConvNormLayer(
ch_in=branch_channel,
ch_out=inc_channel,
filter_size=1,
stride=1,
norm_type=norm_type,
act='relu',
freeze_norm=freeze_norm,
norm_decay=norm_decay), )
self.expand_conv = ConvNormLayer(
ch_in=branch_channel,
ch_out=mid_channel,
filter_size=1,
stride=1,
norm_type=norm_type,
act='relu',
freeze_norm=freeze_norm,
norm_decay=norm_decay)
self.depthwise_conv = ConvNormLayer(
ch_in=mid_channel,
ch_out=mid_channel,
filter_size=3,
stride=2,
groups=mid_channel,
norm_type=norm_type,
freeze_norm=freeze_norm,
norm_decay=norm_decay)
self.linear_conv = ConvNormLayer(
ch_in=mid_channel,
ch_out=branch_channel
if stem_channel == out_channel else stem_channel,
filter_size=1,
stride=1,
norm_type=norm_type,
act='relu',
freeze_norm=freeze_norm,
norm_decay=norm_decay)
def forward(self, x):
x = self.conv1(x)
x1, x2 = x.chunk(2, axis=1)
x1 = self.branch1(x1)
x2 = self.expand_conv(x2)
x2 = self.depthwise_conv(x2)
x2 = self.linear_conv(x2)
out = paddle.concat([x1, x2], axis=1)
out = channel_shuffle(out, groups=2)
return out
class LiteHRNetModule(nn.Layer):
def __init__(self,
num_branches,
num_blocks,
in_channels,
reduce_ratio,
module_type,
multiscale_output=False,
with_fuse=True,
norm_type='bn',
freeze_norm=False,
norm_decay=0.):
super(LiteHRNetModule, self).__init__()
assert num_branches == len(in_channels),\
"num_branches {} should equal to num_in_channels {}".format(num_branches, len(in_channels))
assert module_type in [
'LITE', 'NAIVE'
], "module_type should be one of ['LITE', 'NAIVE']"
self.num_branches = num_branches
self.in_channels = in_channels
self.multiscale_output = multiscale_output
self.with_fuse = with_fuse
        self.norm_type = norm_type
self.module_type = module_type
if self.module_type == 'LITE':
self.layers = self._make_weighting_blocks(
num_blocks,
reduce_ratio,
freeze_norm=freeze_norm,
norm_decay=norm_decay)
elif self.module_type == 'NAIVE':
self.layers = self._make_naive_branches(
num_branches,
num_blocks,
freeze_norm=freeze_norm,
norm_decay=norm_decay)
if self.with_fuse:
self.fuse_layers = self._make_fuse_layers(
freeze_norm=freeze_norm, norm_decay=norm_decay)
self.relu = nn.ReLU()
def _make_weighting_blocks(self,
num_blocks,
reduce_ratio,
stride=1,
freeze_norm=False,
norm_decay=0.):
layers = []
for i in range(num_blocks):
layers.append(
ConditionalChannelWeightingBlock(
self.in_channels,
stride=stride,
reduce_ratio=reduce_ratio,
norm_type=self.norm_type,
freeze_norm=freeze_norm,
norm_decay=norm_decay))
return nn.Sequential(*layers)
def _make_naive_branches(self,
num_branches,
num_blocks,
freeze_norm=False,
norm_decay=0.):
branches = []
for branch_idx in range(num_branches):
layers = []
for i in range(num_blocks):
layers.append(
ShuffleUnit(
self.in_channels[branch_idx],
self.in_channels[branch_idx],
stride=1,
norm_type=self.norm_type,
freeze_norm=freeze_norm,
norm_decay=norm_decay))
branches.append(nn.Sequential(*layers))
return nn.LayerList(branches)
def _make_fuse_layers(self, freeze_norm=False, norm_decay=0.):
if self.num_branches == 1:
return None
fuse_layers = []
num_out_branches = self.num_branches if self.multiscale_output else 1
for i in range(num_out_branches):
fuse_layer = []
for j in range(self.num_branches):
if j > i:
fuse_layer.append(
nn.Sequential(
L.Conv2d(
self.in_channels[j],
self.in_channels[i],
kernel_size=1,
stride=1,
padding=0,
bias=False, ),
nn.BatchNorm2D(self.in_channels[i]),
nn.Upsample(
scale_factor=2**(j - i), mode='nearest')))
elif j == i:
fuse_layer.append(None)
else:
conv_downsamples = []
for k in range(i - j):
if k == i - j - 1:
conv_downsamples.append(
nn.Sequential(
L.Conv2d(
self.in_channels[j],
self.in_channels[j],
kernel_size=3,
stride=2,
padding=1,
groups=self.in_channels[j],
bias=False, ),
nn.BatchNorm2D(self.in_channels[j]),
L.Conv2d(
self.in_channels[j],
self.in_channels[i],
kernel_size=1,
stride=1,
padding=0,
bias=False, ),
nn.BatchNorm2D(self.in_channels[i])))
else:
conv_downsamples.append(
nn.Sequential(
L.Conv2d(
self.in_channels[j],
self.in_channels[j],
kernel_size=3,
stride=2,
padding=1,
groups=self.in_channels[j],
bias=False, ),
nn.BatchNorm2D(self.in_channels[j]),
L.Conv2d(
self.in_channels[j],
self.in_channels[j],
kernel_size=1,
stride=1,
padding=0,
bias=False, ),
nn.BatchNorm2D(self.in_channels[j]),
nn.ReLU()))
fuse_layer.append(nn.Sequential(*conv_downsamples))
fuse_layers.append(nn.LayerList(fuse_layer))
return nn.LayerList(fuse_layers)
def forward(self, x):
if self.num_branches == 1:
return [self.layers[0](x[0])]
if self.module_type == 'LITE':
out = self.layers(x)
elif self.module_type == 'NAIVE':
for i in range(self.num_branches):
x[i] = self.layers[i](x[i])
out = x
if self.with_fuse:
out_fuse = []
for i in range(len(self.fuse_layers)):
y = out[0] if i == 0 else self.fuse_layers[i][0](out[0])
for j in range(self.num_branches):
if j == 0:
y += y
elif i == j:
y += out[j]
else:
y += self.fuse_layers[i][j](out[j])
if i == 0:
out[i] = y
out_fuse.append(self.relu(y))
out = out_fuse
elif not self.multiscale_output:
out = [out[0]]
return out
@register
class LiteHRNet(nn.Layer):
"""
@inproceedings{Yulitehrnet21,
title={Lite-HRNet: A Lightweight High-Resolution Network},
author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong},
booktitle={CVPR},year={2021}
}
Args:
network_type (str): the network_type should be one of ["lite_18", "lite_30", "naive", "wider_naive"],
"naive": Simply combining the shuffle block in ShuffleNet and the highresolution design pattern in HRNet.
"wider_naive": Naive network with wider channels in each block.
"lite_18": Lite-HRNet-18, which replaces the pointwise convolution in a shuffle block by conditional channel weighting.
"lite_30": Lite-HRNet-30, with more blocks compared with Lite-HRNet-18.
freeze_at (int): the stage to freeze
freeze_norm (bool): whether to freeze norm in HRNet
norm_decay (float): weight decay for normalization layer weights
return_idx (List): the stage to return
"""
def __init__(self,
network_type,
freeze_at=0,
freeze_norm=True,
norm_decay=0.,
return_idx=[0, 1, 2, 3]):
super(LiteHRNet, self).__init__()
if isinstance(return_idx, Integral):
return_idx = [return_idx]
assert network_type in ["lite_18", "lite_30", "naive", "wider_naive"], \
"the network_type should be one of [lite_18, lite_30, naive, wider_naive]"
assert len(return_idx) > 0, "need one or more return index"
self.freeze_at = freeze_at
self.freeze_norm = freeze_norm
self.norm_decay = norm_decay
self.return_idx = return_idx
self.norm_type = 'bn'
self.module_configs = {
"lite_18": {
"num_modules": [2, 4, 2],
"num_branches": [2, 3, 4],
"num_blocks": [2, 2, 2],
"module_type": ["LITE", "LITE", "LITE"],
"reduce_ratios": [8, 8, 8],
"num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]],
},
"lite_30": {
"num_modules": [3, 8, 3],
"num_branches": [2, 3, 4],
"num_blocks": [2, 2, 2],
"module_type": ["LITE", "LITE", "LITE"],
"reduce_ratios": [8, 8, 8],
"num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]],
},
"naive": {
"num_modules": [2, 4, 2],
"num_branches": [2, 3, 4],
"num_blocks": [2, 2, 2],
"module_type": ["NAIVE", "NAIVE", "NAIVE"],
"reduce_ratios": [1, 1, 1],
"num_channels": [[30, 60], [30, 60, 120], [30, 60, 120, 240]],
},
"wider_naive": {
"num_modules": [2, 4, 2],
"num_branches": [2, 3, 4],
"num_blocks": [2, 2, 2],
"module_type": ["NAIVE", "NAIVE", "NAIVE"],
"reduce_ratios": [1, 1, 1],
"num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]],
},
}
self.stages_config = self.module_configs[network_type]
self.stem = Stem(3, 32, 32, 1)
num_channels_pre_layer = [32]
for stage_idx in range(3):
num_channels = self.stages_config["num_channels"][stage_idx]
setattr(self, 'transition{}'.format(stage_idx),
self._make_transition_layer(num_channels_pre_layer,
num_channels, self.freeze_norm,
self.norm_decay))
stage, num_channels_pre_layer = self._make_stage(
self.stages_config, stage_idx, num_channels, True,
self.freeze_norm, self.norm_decay)
setattr(self, 'stage{}'.format(stage_idx), stage)
        self.head_layer = IterativeHead(num_channels_pre_layer, 'bn',
                                        self.freeze_norm, self.norm_decay)
        # Inferred fix: `out_shape` below reads `_out_channels`/`_out_strides`,
        # which this file never set (accessing the property would raise
        # AttributeError). The values are derived from the IterativeHead
        # projection above: level 0 keeps num_channels_pre_layer[0] channels
        # and every deeper level i keeps num_channels_pre_layer[i - 1].
        self._out_channels = [num_channels_pre_layer[0]
                              ] + num_channels_pre_layer[:-1]
        self._out_strides = [4, 8, 16, 32]
def _make_transition_layer(self,
num_channels_pre_layer,
num_channels_cur_layer,
freeze_norm=False,
norm_decay=0.):
num_branches_pre = len(num_channels_pre_layer)
num_branches_cur = len(num_channels_cur_layer)
transition_layers = []
for i in range(num_branches_cur):
if i < num_branches_pre:
if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
transition_layers.append(
nn.Sequential(
L.Conv2d(
num_channels_pre_layer[i],
num_channels_pre_layer[i],
kernel_size=3,
stride=1,
padding=1,
groups=num_channels_pre_layer[i],
bias=False),
nn.BatchNorm2D(num_channels_pre_layer[i]),
L.Conv2d(
num_channels_pre_layer[i],
num_channels_cur_layer[i],
kernel_size=1,
stride=1,
padding=0,
bias=False, ),
nn.BatchNorm2D(num_channels_cur_layer[i]),
nn.ReLU()))
else:
transition_layers.append(None)
else:
conv_downsamples = []
for j in range(i + 1 - num_branches_pre):
conv_downsamples.append(
nn.Sequential(
L.Conv2d(
num_channels_pre_layer[-1],
num_channels_pre_layer[-1],
groups=num_channels_pre_layer[-1],
kernel_size=3,
stride=2,
padding=1,
bias=False, ),
nn.BatchNorm2D(num_channels_pre_layer[-1]),
L.Conv2d(
num_channels_pre_layer[-1],
num_channels_cur_layer[i]
if j == i - num_branches_pre else
num_channels_pre_layer[-1],
kernel_size=1,
stride=1,
padding=0,
bias=False, ),
nn.BatchNorm2D(num_channels_cur_layer[i]
if j == i - num_branches_pre else
num_channels_pre_layer[-1]),
nn.ReLU()))
transition_layers.append(nn.Sequential(*conv_downsamples))
return nn.LayerList(transition_layers)
def _make_stage(self,
stages_config,
stage_idx,
in_channels,
multiscale_output,
freeze_norm=False,
norm_decay=0.):
num_modules = stages_config["num_modules"][stage_idx]
num_branches = stages_config["num_branches"][stage_idx]
num_blocks = stages_config["num_blocks"][stage_idx]
reduce_ratio = stages_config['reduce_ratios'][stage_idx]
module_type = stages_config['module_type'][stage_idx]
modules = []
for i in range(num_modules):
if not multiscale_output and i == num_modules - 1:
reset_multiscale_output = False
else:
reset_multiscale_output = True
modules.append(
LiteHRNetModule(
num_branches,
num_blocks,
in_channels,
reduce_ratio,
module_type,
multiscale_output=reset_multiscale_output,
with_fuse=True,
freeze_norm=freeze_norm,
norm_decay=norm_decay))
in_channels = modules[-1].in_channels
return nn.Sequential(*modules), in_channels
def forward(self, inputs):
x = inputs['image']
dims = x.shape
if len(dims) == 5:
x = paddle.reshape(x, (dims[0] * dims[1], dims[2], dims[3],
dims[4])) # [6, 3, 128, 96]
x = self.stem(x)
y_list = [x]
for stage_idx in range(3):
x_list = []
transition = getattr(self, 'transition{}'.format(stage_idx))
for j in range(self.stages_config["num_branches"][stage_idx]):
if transition[j] is not None:
if j >= len(y_list):
x_list.append(transition[j](y_list[-1]))
else:
x_list.append(transition[j](y_list[j]))
else:
x_list.append(y_list[j])
y_list = getattr(self, 'stage{}'.format(stage_idx))(x_list)
x = self.head_layer(y_list)
res = []
for i, layer in enumerate(x):
if i == self.freeze_at:
layer.stop_gradient = True
if i in self.return_idx:
res.append(layer)
return res
@property
def out_shape(self):
return [
ShapeSpec(
channels=self._out_channels[i], stride=self._out_strides[i])
for i in self.return_idx
]
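# Usage sketch (an illustrative addition; the 256x192 input size is an
# arbitrary assumption). Runnable via
# `python -m ppdet.modeling.backbones.lite_hrnet`:
if __name__ == "__main__":
    net = LiteHRNet('lite_18', return_idx=[0, 1, 2, 3])
    feats = net({'image': paddle.rand([1, 3, 256, 192])})
    # Four branches at strides 4/8/16/32; after IterativeHead the channel
    # counts are [40, 40, 80, 160] for the lite_18 configuration.
    print([f.shape for f in feats])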
================================================
FILE: ppdet/modeling/backbones/mobilenet_v1.py
================================================
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import KaimingNormal
from ppdet.core.workspace import register, serializable
from numbers import Integral
from ..shape_spec import ShapeSpec
__all__ = ['MobileNet']
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
num_groups=1,
act='relu',
conv_lr=1.,
conv_decay=0.,
norm_decay=0.,
norm_type='bn',
name=None):
super(ConvBNLayer, self).__init__()
self.act = act
self._conv = nn.Conv2D(
in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=num_groups,
weight_attr=ParamAttr(
learning_rate=conv_lr,
initializer=KaimingNormal(),
regularizer=L2Decay(conv_decay)),
bias_attr=False)
param_attr = ParamAttr(regularizer=L2Decay(norm_decay))
bias_attr = ParamAttr(regularizer=L2Decay(norm_decay))
if norm_type in ['sync_bn', 'bn']:
self._batch_norm = nn.BatchNorm2D(
out_channels, weight_attr=param_attr, bias_attr=bias_attr)
def forward(self, x):
x = self._conv(x)
x = self._batch_norm(x)
if self.act == "relu":
x = F.relu(x)
elif self.act == "relu6":
x = F.relu6(x)
return x
class DepthwiseSeparable(nn.Layer):
def __init__(self,
in_channels,
out_channels1,
out_channels2,
num_groups,
stride,
scale,
conv_lr=1.,
conv_decay=0.,
norm_decay=0.,
norm_type='bn',
name=None):
super(DepthwiseSeparable, self).__init__()
self._depthwise_conv = ConvBNLayer(
in_channels,
int(out_channels1 * scale),
kernel_size=3,
stride=stride,
padding=1,
num_groups=int(num_groups * scale),
conv_lr=conv_lr,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name=name + "_dw")
self._pointwise_conv = ConvBNLayer(
int(out_channels1 * scale),
int(out_channels2 * scale),
kernel_size=1,
stride=1,
padding=0,
conv_lr=conv_lr,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name=name + "_sep")
def forward(self, x):
x = self._depthwise_conv(x)
x = self._pointwise_conv(x)
return x
class ExtraBlock(nn.Layer):
def __init__(self,
in_channels,
out_channels1,
out_channels2,
num_groups=1,
stride=2,
conv_lr=1.,
conv_decay=0.,
norm_decay=0.,
norm_type='bn',
name=None):
super(ExtraBlock, self).__init__()
self.pointwise_conv = ConvBNLayer(
in_channels,
int(out_channels1),
kernel_size=1,
stride=1,
padding=0,
num_groups=int(num_groups),
act='relu6',
conv_lr=conv_lr,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name=name + "_extra1")
self.normal_conv = ConvBNLayer(
int(out_channels1),
int(out_channels2),
kernel_size=3,
stride=stride,
padding=1,
num_groups=int(num_groups),
act='relu6',
conv_lr=conv_lr,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name=name + "_extra2")
def forward(self, x):
x = self.pointwise_conv(x)
x = self.normal_conv(x)
return x
@register
@serializable
class MobileNet(nn.Layer):
__shared__ = ['norm_type']
def __init__(self,
norm_type='bn',
norm_decay=0.,
conv_decay=0.,
scale=1,
conv_learning_rate=1.0,
feature_maps=[4, 6, 13],
with_extra_blocks=False,
extra_block_filters=[[256, 512], [128, 256], [128, 256],
[64, 128]]):
super(MobileNet, self).__init__()
if isinstance(feature_maps, Integral):
feature_maps = [feature_maps]
self.feature_maps = feature_maps
self.with_extra_blocks = with_extra_blocks
self.extra_block_filters = extra_block_filters
self._out_channels = []
self.conv1 = ConvBNLayer(
in_channels=3,
out_channels=int(32 * scale),
kernel_size=3,
stride=2,
padding=1,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv1")
self.dwsl = []
dws21 = self.add_sublayer(
"conv2_1",
sublayer=DepthwiseSeparable(
in_channels=int(32 * scale),
out_channels1=32,
out_channels2=64,
num_groups=32,
stride=1,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv2_1"))
self.dwsl.append(dws21)
self._update_out_channels(int(64 * scale), len(self.dwsl), feature_maps)
dws22 = self.add_sublayer(
"conv2_2",
sublayer=DepthwiseSeparable(
in_channels=int(64 * scale),
out_channels1=64,
out_channels2=128,
num_groups=64,
stride=2,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv2_2"))
self.dwsl.append(dws22)
self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps)
# 1/4
dws31 = self.add_sublayer(
"conv3_1",
sublayer=DepthwiseSeparable(
in_channels=int(128 * scale),
out_channels1=128,
out_channels2=128,
num_groups=128,
stride=1,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv3_1"))
self.dwsl.append(dws31)
self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps)
dws32 = self.add_sublayer(
"conv3_2",
sublayer=DepthwiseSeparable(
in_channels=int(128 * scale),
out_channels1=128,
out_channels2=256,
num_groups=128,
stride=2,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv3_2"))
self.dwsl.append(dws32)
self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps)
# 1/8
dws41 = self.add_sublayer(
"conv4_1",
sublayer=DepthwiseSeparable(
in_channels=int(256 * scale),
out_channels1=256,
out_channels2=256,
num_groups=256,
stride=1,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv4_1"))
self.dwsl.append(dws41)
self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps)
dws42 = self.add_sublayer(
"conv4_2",
sublayer=DepthwiseSeparable(
in_channels=int(256 * scale),
out_channels1=256,
out_channels2=512,
num_groups=256,
stride=2,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv4_2"))
self.dwsl.append(dws42)
self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps)
# 1/16
for i in range(5):
tmp = self.add_sublayer(
"conv5_" + str(i + 1),
sublayer=DepthwiseSeparable(
in_channels=int(512 * scale),
out_channels1=512,
out_channels2=512,
num_groups=512,
stride=1,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv5_" + str(i + 1)))
self.dwsl.append(tmp)
self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps)
dws56 = self.add_sublayer(
"conv5_6",
sublayer=DepthwiseSeparable(
in_channels=int(512 * scale),
out_channels1=512,
out_channels2=1024,
num_groups=512,
stride=2,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv5_6"))
self.dwsl.append(dws56)
self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps)
# 1/32
dws6 = self.add_sublayer(
"conv6",
sublayer=DepthwiseSeparable(
in_channels=int(1024 * scale),
out_channels1=1024,
out_channels2=1024,
num_groups=1024,
stride=1,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv6"))
self.dwsl.append(dws6)
self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps)
if self.with_extra_blocks:
self.extra_blocks = []
for i, block_filter in enumerate(self.extra_block_filters):
in_c = 1024 if i == 0 else self.extra_block_filters[i - 1][1]
conv_extra = self.add_sublayer(
"conv7_" + str(i + 1),
sublayer=ExtraBlock(
in_c,
block_filter[0],
block_filter[1],
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv7_" + str(i + 1)))
self.extra_blocks.append(conv_extra)
self._update_out_channels(
block_filter[1],
len(self.dwsl) + len(self.extra_blocks), feature_maps)
def _update_out_channels(self, channel, feature_idx, feature_maps):
if feature_idx in feature_maps:
self._out_channels.append(channel)
def forward(self, inputs):
outs = []
y = self.conv1(inputs['image'])
for i, block in enumerate(self.dwsl):
y = block(y)
if i + 1 in self.feature_maps:
outs.append(y)
if not self.with_extra_blocks:
return outs
y = outs[-1]
for i, block in enumerate(self.extra_blocks):
idx = i + len(self.dwsl)
y = block(y)
if idx + 1 in self.feature_maps:
outs.append(y)
return outs
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self._out_channels]
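# Usage sketch (an illustrative addition; the input size is an arbitrary
# assumption): feature_maps=[4, 6, 13] taps the stride-8/16/32 stages.
if __name__ == "__main__":
    import paddle
    net = MobileNet(scale=1.0, feature_maps=[4, 6, 13])
    outs = net({'image': paddle.rand([1, 3, 320, 320])})
    # Expect [1, 256, 40, 40], [1, 512, 20, 20], [1, 1024, 10, 10] at scale=1.
    print([o.shape for o in outs])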
================================================
FILE: ppdet/modeling/backbones/mobilenet_v3.py
================================================
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, serializable
from numbers import Integral
from ..shape_spec import ShapeSpec
__all__ = ['MobileNetV3']
def make_divisible(v, divisor=8, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
class ConvBNLayer(nn.Layer):
def __init__(self,
in_c,
out_c,
filter_size,
stride,
padding,
num_groups=1,
act=None,
lr_mult=1.,
conv_decay=0.,
norm_type='bn',
norm_decay=0.,
freeze_norm=False,
name=""):
super(ConvBNLayer, self).__init__()
self.act = act
self.conv = nn.Conv2D(
in_channels=in_c,
out_channels=out_c,
kernel_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
weight_attr=ParamAttr(
learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
bias_attr=False)
norm_lr = 0. if freeze_norm else lr_mult
param_attr = ParamAttr(
learning_rate=norm_lr,
regularizer=L2Decay(norm_decay),
trainable=False if freeze_norm else True)
bias_attr = ParamAttr(
learning_rate=norm_lr,
regularizer=L2Decay(norm_decay),
trainable=False if freeze_norm else True)
global_stats = True if freeze_norm else None
if norm_type in ['sync_bn', 'bn']:
self.bn = nn.BatchNorm2D(
out_c,
weight_attr=param_attr,
bias_attr=bias_attr,
use_global_stats=global_stats)
norm_params = self.bn.parameters()
if freeze_norm:
for param in norm_params:
param.stop_gradient = True
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
if self.act is not None:
if self.act == "relu":
x = F.relu(x)
elif self.act == "relu6":
x = F.relu6(x)
elif self.act == "hard_swish":
x = F.hardswish(x)
else:
                raise NotImplementedError(
                    "Unsupported activation: {}".format(self.act))
return x
class ResidualUnit(nn.Layer):
def __init__(self,
in_c,
mid_c,
out_c,
filter_size,
stride,
use_se,
lr_mult,
conv_decay=0.,
norm_type='bn',
norm_decay=0.,
freeze_norm=False,
act=None,
return_list=False,
name=''):
super(ResidualUnit, self).__init__()
self.if_shortcut = stride == 1 and in_c == out_c
self.use_se = use_se
self.return_list = return_list
self.expand_conv = ConvBNLayer(
in_c=in_c,
out_c=mid_c,
filter_size=1,
stride=1,
padding=0,
act=act,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_expand")
self.bottleneck_conv = ConvBNLayer(
in_c=mid_c,
out_c=mid_c,
filter_size=filter_size,
stride=stride,
padding=int((filter_size - 1) // 2),
num_groups=mid_c,
act=act,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_depthwise")
if self.use_se:
self.mid_se = SEModule(
mid_c, lr_mult, conv_decay, name=name + "_se")
self.linear_conv = ConvBNLayer(
in_c=mid_c,
out_c=out_c,
filter_size=1,
stride=1,
padding=0,
act=None,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_linear")
def forward(self, inputs):
y = self.expand_conv(inputs)
x = self.bottleneck_conv(y)
if self.use_se:
x = self.mid_se(x)
x = self.linear_conv(x)
if self.if_shortcut:
x = paddle.add(inputs, x)
if self.return_list:
return [y, x]
else:
return x
class SEModule(nn.Layer):
def __init__(self, channel, lr_mult, conv_decay, reduction=4, name=""):
super(SEModule, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool2D(1)
mid_channels = int(channel // reduction)
self.conv1 = nn.Conv2D(
in_channels=channel,
out_channels=mid_channels,
kernel_size=1,
stride=1,
padding=0,
weight_attr=ParamAttr(
learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
bias_attr=ParamAttr(
learning_rate=lr_mult, regularizer=L2Decay(conv_decay)))
self.conv2 = nn.Conv2D(
in_channels=mid_channels,
out_channels=channel,
kernel_size=1,
stride=1,
padding=0,
weight_attr=ParamAttr(
learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
bias_attr=ParamAttr(
learning_rate=lr_mult, regularizer=L2Decay(conv_decay)))
def forward(self, inputs):
outputs = self.avg_pool(inputs)
outputs = self.conv1(outputs)
outputs = F.relu(outputs)
outputs = self.conv2(outputs)
outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5)
return paddle.multiply(x=inputs, y=outputs)
class ExtraBlockDW(nn.Layer):
def __init__(self,
in_c,
ch_1,
ch_2,
stride,
lr_mult,
conv_decay=0.,
norm_type='bn',
norm_decay=0.,
freeze_norm=False,
name=None):
super(ExtraBlockDW, self).__init__()
self.pointwise_conv = ConvBNLayer(
in_c=in_c,
out_c=ch_1,
filter_size=1,
stride=1,
padding='SAME',
act='relu6',
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_extra1")
self.depthwise_conv = ConvBNLayer(
in_c=ch_1,
out_c=ch_2,
filter_size=3,
stride=stride,
padding='SAME',
num_groups=int(ch_1),
act='relu6',
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_extra2_dw")
self.normal_conv = ConvBNLayer(
in_c=ch_2,
out_c=ch_2,
filter_size=1,
stride=1,
padding='SAME',
act='relu6',
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_extra2_sep")
def forward(self, inputs):
x = self.pointwise_conv(inputs)
x = self.depthwise_conv(x)
x = self.normal_conv(x)
return x
@register
@serializable
class MobileNetV3(nn.Layer):
__shared__ = ['norm_type']
def __init__(
self,
scale=1.0,
model_name="large",
feature_maps=[6, 12, 15],
with_extra_blocks=False,
extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]],
lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
conv_decay=0.0,
multiplier=1.0,
norm_type='bn',
norm_decay=0.0,
freeze_norm=False):
super(MobileNetV3, self).__init__()
if isinstance(feature_maps, Integral):
feature_maps = [feature_maps]
if norm_type == 'sync_bn' and freeze_norm:
raise ValueError(
"The norm_type should not be sync_bn when freeze_norm is True")
self.feature_maps = feature_maps
self.with_extra_blocks = with_extra_blocks
self.extra_block_filters = extra_block_filters
inplanes = 16
if model_name == "large":
self.cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, False, "relu", 1],
[3, 64, 24, False, "relu", 2],
[3, 72, 24, False, "relu", 1],
[5, 72, 40, True, "relu", 2], # RCNN output
[5, 120, 40, True, "relu", 1],
[5, 120, 40, True, "relu", 1], # YOLOv3 output
[3, 240, 80, False, "hard_swish", 2], # RCNN output
[3, 200, 80, False, "hard_swish", 1],
[3, 184, 80, False, "hard_swish", 1],
[3, 184, 80, False, "hard_swish", 1],
[3, 480, 112, True, "hard_swish", 1],
[3, 672, 112, True, "hard_swish", 1], # YOLOv3 output
[5, 672, 160, True, "hard_swish", 2], # SSD/SSDLite/RCNN output
[5, 960, 160, True, "hard_swish", 1],
[5, 960, 160, True, "hard_swish", 1], # YOLOv3 output
]
elif model_name == "small":
self.cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, True, "relu", 2],
[3, 72, 24, False, "relu", 2], # RCNN output
[3, 88, 24, False, "relu", 1], # YOLOv3 output
[5, 96, 40, True, "hard_swish", 2], # RCNN output
[5, 240, 40, True, "hard_swish", 1],
[5, 240, 40, True, "hard_swish", 1],
[5, 120, 48, True, "hard_swish", 1],
[5, 144, 48, True, "hard_swish", 1], # YOLOv3 output
[5, 288, 96, True, "hard_swish", 2], # SSD/SSDLite/RCNN output
[5, 576, 96, True, "hard_swish", 1],
[5, 576, 96, True, "hard_swish", 1], # YOLOv3 output
]
else:
            raise NotImplementedError(
                "model_name [{}] is not implemented!".format(model_name))
if multiplier != 1.0:
self.cfg[-3][2] = int(self.cfg[-3][2] * multiplier)
self.cfg[-2][1] = int(self.cfg[-2][1] * multiplier)
self.cfg[-2][2] = int(self.cfg[-2][2] * multiplier)
self.cfg[-1][1] = int(self.cfg[-1][1] * multiplier)
self.cfg[-1][2] = int(self.cfg[-1][2] * multiplier)
self.conv1 = ConvBNLayer(
in_c=3,
out_c=make_divisible(inplanes * scale),
filter_size=3,
stride=2,
padding=1,
num_groups=1,
act="hard_swish",
lr_mult=lr_mult_list[0],
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name="conv1")
self._out_channels = []
self.block_list = []
i = 0
inplanes = make_divisible(inplanes * scale)
for (k, exp, c, se, nl, s) in self.cfg:
lr_idx = min(i // 3, len(lr_mult_list) - 1)
lr_mult = lr_mult_list[lr_idx]
# for SSD/SSDLite, first head input is after ResidualUnit expand_conv
return_list = self.with_extra_blocks and i + 2 in self.feature_maps
block = self.add_sublayer(
"conv" + str(i + 2),
sublayer=ResidualUnit(
in_c=inplanes,
mid_c=make_divisible(scale * exp),
out_c=make_divisible(scale * c),
filter_size=k,
stride=s,
use_se=se,
act=nl,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
return_list=return_list,
name="conv" + str(i + 2)))
self.block_list.append(block)
inplanes = make_divisible(scale * c)
i += 1
self._update_out_channels(
make_divisible(scale * exp)
if return_list else inplanes, i + 1, feature_maps)
if self.with_extra_blocks:
self.extra_block_list = []
extra_out_c = make_divisible(scale * self.cfg[-1][1])
lr_idx = min(i // 3, len(lr_mult_list) - 1)
lr_mult = lr_mult_list[lr_idx]
conv_extra = self.add_sublayer(
"conv" + str(i + 2),
sublayer=ConvBNLayer(
in_c=inplanes,
out_c=extra_out_c,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
act="hard_swish",
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name="conv" + str(i + 2)))
self.extra_block_list.append(conv_extra)
i += 1
self._update_out_channels(extra_out_c, i + 1, feature_maps)
for j, block_filter in enumerate(self.extra_block_filters):
                in_c = extra_out_c if j == 0 else \
                    self.extra_block_filters[j - 1][1]
conv_extra = self.add_sublayer(
"conv" + str(i + 2),
sublayer=ExtraBlockDW(
in_c,
block_filter[0],
block_filter[1],
stride=2,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name='conv' + str(i + 2)))
self.extra_block_list.append(conv_extra)
i += 1
self._update_out_channels(block_filter[1], i + 1, feature_maps)
def _update_out_channels(self, channel, feature_idx, feature_maps):
if feature_idx in feature_maps:
self._out_channels.append(channel)
def forward(self, inputs):
x = self.conv1(inputs['image'])
outs = []
for idx, block in enumerate(self.block_list):
x = block(x)
if idx + 2 in self.feature_maps:
if isinstance(x, list):
outs.append(x[0])
x = x[1]
else:
outs.append(x)
if not self.with_extra_blocks:
return outs
for i, block in enumerate(self.extra_block_list):
idx = i + len(self.block_list)
x = block(x)
if idx + 2 in self.feature_maps:
outs.append(x)
return outs
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self._out_channels]
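# Usage sketch (an illustrative addition; the input size is an arbitrary
# assumption): the default feature_maps=[6, 12, 15] tap the stride-8/16/32
# stages of the "large" cfg table above.
if __name__ == "__main__":
    net = MobileNetV3(model_name="large", feature_maps=[6, 12, 15])
    outs = net({'image': paddle.rand([1, 3, 320, 320])})
    # Expect [1, 40, 40, 40], [1, 112, 20, 20], [1, 160, 10, 10] at scale=1.
    print([o.shape for o in outs])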
================================================
FILE: ppdet/modeling/backbones/mobileone.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is the paddle implementation of MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf.
Some code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
The copyright of microsoft/Swin-Transformer is as follows:
MIT License [see LICENSE for details]
"""
import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Normal, Constant
from ppdet.modeling.ops import get_act_fn
from ppdet.modeling.layers import ConvNormLayer
class MobileOneBlock(nn.Layer):
def __init__(
self,
ch_in,
ch_out,
stride,
kernel_size,
conv_num=1,
norm_type='bn',
norm_decay=0.,
norm_groups=32,
bias_on=False,
lr_scale=1.,
freeze_norm=False,
initializer=Normal(
mean=0., std=0.01),
skip_quant=False,
act='relu', ):
super(MobileOneBlock, self).__init__()
self.ch_in = ch_in
self.ch_out = ch_out
self.kernel_size = kernel_size
self.stride = stride
self.padding = (kernel_size - 1) // 2
self.k = conv_num
self.depth_conv = nn.LayerList()
self.point_conv = nn.LayerList()
for _ in range(self.k):
self.depth_conv.append(
ConvNormLayer(
ch_in,
ch_in,
kernel_size,
stride=stride,
groups=ch_in,
norm_type=norm_type,
norm_decay=norm_decay,
norm_groups=norm_groups,
bias_on=bias_on,
lr_scale=lr_scale,
freeze_norm=freeze_norm,
initializer=initializer,
skip_quant=skip_quant))
self.point_conv.append(
ConvNormLayer(
ch_in,
ch_out,
1,
stride=1,
groups=1,
norm_type=norm_type,
norm_decay=norm_decay,
norm_groups=norm_groups,
bias_on=bias_on,
lr_scale=lr_scale,
freeze_norm=freeze_norm,
initializer=initializer,
skip_quant=skip_quant))
self.rbr_1x1 = ConvNormLayer(
ch_in,
ch_in,
1,
stride=self.stride,
groups=ch_in,
norm_type=norm_type,
norm_decay=norm_decay,
norm_groups=norm_groups,
bias_on=bias_on,
lr_scale=lr_scale,
freeze_norm=freeze_norm,
initializer=initializer,
skip_quant=skip_quant)
self.rbr_identity_st1 = nn.BatchNorm2D(
num_features=ch_in,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(
0.0))) if ch_in == ch_out and self.stride == 1 else None
self.rbr_identity_st2 = nn.BatchNorm2D(
num_features=ch_out,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(
0.0))) if ch_in == ch_out and self.stride == 1 else None
self.act = get_act_fn(act) if act is None or isinstance(act, (
str, dict)) else act
def forward(self, x):
if hasattr(self, "conv1") and hasattr(self, "conv2"):
y = self.act(self.conv2(self.act(self.conv1(x))))
else:
if self.rbr_identity_st1 is None:
id_out_st1 = 0
else:
id_out_st1 = self.rbr_identity_st1(x)
x1_1 = 0
for i in range(self.k):
x1_1 += self.depth_conv[i](x)
x1_2 = self.rbr_1x1(x)
x1 = self.act(x1_1 + x1_2 + id_out_st1)
if self.rbr_identity_st2 is None:
id_out_st2 = 0
else:
id_out_st2 = self.rbr_identity_st2(x1)
x2_1 = 0
for i in range(self.k):
x2_1 += self.point_conv[i](x1)
y = self.act(x2_1 + id_out_st2)
return y
def convert_to_deploy(self):
if not hasattr(self, 'conv1'):
self.conv1 = nn.Conv2D(
in_channels=self.ch_in,
out_channels=self.ch_in,
kernel_size=self.kernel_size,
stride=self.stride,
padding=self.padding,
groups=self.ch_in,
bias_attr=ParamAttr(
initializer=Constant(value=0.), learning_rate=1.))
if not hasattr(self, 'conv2'):
self.conv2 = nn.Conv2D(
in_channels=self.ch_in,
out_channels=self.ch_out,
kernel_size=1,
stride=1,
padding='SAME',
groups=1,
bias_attr=ParamAttr(
initializer=Constant(value=0.), learning_rate=1.))
        conv1_kernel, conv1_bias, conv2_kernel, conv2_bias = \
            self.get_equivalent_kernel_bias()
self.conv1.weight.set_value(conv1_kernel)
self.conv1.bias.set_value(conv1_bias)
self.conv2.weight.set_value(conv2_kernel)
self.conv2.bias.set_value(conv2_bias)
self.__delattr__('depth_conv')
self.__delattr__('point_conv')
self.__delattr__('rbr_1x1')
if hasattr(self, 'rbr_identity_st1'):
self.__delattr__('rbr_identity_st1')
if hasattr(self, 'rbr_identity_st2'):
self.__delattr__('rbr_identity_st2')
def get_equivalent_kernel_bias(self):
st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv)
st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
st1_kernelid, st1_biasid = self._fuse_bn_tensor(
self.rbr_identity_st1, kernel_size=self.kernel_size)
st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv)
st2_kernelid, st2_biasid = self._fuse_bn_tensor(
self.rbr_identity_st2, kernel_size=1)
conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor(
st1_kernel1x1) + st1_kernelid
conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid
conv2_kernel = st2_kernel1x1 + st2_kernelid
conv2_bias = st2_bias1x1 + st2_biasid
return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias
def _pad_1x1_to_3x3_tensor(self, kernel1x1):
if kernel1x1 is None:
return 0
else:
padding_size = (self.kernel_size - 1) // 2
return nn.functional.pad(
kernel1x1,
[padding_size, padding_size, padding_size, padding_size])
def _fuse_bn_tensor(self, branch, kernel_size=3):
if branch is None:
return 0, 0
if isinstance(branch, nn.LayerList):
fused_kernels = []
fused_bias = []
for block in branch:
kernel = block.conv.weight
running_mean = block.norm._mean
running_var = block.norm._variance
gamma = block.norm.weight
beta = block.norm.bias
eps = block.norm._epsilon
std = (running_var + eps).sqrt()
t = (gamma / std).reshape((-1, 1, 1, 1))
fused_kernels.append(kernel * t)
fused_bias.append(beta - running_mean * gamma / std)
return sum(fused_kernels), sum(fused_bias)
elif isinstance(branch, ConvNormLayer):
kernel = branch.conv.weight
running_mean = branch.norm._mean
running_var = branch.norm._variance
gamma = branch.norm.weight
beta = branch.norm.bias
eps = branch.norm._epsilon
else:
assert isinstance(branch, nn.BatchNorm2D)
input_dim = self.ch_in if kernel_size == 1 else 1
kernel_value = paddle.zeros(
shape=[self.ch_in, input_dim, kernel_size, kernel_size],
dtype='float32')
if kernel_size > 1:
for i in range(self.ch_in):
kernel_value[i, i % input_dim, (kernel_size - 1) // 2, (
kernel_size - 1) // 2] = 1
elif kernel_size == 1:
for i in range(self.ch_in):
kernel_value[i, i % input_dim, 0, 0] = 1
else:
raise ValueError("Invalid kernel size recieved!")
kernel = paddle.to_tensor(kernel_value, place=branch.weight.place)
running_mean = branch._mean
running_var = branch._variance
gamma = branch.weight
beta = branch.bias
eps = branch._epsilon
std = (running_var + eps).sqrt()
t = (gamma / std).reshape((-1, 1, 1, 1))
return kernel * t, beta - running_mean * gamma / std
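# --- Editor's sketch (not part of the original file): a minimal,
# self-contained check of the Conv+BN fusion performed by _fuse_bn_tensor
# above. In eval mode, y = gamma * (conv(x) - mean) / sqrt(var + eps) + beta,
# which collapses into a single conv with W' = W * gamma / std and
# b' = beta - mean * gamma / std. Layer sizes here are arbitrary.
if __name__ == '__main__':
    conv = nn.Conv2D(8, 8, 3, padding=1, bias_attr=False)
    bn = nn.BatchNorm2D(8)
    # give BN non-trivial statistics so the check is meaningful
    bn._mean.set_value(paddle.randn([8]))
    bn._variance.set_value(paddle.rand([8]) + 0.5)
    bn.weight.set_value(paddle.randn([8]))
    bn.bias.set_value(paddle.randn([8]))
    bn.eval()

    std = (bn._variance + bn._epsilon).sqrt()
    t = (bn.weight / std).reshape((-1, 1, 1, 1))
    fused = nn.Conv2D(8, 8, 3, padding=1)
    fused.weight.set_value(conv.weight * t)
    fused.bias.set_value(bn.bias - bn._mean * bn.weight / std)

    x = paddle.randn([1, 8, 16, 16])
    print(float((bn(conv(x)) - fused(x)).abs().max()))  # ~1e-6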
================================================
FILE: ppdet/modeling/backbones/name_adapter.py
================================================
class NameAdapter(object):
"""Fix the backbones variable names for pretrained weight"""
def __init__(self, model):
super(NameAdapter, self).__init__()
self.model = model
@property
def model_type(self):
return getattr(self.model, '_model_type', '')
@property
def variant(self):
return getattr(self.model, 'variant', '')
def fix_conv_norm_name(self, name):
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
# the naming rule is same as pretrained weight
if self.model_type == 'SEResNeXt':
bn_name = name + "_bn"
return bn_name
def fix_shortcut_name(self, name):
if self.model_type == 'SEResNeXt':
name = 'conv' + name + '_prj'
return name
def fix_bottleneck_name(self, name):
if self.model_type == 'SEResNeXt':
conv_name1 = 'conv' + name + '_x1'
conv_name2 = 'conv' + name + '_x2'
conv_name3 = 'conv' + name + '_x3'
shortcut_name = name
else:
conv_name1 = name + "_branch2a"
conv_name2 = name + "_branch2b"
conv_name3 = name + "_branch2c"
shortcut_name = name + "_branch1"
return conv_name1, conv_name2, conv_name3, shortcut_name
def fix_basicblock_name(self, name):
if self.model_type == 'SEResNeXt':
conv_name1 = 'conv' + name + '_x1'
conv_name2 = 'conv' + name + '_x2'
shortcut_name = name
else:
conv_name1 = name + "_branch2a"
conv_name2 = name + "_branch2b"
shortcut_name = name + "_branch1"
return conv_name1, conv_name2, shortcut_name
def fix_layer_warp_name(self, stage_num, count, i):
name = 'res' + str(stage_num)
if count > 10 and stage_num == 4:
if i == 0:
conv_name = name + "a"
else:
conv_name = name + "b" + str(i)
else:
conv_name = name + chr(ord("a") + i)
if self.model_type == 'SEResNeXt':
conv_name = str(stage_num + 2) + '_' + str(i + 1)
return conv_name
def fix_c1_stage_name(self):
return "res_conv1" if self.model_type == 'ResNeXt' else "conv1"
================================================
FILE: ppdet/modeling/backbones/res2net.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from numbers import Integral
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
from .resnet import ConvNormLayer
__all__ = ['Res2Net', 'Res2NetC5']
Res2Net_cfg = {
50: [3, 4, 6, 3],
101: [3, 4, 23, 3],
152: [3, 8, 36, 3],
200: [3, 12, 48, 3]
}
class BottleNeck(nn.Layer):
def __init__(self,
ch_in,
ch_out,
stride,
shortcut,
width,
scales=4,
variant='b',
groups=1,
lr=1.0,
norm_type='bn',
norm_decay=0.,
freeze_norm=True,
dcn_v2=False):
super(BottleNeck, self).__init__()
self.shortcut = shortcut
self.scales = scales
self.stride = stride
if not shortcut:
if variant == 'd' and stride == 2:
self.branch1 = nn.Sequential()
self.branch1.add_sublayer(
'pool',
nn.AvgPool2D(
kernel_size=2, stride=2, padding=0, ceil_mode=True))
self.branch1.add_sublayer(
'conv',
ConvNormLayer(
ch_in=ch_in,
ch_out=ch_out,
filter_size=1,
stride=1,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr))
else:
self.branch1 = ConvNormLayer(
ch_in=ch_in,
ch_out=ch_out,
filter_size=1,
stride=stride,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr)
self.branch2a = ConvNormLayer(
ch_in=ch_in,
ch_out=width * scales,
filter_size=1,
stride=stride if variant == 'a' else 1,
groups=1,
act='relu',
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr)
self.branch2b = nn.LayerList([
ConvNormLayer(
ch_in=width,
ch_out=width,
filter_size=3,
stride=1 if variant == 'a' else stride,
groups=groups,
act='relu',
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr,
dcn_v2=dcn_v2) for _ in range(self.scales - 1)
])
self.branch2c = ConvNormLayer(
ch_in=width * scales,
ch_out=ch_out,
filter_size=1,
stride=1,
groups=1,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr)
def forward(self, inputs):
out = self.branch2a(inputs)
feature_split = paddle.split(out, self.scales, 1)
out_split = []
for i in range(self.scales - 1):
if i == 0 or self.stride == 2:
out_split.append(self.branch2b[i](feature_split[i]))
else:
out_split.append(self.branch2b[i](paddle.add(feature_split[i],
out_split[-1])))
if self.stride == 1:
out_split.append(feature_split[-1])
else:
out_split.append(F.avg_pool2d(feature_split[-1], 3, self.stride, 1))
out = self.branch2c(paddle.concat(out_split, 1))
if self.shortcut:
short = inputs
else:
short = self.branch1(inputs)
out = paddle.add(out, short)
out = F.relu(out)
return out
class Blocks(nn.Layer):
def __init__(self,
ch_in,
ch_out,
count,
stage_num,
width,
scales=4,
variant='b',
groups=1,
lr=1.0,
norm_type='bn',
norm_decay=0.,
freeze_norm=True,
dcn_v2=False):
super(Blocks, self).__init__()
self.blocks = nn.Sequential()
for i in range(count):
self.blocks.add_sublayer(
str(i),
BottleNeck(
ch_in=ch_in if i == 0 else ch_out,
ch_out=ch_out,
stride=2 if i == 0 and stage_num != 2 else 1,
shortcut=False if i == 0 else True,
width=width * (2**(stage_num - 2)),
scales=scales,
variant=variant,
groups=groups,
lr=lr,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
dcn_v2=dcn_v2))
def forward(self, inputs):
return self.blocks(inputs)
@register
@serializable
class Res2Net(nn.Layer):
"""
Res2Net, see https://arxiv.org/abs/1904.01169
Args:
depth (int): Res2Net depth, should be 50, 101, 152, 200.
width (int): base width of each scale branch (default 26)
scales (int): number of scale splits in each bottleneck (default 4)
variant (str): Res2Net variant, supports 'a', 'b', 'c', 'd' currently
lr_mult_list (list): learning rate ratio of different resnet stages (2, 3, 4, 5);
a lower ratio is needed for pretrained models obtained by
distillation (default [1.0, 1.0, 1.0, 1.0]).
groups (int): number of groups for the conv layers.
norm_type (str): normalization type, 'bn' or 'sync_bn'
norm_decay (float): weight decay for normalization layer weights
freeze_norm (bool): freeze normalization layers
freeze_at (int): freeze the backbone at which stage
return_idx (list): index of stages whose feature maps are returned,
index 0 stands for res2
dcn_v2_stages (list): indices of stages that use deformable conv v2
num_stages (int): number of stages created
"""
__shared__ = ['norm_type']
def __init__(self,
depth=50,
width=26,
scales=4,
variant='b',
lr_mult_list=[1.0, 1.0, 1.0, 1.0],
groups=1,
norm_type='bn',
norm_decay=0.,
freeze_norm=True,
freeze_at=0,
return_idx=[0, 1, 2, 3],
dcn_v2_stages=[-1],
num_stages=4):
super(Res2Net, self).__init__()
self._model_type = 'Res2Net' if groups == 1 else 'Res2NeXt'
assert depth in [50, 101, 152, 200], \
"depth {} not in [50, 101, 152, 200]".format(depth)
assert variant in ['a', 'b', 'c', 'd'], "invalid Res2Net variant"
assert num_stages >= 1 and num_stages <= 4
self.depth = depth
self.variant = variant
self.norm_type = norm_type
self.norm_decay = norm_decay
self.freeze_norm = freeze_norm
self.freeze_at = freeze_at
if isinstance(return_idx, Integral):
return_idx = [return_idx]
assert max(return_idx) < num_stages, \
'the maximum return index must be smaller than num_stages, ' \
'but received maximum return index is {} and num_stages ' \
'is {}'.format(max(return_idx), num_stages)
self.return_idx = return_idx
self.num_stages = num_stages
assert len(lr_mult_list) == 4, \
"lr_mult_list length must be 4 but got {}".format(len(lr_mult_list))
if isinstance(dcn_v2_stages, Integral):
dcn_v2_stages = [dcn_v2_stages]
assert max(dcn_v2_stages) < num_stages
self.dcn_v2_stages = dcn_v2_stages
block_nums = Res2Net_cfg[depth]
# C1 stage
if self.variant in ['c', 'd']:
conv_def = [
[3, 32, 3, 2, "conv1_1"],
[32, 32, 3, 1, "conv1_2"],
[32, 64, 3, 1, "conv1_3"],
]
else:
conv_def = [[3, 64, 7, 2, "conv1"]]
self.res1 = nn.Sequential()
for (c_in, c_out, k, s, _name) in conv_def:
self.res1.add_sublayer(
_name,
ConvNormLayer(
ch_in=c_in,
ch_out=c_out,
filter_size=k,
stride=s,
groups=1,
act='relu',
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=1.0))
self._in_channels = [64, 256, 512, 1024]
self._out_channels = [256, 512, 1024, 2048]
self._out_strides = [4, 8, 16, 32]
# C2-C5 stages
self.res_layers = []
for i in range(num_stages):
lr_mult = lr_mult_list[i]
stage_num = i + 2
self.res_layers.append(
self.add_sublayer(
"res{}".format(stage_num),
Blocks(
self._in_channels[i],
self._out_channels[i],
count=block_nums[i],
stage_num=stage_num,
width=width,
scales=scales,
groups=groups,
lr=lr_mult,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
dcn_v2=(i in self.dcn_v2_stages))))
@property
def out_shape(self):
return [
ShapeSpec(
channels=self._out_channels[i], stride=self._out_strides[i])
for i in self.return_idx
]
def forward(self, inputs):
x = inputs['image']
res1 = self.res1(x)
x = F.max_pool2d(res1, kernel_size=3, stride=2, padding=1)
outs = []
for idx, stage in enumerate(self.res_layers):
x = stage(x)
if idx == self.freeze_at:
x.stop_gradient = True
if idx in self.return_idx:
outs.append(x)
return outs
@register
class Res2NetC5(nn.Layer):
def __init__(self, depth=50, width=26, scales=4, variant='b'):
super(Res2NetC5, self).__init__()
feat_in, feat_out = [1024, 2048]
self.res5 = Blocks(
feat_in,
feat_out,
count=3,
stage_num=5,
width=width,
scales=scales,
variant=variant)
self.feat_out = feat_out
@property
def out_shape(self):
return [ShapeSpec(
channels=self.feat_out,
stride=32, )]
def forward(self, roi_feat, stage=0):
y = self.res5(roi_feat)
return y
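# --- Editor's sketch (not part of the original file): the hierarchical split
# inside BottleNeck.forward above, in plain paddle. With scales=4, split i
# receives the convolved output of split i-1 before its own 3x3 conv, which
# widens the receptive field per block; the last split passes through
# unchanged when stride == 1. Random convs stand in for branch2b.
if __name__ == '__main__':
    scales, width = 4, 16
    x = paddle.randn([2, scales * width, 32, 32])
    convs = nn.LayerList(
        [nn.Conv2D(width, width, 3, padding=1) for _ in range(scales - 1)])

    splits = paddle.split(x, scales, axis=1)
    outs = [convs[0](splits[0])]
    for i in range(1, scales - 1):
        outs.append(convs[i](splits[i] + outs[-1]))
    outs.append(splits[-1])  # identity branch (stride == 1 case)
    print(paddle.concat(outs, axis=1).shape)  # [2, 64, 32, 32]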
================================================
FILE: ppdet/modeling/backbones/resnet.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from numbers import Integral
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Uniform
from paddle import ParamAttr
from paddle.nn.initializer import Constant
from paddle.vision.ops import DeformConv2D
from .name_adapter import NameAdapter
from ..shape_spec import ShapeSpec
__all__ = ['ResNet', 'Res5Head', 'Blocks', 'BasicBlock', 'BottleNeck']
ResNet_cfg = {
18: [2, 2, 2, 2],
34: [3, 4, 6, 3],
50: [3, 4, 6, 3],
101: [3, 4, 23, 3],
152: [3, 8, 36, 3],
}
class ConvNormLayer(nn.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size,
stride,
groups=1,
act=None,
norm_type='bn',
norm_decay=0.,
freeze_norm=True,
lr=1.0,
dcn_v2=False):
super(ConvNormLayer, self).__init__()
assert norm_type in ['bn', 'sync_bn']
self.norm_type = norm_type
self.act = act
self.dcn_v2 = dcn_v2
if not self.dcn_v2:
self.conv = nn.Conv2D(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
weight_attr=ParamAttr(learning_rate=lr),
bias_attr=False)
else:
self.offset_channel = 2 * filter_size**2
self.mask_channel = filter_size**2
self.conv_offset = nn.Conv2D(
in_channels=ch_in,
out_channels=3 * filter_size**2,
kernel_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
weight_attr=ParamAttr(initializer=Constant(0.)),
bias_attr=ParamAttr(initializer=Constant(0.)))
self.conv = DeformConv2D(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
dilation=1,
groups=groups,
weight_attr=ParamAttr(learning_rate=lr),
bias_attr=False)
norm_lr = 0. if freeze_norm else lr
param_attr = ParamAttr(
learning_rate=norm_lr,
regularizer=L2Decay(norm_decay),
trainable=False if freeze_norm else True)
bias_attr = ParamAttr(
learning_rate=norm_lr,
regularizer=L2Decay(norm_decay),
trainable=False if freeze_norm else True)
global_stats = True if freeze_norm else None
if norm_type in ['sync_bn', 'bn']:
self.norm = nn.BatchNorm2D(
ch_out,
weight_attr=param_attr,
bias_attr=bias_attr,
use_global_stats=global_stats)
norm_params = self.norm.parameters()
if freeze_norm:
for param in norm_params:
param.stop_gradient = True
def forward(self, inputs):
if not self.dcn_v2:
out = self.conv(inputs)
else:
offset_mask = self.conv_offset(inputs)
offset, mask = paddle.split(
offset_mask,
num_or_sections=[self.offset_channel, self.mask_channel],
axis=1)
mask = F.sigmoid(mask)
out = self.conv(inputs, offset, mask=mask)
if self.norm_type in ['bn', 'sync_bn']:
out = self.norm(out)
if self.act:
out = getattr(F, self.act)(out)
return out
class SELayer(nn.Layer):
def __init__(self, ch, reduction_ratio=16):
super(SELayer, self).__init__()
self.pool = nn.AdaptiveAvgPool2D(1)
stdv = 1.0 / math.sqrt(ch)
c_ = ch // reduction_ratio
self.squeeze = nn.Linear(
ch,
c_,
weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)),
bias_attr=True)
stdv = 1.0 / math.sqrt(c_)
self.extract = nn.Linear(
c_,
ch,
weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)),
bias_attr=True)
def forward(self, inputs):
out = self.pool(inputs)
out = paddle.squeeze(out, axis=[2, 3])
out = self.squeeze(out)
out = F.relu(out)
out = self.extract(out)
out = F.sigmoid(out)
out = paddle.unsqueeze(out, axis=[2, 3])
scale = out * inputs
return scale
class BasicBlock(nn.Layer):
expansion = 1
def __init__(self,
ch_in,
ch_out,
stride,
shortcut,
variant='b',
groups=1,
base_width=64,
lr=1.0,
norm_type='bn',
norm_decay=0.,
freeze_norm=True,
dcn_v2=False,
std_senet=False):
super(BasicBlock, self).__init__()
assert groups == 1 and base_width == 64, 'BasicBlock only supports groups=1 and base_width=64'
self.shortcut = shortcut
if not shortcut:
if variant == 'd' and stride == 2:
self.short = nn.Sequential()
self.short.add_sublayer(
'pool',
nn.AvgPool2D(
kernel_size=2, stride=2, padding=0, ceil_mode=True))
self.short.add_sublayer(
'conv',
ConvNormLayer(
ch_in=ch_in,
ch_out=ch_out,
filter_size=1,
stride=1,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr))
else:
self.short = ConvNormLayer(
ch_in=ch_in,
ch_out=ch_out,
filter_size=1,
stride=stride,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr)
self.branch2a = ConvNormLayer(
ch_in=ch_in,
ch_out=ch_out,
filter_size=3,
stride=stride,
act='relu',
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr)
self.branch2b = ConvNormLayer(
ch_in=ch_out,
ch_out=ch_out,
filter_size=3,
stride=1,
act=None,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr,
dcn_v2=dcn_v2)
self.std_senet = std_senet
if self.std_senet:
self.se = SELayer(ch_out)
def forward(self, inputs):
out = self.branch2a(inputs)
out = self.branch2b(out)
if self.std_senet:
out = self.se(out)
if self.shortcut:
short = inputs
else:
short = self.short(inputs)
out = paddle.add(x=out, y=short)
out = F.relu(out)
return out
class BottleNeck(nn.Layer):
expansion = 4
def __init__(self,
ch_in,
ch_out,
stride,
shortcut,
variant='b',
groups=1,
base_width=4,
lr=1.0,
norm_type='bn',
norm_decay=0.,
freeze_norm=True,
dcn_v2=False,
std_senet=False):
super(BottleNeck, self).__init__()
if variant == 'a':
stride1, stride2 = stride, 1
else:
stride1, stride2 = 1, stride
# ResNeXt
width = int(ch_out * (base_width / 64.)) * groups
self.branch2a = ConvNormLayer(
ch_in=ch_in,
ch_out=width,
filter_size=1,
stride=stride1,
groups=1,
act='relu',
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr)
self.branch2b = ConvNormLayer(
ch_in=width,
ch_out=width,
filter_size=3,
stride=stride2,
groups=groups,
act='relu',
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr,
dcn_v2=dcn_v2)
self.branch2c = ConvNormLayer(
ch_in=width,
ch_out=ch_out * self.expansion,
filter_size=1,
stride=1,
groups=1,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr)
self.shortcut = shortcut
if not shortcut:
if variant == 'd' and stride == 2:
self.short = nn.Sequential()
self.short.add_sublayer(
'pool',
nn.AvgPool2D(
kernel_size=2, stride=2, padding=0, ceil_mode=True))
self.short.add_sublayer(
'conv',
ConvNormLayer(
ch_in=ch_in,
ch_out=ch_out * self.expansion,
filter_size=1,
stride=1,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr))
else:
self.short = ConvNormLayer(
ch_in=ch_in,
ch_out=ch_out * self.expansion,
filter_size=1,
stride=stride,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr)
self.std_senet = std_senet
if self.std_senet:
self.se = SELayer(ch_out * self.expansion)
def forward(self, inputs):
out = self.branch2a(inputs)
out = self.branch2b(out)
out = self.branch2c(out)
if self.std_senet:
out = self.se(out)
if self.shortcut:
short = inputs
else:
short = self.short(inputs)
out = paddle.add(x=out, y=short)
out = F.relu(out)
return out
class Blocks(nn.Layer):
def __init__(self,
block,
ch_in,
ch_out,
count,
name_adapter,
stage_num,
variant='b',
groups=1,
base_width=64,
lr=1.0,
norm_type='bn',
norm_decay=0.,
freeze_norm=True,
dcn_v2=False,
std_senet=False):
super(Blocks, self).__init__()
self.blocks = []
for i in range(count):
conv_name = name_adapter.fix_layer_warp_name(stage_num, count, i)
layer = self.add_sublayer(
conv_name,
block(
ch_in=ch_in,
ch_out=ch_out,
stride=2 if i == 0 and stage_num != 2 else 1,
shortcut=False if i == 0 else True,
variant=variant,
groups=groups,
base_width=base_width,
lr=lr,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
dcn_v2=dcn_v2,
std_senet=std_senet))
self.blocks.append(layer)
if i == 0:
ch_in = ch_out * block.expansion
def forward(self, inputs):
block_out = inputs
for block in self.blocks:
block_out = block(block_out)
return block_out
@register
@serializable
class ResNet(nn.Layer):
__shared__ = ['norm_type']
def __init__(self,
depth=50,
ch_in=64,
variant='b',
lr_mult_list=[1.0, 1.0, 1.0, 1.0],
groups=1,
base_width=64,
norm_type='bn',
norm_decay=0,
freeze_norm=True,
freeze_at=0,
return_idx=[0, 1, 2, 3],
dcn_v2_stages=[-1],
num_stages=4,
std_senet=False,
freeze_stem_only=False):
"""
Residual Network, see https://arxiv.org/abs/1512.03385
Args:
depth (int): ResNet depth, should be 18, 34, 50, 101, 152.
ch_in (int): output channel of first stage, default 64
variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
lr_mult_list (list): learning rate ratio of different resnet stages (2, 3, 4, 5);
a lower ratio is needed for pretrained models obtained by
distillation (default [1.0, 1.0, 1.0, 1.0]).
groups (int): group convolution cardinality
base_width (int): base width of each group convolution
norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'
norm_decay (float): weight decay for normalization layer weights
freeze_norm (bool): freeze normalization layers
freeze_at (int): freeze the backbone at which stage
return_idx (list): index of the stages whose feature maps are returned
dcn_v2_stages (list): indices of stages that use deformable conv v2
num_stages (int): total num of stages
std_senet (bool): whether to use SELayer, default False
freeze_stem_only (bool): when True, freeze only the stem (conv1) and leave the residual stages trainable, default False
"""
super(ResNet, self).__init__()
self._model_type = 'ResNet' if groups == 1 else 'ResNeXt'
assert num_stages >= 1 and num_stages <= 4
self.depth = depth
self.variant = variant
self.groups = groups
self.base_width = base_width
self.norm_type = norm_type
self.norm_decay = norm_decay
self.freeze_norm = freeze_norm
self.freeze_at = freeze_at
if isinstance(return_idx, Integral):
return_idx = [return_idx]
assert max(return_idx) < num_stages, \
'the maximum return index must be smaller than num_stages, ' \
'but received maximum return index is {} and num_stages ' \
'is {}'.format(max(return_idx), num_stages)
self.return_idx = return_idx
self.num_stages = num_stages
assert len(lr_mult_list) == 4, \
"lr_mult_list length must be 4 but got {}".format(len(lr_mult_list))
if isinstance(dcn_v2_stages, Integral):
dcn_v2_stages = [dcn_v2_stages]
assert max(dcn_v2_stages) < num_stages
self.dcn_v2_stages = dcn_v2_stages
block_nums = ResNet_cfg[depth]
na = NameAdapter(self)
conv1_name = na.fix_c1_stage_name()
if variant in ['c', 'd']:
conv_def = [
[3, ch_in // 2, 3, 2, "conv1_1"],
[ch_in // 2, ch_in // 2, 3, 1, "conv1_2"],
[ch_in // 2, ch_in, 3, 1, "conv1_3"],
]
else:
conv_def = [[3, ch_in, 7, 2, conv1_name]]
self.conv1 = nn.Sequential()
for (c_in, c_out, k, s, _name) in conv_def:
self.conv1.add_sublayer(
_name,
ConvNormLayer(
ch_in=c_in,
ch_out=c_out,
filter_size=k,
stride=s,
groups=1,
act='relu',
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=1.0))
self.ch_in = ch_in
ch_out_list = [64, 128, 256, 512]
block = BottleNeck if depth >= 50 else BasicBlock
self._out_channels = [block.expansion * v for v in ch_out_list]
self._out_strides = [4, 8, 16, 32]
self.res_layers = []
for i in range(num_stages):
lr_mult = lr_mult_list[i]
stage_num = i + 2
res_name = "res{}".format(stage_num)
res_layer = self.add_sublayer(
res_name,
Blocks(
block,
self.ch_in,
ch_out_list[i],
count=block_nums[i],
name_adapter=na,
stage_num=stage_num,
variant=variant,
groups=groups,
base_width=base_width,
lr=lr_mult,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
dcn_v2=(i in self.dcn_v2_stages),
std_senet=std_senet))
self.res_layers.append(res_layer)
self.ch_in = self._out_channels[i]
if freeze_at >= 0:
self._freeze_parameters(self.conv1)
if not freeze_stem_only:
for i in range(min(freeze_at + 1, num_stages)):
self._freeze_parameters(self.res_layers[i])
def _freeze_parameters(self, m):
for p in m.parameters():
p.stop_gradient = True
@property
def out_shape(self):
return [
ShapeSpec(
channels=self._out_channels[i], stride=self._out_strides[i])
for i in self.return_idx
]
def forward(self, inputs):
x = inputs['image']
conv1 = self.conv1(x)
x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1)
outs = []
for idx, stage in enumerate(self.res_layers):
x = stage(x)
if idx in self.return_idx:
outs.append(x)
return outs
@register
class Res5Head(nn.Layer):
def __init__(self, depth=50):
super(Res5Head, self).__init__()
feat_in, feat_out = [1024, 512]
if depth < 50:
feat_in = 256
na = NameAdapter(self)
block = BottleNeck if depth >= 50 else BasicBlock
self.res5 = Blocks(
block, feat_in, feat_out, count=3, name_adapter=na, stage_num=5)
self.feat_out = feat_out if depth < 50 else feat_out * 4
@property
def out_shape(self):
return [ShapeSpec(
channels=self.feat_out,
stride=16, )]
def forward(self, roi_feat, stage=0):
y = self.res5(roi_feat)
return y
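# --- Editor's sketch (not part of the original file): minimal backbone usage,
# assuming a working paddle install. forward() expects a dict with an 'image'
# key; numbers below are for the default ResNet-50 config.
if __name__ == '__main__':
    model = ResNet(depth=50, return_idx=[1, 2, 3])
    print([(s.channels, s.stride) for s in model.out_shape])
    # [(512, 8), (1024, 16), (2048, 32)]
    feats = model({'image': paddle.randn([1, 3, 224, 224])})
    print([f.shape for f in feats])
    # [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]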
================================================
FILE: ppdet/modeling/backbones/senet.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.nn as nn
from ppdet.core.workspace import register, serializable
from .resnet import ResNet, Blocks, BasicBlock, BottleNeck
from ..shape_spec import ShapeSpec
from .name_adapter import NameAdapter
__all__ = ['SENet', 'SERes5Head']
@register
@serializable
class SENet(ResNet):
__shared__ = ['norm_type']
def __init__(self,
depth=50,
variant='b',
lr_mult_list=[1.0, 1.0, 1.0, 1.0],
groups=1,
base_width=64,
norm_type='bn',
norm_decay=0,
freeze_norm=True,
freeze_at=0,
return_idx=[0, 1, 2, 3],
dcn_v2_stages=[-1],
std_senet=True,
num_stages=4):
"""
Squeeze-and-Excitation Networks, see https://arxiv.org/abs/1709.01507
Args:
depth (int): SENet depth, should be 50, 101, 152
variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
lr_mult_list (list): learning rate ratio of different resnet stages (2, 3, 4, 5);
a lower ratio is needed for pretrained models obtained by
distillation (default [1.0, 1.0, 1.0, 1.0]).
groups (int): group convolution cardinality
base_width (int): base width of each group convolution
norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'
norm_decay (float): weight decay for normalization layer weights
freeze_norm (bool): freeze normalization layers
freeze_at (int): freeze the backbone at which stage
return_idx (list): index of the stages whose feature maps are returned
dcn_v2_stages (list): indices of stages that use deformable conv v2
std_senet (bool): whether to use SELayer, default True
num_stages (int): total num of stages
"""
super(SENet, self).__init__(
depth=depth,
variant=variant,
lr_mult_list=lr_mult_list,
ch_in=128,
groups=groups,
base_width=base_width,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
freeze_at=freeze_at,
return_idx=return_idx,
dcn_v2_stages=dcn_v2_stages,
std_senet=std_senet,
num_stages=num_stages)
@register
class SERes5Head(nn.Layer):
def __init__(self,
depth=50,
variant='b',
lr_mult=1.0,
groups=1,
base_width=64,
norm_type='bn',
norm_decay=0,
dcn_v2=False,
freeze_norm=False,
std_senet=True):
"""
SERes5Head layer
Args:
depth (int): SENet depth, should be 50, 101, 152
variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
lr_mult (float): learning rate ratio of SERes5Head, default 1.0.
groups (int): group convolution cardinality
base_width (int): base width of each group convolution
norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'
norm_decay (float): weight decay for normalization layer weights
dcn_v2 (bool): whether to use deformable conv v2, default False
std_senet (bool): whether to use SELayer, default True
"""
super(SERes5Head, self).__init__()
ch_out = 512
ch_in = 256 if depth < 50 else 1024
na = NameAdapter(self)
block = BottleNeck if depth >= 50 else BasicBlock
self.res5 = Blocks(
block,
ch_in,
ch_out,
count=3,
name_adapter=na,
stage_num=5,
variant=variant,
groups=groups,
base_width=base_width,
lr=lr_mult,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
dcn_v2=dcn_v2,
std_senet=std_senet)
self.ch_out = ch_out * block.expansion
@property
def out_shape(self):
return [ShapeSpec(
channels=self.ch_out,
stride=16, )]
def forward(self, roi_feat):
y = self.res5(roi_feat)
return y
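# --- Editor's sketch (not part of the original file): the squeeze-and-
# excitation gating that std_senet enables (see SELayer in resnet.py), shown
# with plain tensors and random weights. Channels are re-weighted by a gate
# in (0, 1) computed from globally pooled statistics.
if __name__ == '__main__':
    import paddle
    import paddle.nn.functional as F

    x = paddle.randn([2, 64, 14, 14])
    w1 = paddle.randn([64, 4])   # squeeze: 64 -> 64 // 16
    w2 = paddle.randn([4, 64])   # excite: back to 64
    s = x.mean(axis=[2, 3])                 # global average pool -> [2, 64]
    gate = F.sigmoid(F.relu(s @ w1) @ w2)   # per-channel gate in (0, 1)
    y = x * gate.unsqueeze(-1).unsqueeze(-1)
    print(y.shape)  # [2, 64, 14, 14]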
================================================
FILE: ppdet/modeling/backbones/shufflenet_v2.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
from paddle import ParamAttr
import paddle.nn.functional as F
from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm2D
from paddle.nn.initializer import KaimingNormal
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, serializable
from numbers import Integral
from ..shape_spec import ShapeSpec
from ppdet.modeling.ops import channel_shuffle
__all__ = ['ShuffleNetV2']
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
groups=1,
act=None):
super(ConvBNLayer, self).__init__()
self._conv = Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=groups,
weight_attr=ParamAttr(initializer=KaimingNormal()),
bias_attr=False)
self._batch_norm = BatchNorm2D(
out_channels,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
if act == "hard_swish":
act = 'hardswish'
self.act = act
def forward(self, inputs):
y = self._conv(inputs)
y = self._batch_norm(y)
if self.act:
y = getattr(F, self.act)(y)
return y
class InvertedResidual(nn.Layer):
def __init__(self, in_channels, out_channels, stride, act="relu"):
super(InvertedResidual, self).__init__()
self._conv_pw = ConvBNLayer(
in_channels=in_channels // 2,
out_channels=out_channels // 2,
kernel_size=1,
stride=1,
padding=0,
groups=1,
act=act)
self._conv_dw = ConvBNLayer(
in_channels=out_channels // 2,
out_channels=out_channels // 2,
kernel_size=3,
stride=stride,
padding=1,
groups=out_channels // 2,
act=None)
self._conv_linear = ConvBNLayer(
in_channels=out_channels // 2,
out_channels=out_channels // 2,
kernel_size=1,
stride=1,
padding=0,
groups=1,
act=act)
def forward(self, inputs):
x1, x2 = paddle.split(
inputs,
num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2],
axis=1)
x2 = self._conv_pw(x2)
x2 = self._conv_dw(x2)
x2 = self._conv_linear(x2)
out = paddle.concat([x1, x2], axis=1)
return channel_shuffle(out, 2)
class InvertedResidualDS(nn.Layer):
def __init__(self, in_channels, out_channels, stride, act="relu"):
super(InvertedResidualDS, self).__init__()
# branch1
self._conv_dw_1 = ConvBNLayer(
in_channels=in_channels,
out_channels=in_channels,
kernel_size=3,
stride=stride,
padding=1,
groups=in_channels,
act=None)
self._conv_linear_1 = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels // 2,
kernel_size=1,
stride=1,
padding=0,
groups=1,
act=act)
# branch2
self._conv_pw_2 = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels // 2,
kernel_size=1,
stride=1,
padding=0,
groups=1,
act=act)
self._conv_dw_2 = ConvBNLayer(
in_channels=out_channels // 2,
out_channels=out_channels // 2,
kernel_size=3,
stride=stride,
padding=1,
groups=out_channels // 2,
act=None)
self._conv_linear_2 = ConvBNLayer(
in_channels=out_channels // 2,
out_channels=out_channels // 2,
kernel_size=1,
stride=1,
padding=0,
groups=1,
act=act)
def forward(self, inputs):
x1 = self._conv_dw_1(inputs)
x1 = self._conv_linear_1(x1)
x2 = self._conv_pw_2(inputs)
x2 = self._conv_dw_2(x2)
x2 = self._conv_linear_2(x2)
out = paddle.concat([x1, x2], axis=1)
return channel_shuffle(out, 2)
@register
@serializable
class ShuffleNetV2(nn.Layer):
def __init__(self, scale=1.0, act="relu", feature_maps=[5, 13, 17]):
super(ShuffleNetV2, self).__init__()
self.scale = scale
if isinstance(feature_maps, Integral):
feature_maps = [feature_maps]
self.feature_maps = feature_maps
stage_repeats = [4, 8, 4]
if scale == 0.25:
stage_out_channels = [-1, 24, 24, 48, 96, 512]
elif scale == 0.33:
stage_out_channels = [-1, 24, 32, 64, 128, 512]
elif scale == 0.5:
stage_out_channels = [-1, 24, 48, 96, 192, 1024]
elif scale == 1.0:
stage_out_channels = [-1, 24, 116, 232, 464, 1024]
elif scale == 1.5:
stage_out_channels = [-1, 24, 176, 352, 704, 1024]
elif scale == 2.0:
stage_out_channels = [-1, 24, 244, 488, 976, 2048]
else:
raise NotImplementedError("This scale size:[" + str(scale) +
"] is not implemented!")
self._out_channels = []
self._feature_idx = 0
# 1. conv1
self._conv1 = ConvBNLayer(
in_channels=3,
out_channels=stage_out_channels[1],
kernel_size=3,
stride=2,
padding=1,
act=act)
self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
self._feature_idx += 1
# 2. bottleneck sequences
self._block_list = []
for stage_id, num_repeat in enumerate(stage_repeats):
for i in range(num_repeat):
if i == 0:
block = self.add_sublayer(
name=str(stage_id + 2) + '_' + str(i + 1),
sublayer=InvertedResidualDS(
in_channels=stage_out_channels[stage_id + 1],
out_channels=stage_out_channels[stage_id + 2],
stride=2,
act=act))
else:
block = self.add_sublayer(
name=str(stage_id + 2) + '_' + str(i + 1),
sublayer=InvertedResidual(
in_channels=stage_out_channels[stage_id + 2],
out_channels=stage_out_channels[stage_id + 2],
stride=1,
act=act))
self._block_list.append(block)
self._feature_idx += 1
self._update_out_channels(stage_out_channels[stage_id + 2],
self._feature_idx, self.feature_maps)
def _update_out_channels(self, channel, feature_idx, feature_maps):
if feature_idx in feature_maps:
self._out_channels.append(channel)
def forward(self, inputs):
y = self._conv1(inputs['image'])
y = self._max_pool(y)
outs = []
for i, inv in enumerate(self._block_list):
y = inv(y)
if i + 2 in self.feature_maps:
outs.append(y)
return outs
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self._out_channels]
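# --- Editor's sketch (not part of the original file): what the imported
# channel_shuffle op does, written with plain reshape/transpose so the
# group interleaving is visible.
if __name__ == '__main__':
    def shuffle(x, groups):
        n, c, h, w = x.shape
        x = x.reshape([n, groups, c // groups, h, w])
        x = x.transpose([0, 2, 1, 3, 4])  # move the group axis inward
        return x.reshape([n, c, h, w])

    x = paddle.arange(8, dtype='float32').reshape([1, 8, 1, 1])
    print(shuffle(x, 2).flatten().tolist())  # [0, 4, 1, 5, 2, 6, 3, 7]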
================================================
FILE: ppdet/modeling/backbones/swin_transformer.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
The copyright of microsoft/Swin-Transformer is as follows:
MIT License [see LICENSE for details]
"""
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.modeling.shape_spec import ShapeSpec
from ppdet.core.workspace import register, serializable
from .transformer_utils import DropPath, Identity
from .transformer_utils import add_parameter, to_2tuple
from .transformer_utils import ones_, zeros_, trunc_normal_
__all__ = ['SwinTransformer']
MODEL_cfg = {
# use 22k-to-1k finetuned weights as the default pretrained model; can be overridden via SwinTransformer.pretrained in the config
'swin_T_224': dict(
pretrain_img_size=224,
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams',
),
'swin_S_224': dict(
pretrain_img_size=224,
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_small_patch4_window7_224_22kto1k_pretrained.pdparams',
),
'swin_B_224': dict(
pretrain_img_size=224,
embed_dim=128,
depths=[2, 2, 18, 2],
num_heads=[4, 8, 16, 32],
window_size=7,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams',
),
'swin_L_224': dict(
pretrain_img_size=224,
embed_dim=192,
depths=[2, 2, 18, 2],
num_heads=[6, 12, 24, 48],
window_size=7,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams',
),
'swin_B_384': dict(
pretrain_img_size=384,
embed_dim=128,
depths=[2, 2, 18, 2],
num_heads=[4, 8, 16, 32],
window_size=12,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams',
),
'swin_L_384': dict(
pretrain_img_size=384,
embed_dim=192,
depths=[2, 2, 18, 2],
num_heads=[6, 12, 24, 48],
window_size=12,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams',
),
}
class Mlp(nn.Layer):
def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
def window_partition(x, window_size):
"""
Args:
x: (B, H, W, C)
window_size (int): window size
Returns:
windows: (num_windows*B, window_size, window_size, C)
"""
B, H, W, C = x.shape
x = x.reshape(
[-1, H // window_size, window_size, W // window_size, window_size, C])
windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape(
[-1, window_size, window_size, C])
return windows
def window_reverse(windows, window_size, H, W):
"""
Args:
windows: (num_windows*B, window_size, window_size, C)
window_size (int): Window size
H (int): Height of image
W (int): Width of image
Returns:
x: (B, H, W, C)
"""
_, _, _, C = windows.shape
B = int(windows.shape[0] / (H * W / window_size / window_size))
x = windows.reshape(
[-1, H // window_size, W // window_size, window_size, window_size, C])
x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C])
return x
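# --- Editor's note (not part of the original file): a quick self-check that
# window_reverse inverts window_partition when H and W are multiples of the
# window size; SwinTransformerBlock pads its input to guarantee this.
if __name__ == '__main__':
    x = paddle.randn([2, 14, 14, 32])
    w = window_partition(x, 7)        # [2 * 2 * 2, 7, 7, 32]
    y = window_reverse(w, 7, 14, 14)  # back to [2, 14, 14, 32]
    print(w.shape, bool((x == y).all()))  # [8, 7, 7, 32] True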
class WindowAttention(nn.Layer):
""" Window based multi-head self attention (W-MSA) module with relative position bias.
It supports both of shifted and non-shifted window.
Args:
dim (int): Number of input channels.
window_size (tuple[int]): The height and width of the window.
num_heads (int): Number of attention heads.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
"""
def __init__(self,
dim,
window_size,
num_heads,
qkv_bias=True,
qk_scale=None,
attn_drop=0.,
proj_drop=0.):
super().__init__()
self.dim = dim
self.window_size = window_size # Wh, Ww
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim**-0.5
# define a parameter table of relative position bias
self.relative_position_bias_table = add_parameter(
self,
paddle.zeros(((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
num_heads))) # 2*Wh-1 * 2*Ww-1, nH
# get pair-wise relative position index for each token inside the window
coords_h = paddle.arange(self.window_size[0])
coords_w = paddle.arange(self.window_size[1])
coords = paddle.stack(paddle.meshgrid(
[coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww
coords_flatten_1 = coords_flatten.unsqueeze(axis=2)
coords_flatten_2 = coords_flatten.unsqueeze(axis=1)
relative_coords = coords_flatten_1 - coords_flatten_2
relative_coords = relative_coords.transpose(
[1, 2, 0]) # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += self.window_size[
0] - 1 # shift to start from 0
relative_coords[:, :, 1] += self.window_size[1] - 1
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
self.relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
trunc_normal_(self.relative_position_bias_table)
self.softmax = nn.Softmax(axis=-1)
def forward(self, x, mask=None):
""" Forward function.
Args:
x: input features with shape of (num_windows*B, N, C)
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
"""
B_, N, C = x.shape
qkv = self.qkv(x).reshape(
[-1, N, 3, self.num_heads, C // self.num_heads]).transpose(
[2, 0, 3, 1, 4])
q, k, v = qkv[0], qkv[1], qkv[2]
q = q * self.scale
attn = paddle.mm(q, k.transpose([0, 1, 3, 2]))
index = self.relative_position_index.flatten()
relative_position_bias = paddle.index_select(
self.relative_position_bias_table, index)
relative_position_bias = relative_position_bias.reshape([
self.window_size[0] * self.window_size[1],
self.window_size[0] * self.window_size[1], -1
]) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.transpose(
[2, 0, 1]) # nH, Wh*Ww, Wh*Ww
attn = attn + relative_position_bias.unsqueeze(0)
if mask is not None:
nW = mask.shape[0]
attn = attn.reshape([-1, nW, self.num_heads, N, N
]) + mask.unsqueeze(1).unsqueeze(0)
attn = attn.reshape([-1, self.num_heads, N, N])
attn = self.softmax(attn)
else:
attn = self.softmax(attn)
attn = self.attn_drop(attn)
# x = (attn @ v).transpose(1, 2).reshape([B_, N, C])
x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C])
x = self.proj(x)
x = self.proj_drop(x)
return x
class SwinTransformerBlock(nn.Layer):
""" Swin Transformer Block.
Args:
dim (int): Number of input channels.
num_heads (int): Number of attention heads.
window_size (int): Window size.
shift_size (int): Shift size for SW-MSA.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float, optional): Stochastic depth rate. Default: 0.0
act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
"""
def __init__(self,
dim,
num_heads,
window_size=7,
shift_size=0,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm):
super().__init__()
self.dim = dim
self.num_heads = num_heads
self.window_size = window_size
self.shift_size = shift_size
self.mlp_ratio = mlp_ratio
assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
self.norm1 = norm_layer(dim)
self.attn = WindowAttention(
dim,
window_size=to_2tuple(self.window_size),
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop)
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)
self.H = None
self.W = None
def forward(self, x, mask_matrix):
""" Forward function.
Args:
x: Input feature, tensor size (B, H*W, C).
H, W: Spatial resolution of the input feature.
mask_matrix: Attention mask for cyclic shift.
"""
B, L, C = x.shape
H, W = self.H, self.W
assert L == H * W, "input feature has wrong size"
shortcut = x
x = self.norm1(x)
x = x.reshape([-1, H, W, C])
# pad feature maps to multiples of window size
pad_l = pad_t = 0
pad_r = (self.window_size - W % self.window_size) % self.window_size
pad_b = (self.window_size - H % self.window_size) % self.window_size
x = F.pad(x, [0, pad_l, 0, pad_b, 0, pad_r, 0, pad_t],
data_format='NHWC')
_, Hp, Wp, _ = x.shape
# cyclic shift
if self.shift_size > 0:
shifted_x = paddle.roll(
x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2))
attn_mask = mask_matrix
else:
shifted_x = x
attn_mask = None
# partition windows
x_windows = window_partition(
shifted_x, self.window_size) # nW*B, window_size, window_size, C
x_windows = x_windows.reshape(
[x_windows.shape[0], self.window_size * self.window_size,
C]) # nW*B, window_size*window_size, C
# W-MSA/SW-MSA
attn_windows = self.attn(
x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
# merge windows
attn_windows = attn_windows.reshape(
[x_windows.shape[0], self.window_size, self.window_size, C])
shifted_x = window_reverse(attn_windows, self.window_size, Hp,
Wp) # B H' W' C
# reverse cyclic shift
if self.shift_size > 0:
x = paddle.roll(
shifted_x,
shifts=(self.shift_size, self.shift_size),
axis=(1, 2))
else:
x = shifted_x
if pad_r > 0 or pad_b > 0:
x = x[:, :H, :W, :]
x = x.reshape([-1, H * W, C])
# FFN
x = shortcut + self.drop_path(x)
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
class PatchMerging(nn.Layer):
r""" Patch Merging Layer.
Args:
dim (int): Number of input channels.
norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
"""
def __init__(self, dim, norm_layer=nn.LayerNorm):
super().__init__()
self.dim = dim
self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False)
self.norm = norm_layer(4 * dim)
def forward(self, x, H, W):
""" Forward function.
Args:
x: Input feature, tensor size (B, H*W, C).
H, W: Spatial resolution of the input feature.
"""
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
x = x.reshape([-1, H, W, C])
# padding
pad_input = (H % 2 == 1) or (W % 2 == 1)
if pad_input:
# paddle F.pad default data_format is 'NCHW'
x = F.pad(x, [0, 0, 0, H % 2, 0, W % 2, 0, 0], data_format='NHWC')
H += H % 2
W += W % 2
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
x = x.reshape([-1, H * W // 4, 4 * C]) # B H/2*W/2 4*C
x = self.norm(x)
x = self.reduction(x)
return x
class BasicLayer(nn.Layer):
""" A basic Swin Transformer layer for one stage.
Args:
dim (int): Number of input channels.
depth (int): Number of blocks.
num_heads (int): Number of attention heads.
window_size (int): Local window size.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
"""
def __init__(self,
dim,
depth,
num_heads,
window_size=7,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
norm_layer=nn.LayerNorm,
downsample=None):
super().__init__()
self.window_size = window_size
self.shift_size = window_size // 2
self.depth = depth
# build blocks
self.blocks = nn.LayerList([
SwinTransformerBlock(
dim=dim,
num_heads=num_heads,
window_size=window_size,
shift_size=0 if (i % 2 == 0) else window_size // 2,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop,
attn_drop=attn_drop,
drop_path=drop_path[i]
if isinstance(drop_path, np.ndarray) else drop_path,
norm_layer=norm_layer) for i in range(depth)
])
# patch merging layer
if downsample is not None:
self.downsample = downsample(dim=dim, norm_layer=norm_layer)
else:
self.downsample = None
def forward(self, x, H, W):
""" Forward function.
Args:
x: Input feature, tensor size (B, H*W, C).
H, W: Spatial resolution of the input feature.
"""
# calculate attention mask for SW-MSA
Hp = int(np.ceil(H / self.window_size)) * self.window_size
Wp = int(np.ceil(W / self.window_size)) * self.window_size
img_mask = paddle.zeros([1, Hp, Wp, 1], dtype='float32') # 1 Hp Wp 1
h_slices = (slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None))
w_slices = (slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None))
cnt = 0
for h in h_slices:
for w in w_slices:
img_mask[:, h, w, :] = cnt
cnt += 1
mask_windows = window_partition(
img_mask, self.window_size) # nW, window_size, window_size, 1
mask_windows = mask_windows.reshape(
[-1, self.window_size * self.window_size])
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
huns = -100.0 * paddle.ones_like(attn_mask)
attn_mask = huns * (attn_mask != 0).astype("float32")
for blk in self.blocks:
blk.H, blk.W = H, W
x = blk(x, attn_mask)
if self.downsample is not None:
x_down = self.downsample(x, H, W)
Wh, Ww = (H + 1) // 2, (W + 1) // 2
return x, H, W, x_down, Wh, Ww
else:
return x, H, W, x, H, W
class PatchEmbed(nn.Layer):
""" Image to Patch Embedding
Args:
patch_size (int): Patch token size. Default: 4.
in_chans (int): Number of input image channels. Default: 3.
embed_dim (int): Number of linear projection output channels. Default: 96.
norm_layer (nn.Layer, optional): Normalization layer. Default: None
"""
def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
super().__init__()
patch_size = to_2tuple(patch_size)
self.patch_size = patch_size
self.in_chans = in_chans
self.embed_dim = embed_dim
self.proj = nn.Conv2D(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
if norm_layer is not None:
self.norm = norm_layer(embed_dim)
else:
self.norm = None
def forward(self, x):
# TODO # export dynamic shape
B, C, H, W = x.shape
# assert [H, W] == self.img_size[:2], "Input image size ({H}*{W}) doesn't match model ({}*{}).".format(H, W, self.img_size[0], self.img_size[1])
if W % self.patch_size[1] != 0:
x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0])
if H % self.patch_size[0] != 0:
x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]])
x = self.proj(x)
if self.norm is not None:
_, _, Wh, Ww = x.shape
x = x.flatten(2).transpose([0, 2, 1])
x = self.norm(x)
x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww])
return x
@register
@serializable
class SwinTransformer(nn.Layer):
""" Swin Transformer backbone
Args:
arch (str): Architecture of Swin Transformer, one of the keys of MODEL_cfg. Default: 'swin_T_224'
pretrain_img_size (int | tuple(int)): Input image size. Default 224
patch_size (int | tuple(int)): Patch size. Default: 4
in_chans (int): Number of input image channels. Default: 3
embed_dim (int): Patch embedding dimension. Default: 96
depths (tuple(int)): Depth of each Swin Transformer layer.
num_heads (tuple(int)): Number of attention heads in different layers.
window_size (int): Window size. Default: 7
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
drop_rate (float): Dropout rate. Default: 0
attn_drop_rate (float): Attention dropout rate. Default: 0
drop_path_rate (float): Stochastic depth rate. Default: 0.1
norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
patch_norm (bool): If True, add normalization after patch embedding. Default: True
"""
def __init__(self,
arch='swin_T_224',
pretrain_img_size=224,
patch_size=4,
in_chans=3,
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.2,
norm_layer=nn.LayerNorm,
ape=False,
patch_norm=True,
out_indices=(0, 1, 2, 3),
frozen_stages=-1,
pretrained=None):
super(SwinTransformer, self).__init__()
assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch)
pretrain_img_size = MODEL_cfg[arch]['pretrain_img_size']
embed_dim = MODEL_cfg[arch]['embed_dim']
depths = MODEL_cfg[arch]['depths']
num_heads = MODEL_cfg[arch]['num_heads']
window_size = MODEL_cfg[arch]['window_size']
if pretrained is None:
pretrained = MODEL_cfg[arch]['pretrained']
self.num_layers = len(depths)
self.ape = ape
self.patch_norm = patch_norm
self.out_indices = out_indices
self.frozen_stages = frozen_stages
# split image into non-overlapping patches
self.patch_embed = PatchEmbed(
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
norm_layer=norm_layer if self.patch_norm else None)
# absolute position embedding
if self.ape:
pretrain_img_size = to_2tuple(pretrain_img_size)
patch_size = to_2tuple(patch_size)
patches_resolution = [
pretrain_img_size[0] // patch_size[0],
pretrain_img_size[1] // patch_size[1]
]
self.absolute_pos_embed = add_parameter(
self,
paddle.zeros((1, embed_dim, patches_resolution[0],
patches_resolution[1])))
trunc_normal_(self.absolute_pos_embed)
self.pos_drop = nn.Dropout(p=drop_rate)
# stochastic depth
dpr = np.linspace(0, drop_path_rate,
sum(depths)) # stochastic depth decay rule
# build layers
self.layers = nn.LayerList()
for i_layer in range(self.num_layers):
layer = BasicLayer(
dim=int(embed_dim * 2**i_layer),
depth=depths[i_layer],
num_heads=num_heads[i_layer],
window_size=window_size,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
norm_layer=norm_layer,
downsample=PatchMerging
if (i_layer < self.num_layers - 1) else None)
self.layers.append(layer)
num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
self.num_features = num_features
# add a norm layer for each output
for i_layer in out_indices:
layer = norm_layer(num_features[i_layer])
layer_name = f'norm{i_layer}'
self.add_sublayer(layer_name, layer)
self.apply(self._init_weights)
self._freeze_stages()
if pretrained:
if 'http' in pretrained: #URL
path = paddle.utils.download.get_weights_path_from_url(
pretrained)
else: #model in local path
path = pretrained
self.set_state_dict(paddle.load(path))
def _freeze_stages(self):
if self.frozen_stages >= 0:
self.patch_embed.eval()
for param in self.patch_embed.parameters():
param.stop_gradient = True
if self.frozen_stages >= 1 and self.ape:
self.absolute_pos_embed.stop_gradient = True
if self.frozen_stages >= 2:
self.pos_drop.eval()
for i in range(0, self.frozen_stages - 1):
m = self.layers[i]
m.eval()
for param in m.parameters():
param.stop_gradient = True
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight)
if isinstance(m, nn.Linear) and m.bias is not None:
zeros_(m.bias)
elif isinstance(m, nn.LayerNorm):
zeros_(m.bias)
ones_(m.weight)
def forward(self, x):
"""Forward function."""
x = self.patch_embed(x['image'])
B, _, Wh, Ww = x.shape
if self.ape:
# interpolate the position embedding to the corresponding size
absolute_pos_embed = F.interpolate(
self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1])
else:
x = x.flatten(2).transpose([0, 2, 1])
x = self.pos_drop(x)
outs = []
for i in range(self.num_layers):
layer = self.layers[i]
x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
if i in self.out_indices:
norm_layer = getattr(self, f'norm{i}')
x_out = norm_layer(x_out)
out = x_out.reshape((-1, H, W, self.num_features[i])).transpose(
(0, 3, 1, 2))
outs.append(out)
return outs
@property
def out_shape(self):
out_strides = [4, 8, 16, 32]
return [
ShapeSpec(
channels=self.num_features[i], stride=out_strides[i])
for i in self.out_indices
]
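# --- Editor's sketch (not part of the original file): minimal usage. Passing
# pretrained='' skips the default weight download that pretrained=None would
# trigger (both load branches above are falsy); numbers are for swin_T_224.
if __name__ == '__main__':
    model = SwinTransformer(arch='swin_T_224', out_indices=(1, 2, 3),
                            pretrained='')
    feats = model({'image': paddle.randn([1, 3, 224, 224])})
    print([f.shape for f in feats])
    # [1, 192, 28, 28], [1, 384, 14, 14], [1, 768, 7, 7]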
================================================
FILE: ppdet/modeling/backbones/trans_encoder.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import ReLU, Swish, GELU
import math
from ppdet.core.workspace import register
from ..shape_spec import ShapeSpec
__all__ = ['TransEncoder']
class BertEmbeddings(nn.Layer):
def __init__(self, word_size, position_embeddings_size, word_type_size,
hidden_size, dropout_prob):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(
word_size, hidden_size, padding_idx=0)
self.position_embeddings = nn.Embedding(position_embeddings_size,
hidden_size)
self.token_type_embeddings = nn.Embedding(word_type_size, hidden_size)
self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
self.dropout = nn.Dropout(dropout_prob)
def forward(self, x, token_type_ids=None, position_ids=None):
seq_len = x.shape[1]
if position_ids is None:
position_ids = paddle.arange(seq_len).unsqueeze(0).expand_as(x)
if token_type_ids is None:
            token_type_ids = paddle.zeros(x.shape, dtype="int64")
word_embs = self.word_embeddings(x)
position_embs = self.position_embeddings(position_ids)
token_type_embs = self.token_type_embeddings(token_type_ids)
embs_cmb = word_embs + position_embs + token_type_embs
embs_out = self.layernorm(embs_cmb)
embs_out = self.dropout(embs_out)
return embs_out
class BertSelfAttention(nn.Layer):
def __init__(self,
hidden_size,
num_attention_heads,
attention_probs_dropout_prob,
output_attentions=False):
super(BertSelfAttention, self).__init__()
if hidden_size % num_attention_heads != 0:
            raise ValueError(
                "The hidden_size must be a multiple of the number of attention "
                "heads, but got {} % {} != 0".format(hidden_size,
                                                     num_attention_heads))
self.num_attention_heads = num_attention_heads
self.attention_head_size = int(hidden_size / num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(hidden_size, self.all_head_size)
self.key = nn.Linear(hidden_size, self.all_head_size)
self.value = nn.Linear(hidden_size, self.all_head_size)
self.dropout = nn.Dropout(attention_probs_dropout_prob)
self.output_attentions = output_attentions
def forward(self, x, attention_mask, head_mask=None):
query = self.query(x)
key = self.key(x)
value = self.value(x)
query_dim1, query_dim2 = query.shape[:-1]
new_shape = [
query_dim1, query_dim2, self.num_attention_heads,
self.attention_head_size
]
query = query.reshape(new_shape).transpose(perm=(0, 2, 1, 3))
key = key.reshape(new_shape).transpose(perm=(0, 2, 3, 1))
value = value.reshape(new_shape).transpose(perm=(0, 2, 1, 3))
attention = paddle.matmul(query,
key) / math.sqrt(self.attention_head_size)
attention = attention + attention_mask
attention_value = F.softmax(attention, axis=-1)
attention_value = self.dropout(attention_value)
if head_mask is not None:
attention_value = attention_value * head_mask
context = paddle.matmul(attention_value, value).transpose(perm=(0, 2, 1,
3))
ctx_dim1, ctx_dim2 = context.shape[:-2]
new_context_shape = [
ctx_dim1,
ctx_dim2,
self.all_head_size,
]
context = context.reshape(new_context_shape)
if self.output_attentions:
return (context, attention_value)
else:
return (context, )
class BertAttention(nn.Layer):
def __init__(self,
hidden_size,
num_attention_heads,
attention_probs_dropout_prob,
fc_dropout_prob,
output_attentions=False):
super(BertAttention, self).__init__()
self.bert_selfattention = BertSelfAttention(
hidden_size, num_attention_heads, attention_probs_dropout_prob,
output_attentions)
self.fc = nn.Linear(hidden_size, hidden_size)
self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
self.dropout = nn.Dropout(fc_dropout_prob)
def forward(self, x, attention_mask, head_mask=None):
attention_feats = self.bert_selfattention(x, attention_mask, head_mask)
features = self.fc(attention_feats[0])
features = self.dropout(features)
features = self.layernorm(features + x)
if len(attention_feats) == 2:
return (features, attention_feats[1])
else:
return (features, )
class BertFeedForward(nn.Layer):
def __init__(self,
hidden_size,
intermediate_size,
num_attention_heads,
attention_probs_dropout_prob,
fc_dropout_prob,
act_fn='ReLU',
output_attentions=False):
super(BertFeedForward, self).__init__()
self.fc1 = nn.Linear(hidden_size, intermediate_size)
        act = eval(act_fn)
        # act_fn may name a layer class (e.g. 'ReLU') or a function (e.g. 'gelu');
        # instantiate classes so that calling self.act_fn(x) works in both cases
        self.act_fn = act() if isinstance(act, type) else act
self.fc2 = nn.Linear(intermediate_size, hidden_size)
self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
self.dropout = nn.Dropout(fc_dropout_prob)
def forward(self, x):
features = self.fc1(x)
features = self.act_fn(features)
features = self.fc2(features)
features = self.dropout(features)
features = self.layernorm(features + x)
return features
class BertLayer(nn.Layer):
def __init__(self,
hidden_size,
intermediate_size,
num_attention_heads,
attention_probs_dropout_prob,
fc_dropout_prob,
act_fn='ReLU',
output_attentions=False):
super(BertLayer, self).__init__()
        self.attention = BertAttention(hidden_size, num_attention_heads,
                                       attention_probs_dropout_prob,
                                       fc_dropout_prob, output_attentions)
self.feed_forward = BertFeedForward(
hidden_size, intermediate_size, num_attention_heads,
attention_probs_dropout_prob, fc_dropout_prob, act_fn,
output_attentions)
def forward(self, x, attention_mask, head_mask=None):
attention_feats = self.attention(x, attention_mask, head_mask)
features = self.feed_forward(attention_feats[0])
if len(attention_feats) == 2:
return (features, attention_feats[1])
else:
return (features, )
class BertEncoder(nn.Layer):
def __init__(self,
num_hidden_layers,
hidden_size,
intermediate_size,
num_attention_heads,
attention_probs_dropout_prob,
fc_dropout_prob,
act_fn='ReLU',
output_attentions=False,
output_hidden_feats=False):
super(BertEncoder, self).__init__()
self.output_attentions = output_attentions
self.output_hidden_feats = output_hidden_feats
self.layers = nn.LayerList([
BertLayer(hidden_size, intermediate_size, num_attention_heads,
attention_probs_dropout_prob, fc_dropout_prob, act_fn,
output_attentions) for _ in range(num_hidden_layers)
])
def forward(self, x, attention_mask, head_mask=None):
all_features = (x, )
all_attentions = ()
for i, layer in enumerate(self.layers):
mask = head_mask[i] if head_mask is not None else None
layer_out = layer(x, attention_mask, mask)
if self.output_hidden_feats:
all_features = all_features + (x, )
x = layer_out[0]
if self.output_attentions:
all_attentions = all_attentions + (layer_out[1], )
outputs = (x, )
if self.output_hidden_feats:
outputs += (all_features, )
if self.output_attentions:
outputs += (all_attentions, )
return outputs
class BertPooler(nn.Layer):
def __init__(self, hidden_size):
super(BertPooler, self).__init__()
self.fc = nn.Linear(hidden_size, hidden_size)
self.act = nn.Tanh()
def forward(self, x):
first_token = x[:, 0]
pooled_output = self.fc(first_token)
pooled_output = self.act(pooled_output)
return pooled_output
class METROEncoder(nn.Layer):
def __init__(self,
vocab_size,
num_hidden_layers,
features_dims,
position_embeddings_size,
hidden_size,
intermediate_size,
output_feature_dim,
num_attention_heads,
attention_probs_dropout_prob,
fc_dropout_prob,
act_fn='ReLU',
output_attentions=False,
output_hidden_feats=False,
use_img_layernorm=False):
super(METROEncoder, self).__init__()
self.img_dims = features_dims
self.num_hidden_layers = num_hidden_layers
self.use_img_layernorm = use_img_layernorm
        self.output_attentions = output_attentions
        self.output_hidden_feats = output_hidden_feats
self.embedding = BertEmbeddings(vocab_size, position_embeddings_size, 2,
hidden_size, fc_dropout_prob)
self.encoder = BertEncoder(
num_hidden_layers, hidden_size, intermediate_size,
num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob,
act_fn, output_attentions, output_hidden_feats)
self.pooler = BertPooler(hidden_size)
self.position_embeddings = nn.Embedding(position_embeddings_size,
hidden_size)
self.img_embedding = nn.Linear(
features_dims, hidden_size, bias_attr=True)
        self.dropout = nn.Dropout(fc_dropout_prob)
        if use_img_layernorm:
            # forward() references self.layernorm when use_img_layernorm is set
            self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
self.cls_head = nn.Linear(hidden_size, output_feature_dim)
self.residual = nn.Linear(features_dims, output_feature_dim)
self.apply(self.init_weights)
def init_weights(self, module):
""" Initialize the weights.
"""
if isinstance(module, (nn.Linear, nn.Embedding)):
module.weight.set_value(
paddle.normal(
mean=0.0, std=0.02, shape=module.weight.shape))
elif isinstance(module, nn.LayerNorm):
module.bias.set_value(paddle.zeros(shape=module.bias.shape))
module.weight.set_value(
paddle.full(
shape=module.weight.shape, fill_value=1.0))
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.set_value(paddle.zeros(shape=module.bias.shape))
def forward(self, x):
batchsize, seq_len = x.shape[:2]
input_ids = paddle.zeros((batchsize, seq_len), dtype="int64")
position_ids = paddle.arange(
seq_len, dtype="int64").unsqueeze(0).expand_as(input_ids)
        attention_mask = paddle.ones_like(
            input_ids, dtype="float32").unsqueeze(1).unsqueeze(2)
head_mask = [None] * self.num_hidden_layers
position_embs = self.position_embeddings(position_ids)
attention_mask = (1.0 - attention_mask) * -10000.0
img_features = self.img_embedding(x)
# We empirically observe that adding an additional learnable position embedding leads to more stable training
embeddings = position_embs + img_features
if self.use_img_layernorm:
embeddings = self.layernorm(embeddings)
embeddings = self.dropout(embeddings)
encoder_outputs = self.encoder(
embeddings, attention_mask, head_mask=head_mask)
pred_score = self.cls_head(encoder_outputs[0])
res_img_feats = self.residual(x)
pred_score = pred_score + res_img_feats
if self.output_attentions and self.output_hidden_feats:
return pred_score, encoder_outputs[1], encoder_outputs[-1]
else:
return pred_score
def gelu(x):
"""Implementation of the gelu activation function.
https://arxiv.org/abs/1606.08415
"""
return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0)))
@register
class TransEncoder(nn.Layer):
def __init__(self,
vocab_size=30522,
num_hidden_layers=4,
num_attention_heads=4,
position_embeddings_size=512,
intermediate_size=3072,
input_feat_dim=[2048, 512, 128],
hidden_feat_dim=[1024, 256, 128],
attention_probs_dropout_prob=0.1,
fc_dropout_prob=0.1,
act_fn='gelu',
output_attentions=False,
output_hidden_feats=False):
super(TransEncoder, self).__init__()
output_feat_dim = input_feat_dim[1:] + [3]
trans_encoder = []
for i in range(len(output_feat_dim)):
features_dims = input_feat_dim[i]
output_feature_dim = output_feat_dim[i]
hidden_size = hidden_feat_dim[i]
# init a transformer encoder and append it to a list
assert hidden_size % num_attention_heads == 0
model = METROEncoder(vocab_size, num_hidden_layers, features_dims,
position_embeddings_size, hidden_size,
intermediate_size, output_feature_dim,
num_attention_heads,
attention_probs_dropout_prob, fc_dropout_prob,
act_fn, output_attentions, output_hidden_feats)
trans_encoder.append(model)
self.trans_encoder = paddle.nn.Sequential(*trans_encoder)
def forward(self, x):
out = self.trans_encoder(x)
return out
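# Illustrative sketch (pure bookkeeping, assumed default dims): how the loop
# in TransEncoder.__init__ chains the stacked METROEncoders -- each stage
# consumes the previous stage's output dimension and the last stage emits 3.
def _dim_chain_demo(input_feat_dim=[2048, 512, 128],
                    hidden_feat_dim=[1024, 256, 128]):
    output_feat_dim = input_feat_dim[1:] + [3]
    for i, (f, h, o) in enumerate(
            zip(input_feat_dim, hidden_feat_dim, output_feat_dim)):
        print("stage {}: features {} -> hidden {} -> output {}".format(
            i, f, h, o))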
================================================
FILE: ppdet/modeling/backbones/transformer_utils.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import TruncatedNormal, Constant, Assign
# Common initializations
ones_ = Constant(value=1.)
zeros_ = Constant(value=0.)
trunc_normal_ = TruncatedNormal(std=.02)
# Common Layers
def drop_path(x, drop_prob=0., training=False):
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
"""
if drop_prob == 0. or not training:
return x
keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)
shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
random_tensor = paddle.floor(random_tensor) # binarize
output = x.divide(keep_prob) * random_tensor
return output
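# Illustrative sketch: with drop_prob=0.5 each sample is either zeroed or
# rescaled by 1 / keep_prob = 2, so the expected activation is preserved.
def _drop_path_demo():
    paddle.seed(0)
    x = paddle.ones([4, 3])
    print(drop_path(x, drop_prob=0.5, training=True))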
class DropPath(nn.Layer):
def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
def forward(self, x):
return drop_path(x, self.drop_prob, self.training)
class Identity(nn.Layer):
def __init__(self):
super(Identity, self).__init__()
def forward(self, input):
return input
# common funcs
def to_2tuple(x):
if isinstance(x, (list, tuple)):
return x
return tuple([x] * 2)
def add_parameter(layer, datas, name=None):
parameter = layer.create_parameter(
shape=(datas.shape), default_initializer=Assign(datas))
if name:
layer.add_parameter(name, parameter)
return parameter
def window_partition(x, window_size):
"""
Partition into non-overlapping windows with padding if needed.
Args:
x (tensor): input tokens with [B, H, W, C].
window_size (int): window size.
Returns:
windows: windows after partition with [B * num_windows, window_size, window_size, C].
(Hp, Wp): padded height and width before partition
"""
B, H, W, C = x.shape
pad_h = (window_size - H % window_size) % window_size
pad_w = (window_size - W % window_size) % window_size
x = F.pad(x.transpose([0, 3, 1, 2]),
paddle.to_tensor(
[0, int(pad_w), 0, int(pad_h)],
dtype='int32')).transpose([0, 2, 3, 1])
Hp, Wp = H + pad_h, W + pad_w
num_h, num_w = Hp // window_size, Wp // window_size
x = x.reshape([B, num_h, window_size, num_w, window_size, C])
windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape(
[-1, window_size, window_size, C])
return windows, (Hp, Wp), (num_h, num_w)
def window_unpartition(x, pad_hw, num_hw, hw):
"""
Window unpartition into original sequences and removing padding.
Args:
x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
pad_hw (Tuple): padded height and width (Hp, Wp).
hw (Tuple): original height and width (H, W) before padding.
Returns:
x: unpartitioned sequences with [B, H, W, C].
"""
Hp, Wp = pad_hw
num_h, num_w = num_hw
H, W = hw
B, window_size, _, C = x.shape
B = B // (num_h * num_w)
x = x.reshape([B, num_h, num_w, window_size, window_size, C])
x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, Hp, Wp, C])
return x[:, :H, :W, :]
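# Illustrative round-trip check (assumed toy shapes): partitioning and then
# unpartitioning recovers the input, including the padded case W=10.
def _window_roundtrip_demo():
    x = paddle.rand([2, 14, 10, 8])
    windows, pad_hw, num_hw = window_partition(x, window_size=7)
    y = window_unpartition(windows, pad_hw, num_hw, hw=(14, 10))
    print(bool(paddle.allclose(x, y)))  # True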
================================================
FILE: ppdet/modeling/backbones/vgg.py
================================================
from __future__ import division
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn import Conv2D, MaxPool2D
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
__all__ = ['VGG']
VGG_cfg = {16: [2, 2, 3, 3, 3], 19: [2, 2, 4, 4, 4]}
class ConvBlock(nn.Layer):
def __init__(self,
in_channels,
out_channels,
groups,
pool_size=2,
pool_stride=2,
pool_padding=0,
name=None):
super(ConvBlock, self).__init__()
self.groups = groups
self.conv0 = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
stride=1,
padding=1)
self.conv_out_list = []
for i in range(1, groups):
conv_out = self.add_sublayer(
'conv{}'.format(i),
Conv2D(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
stride=1,
padding=1))
self.conv_out_list.append(conv_out)
self.pool = MaxPool2D(
kernel_size=pool_size,
stride=pool_stride,
padding=pool_padding,
ceil_mode=True)
def forward(self, inputs):
out = self.conv0(inputs)
out = F.relu(out)
for conv_i in self.conv_out_list:
out = conv_i(out)
out = F.relu(out)
pool = self.pool(out)
return out, pool
class ExtraBlock(nn.Layer):
def __init__(self,
in_channels,
mid_channels,
out_channels,
padding,
stride,
kernel_size,
name=None):
super(ExtraBlock, self).__init__()
self.conv0 = Conv2D(
in_channels=in_channels,
out_channels=mid_channels,
kernel_size=1,
stride=1,
padding=0)
self.conv1 = Conv2D(
in_channels=mid_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding)
def forward(self, inputs):
out = self.conv0(inputs)
out = F.relu(out)
out = self.conv1(out)
out = F.relu(out)
return out
class L2NormScale(nn.Layer):
def __init__(self, num_channels, scale=1.0):
super(L2NormScale, self).__init__()
self.scale = self.create_parameter(
attr=ParamAttr(initializer=paddle.nn.initializer.Constant(scale)),
shape=[num_channels])
def forward(self, inputs):
out = F.normalize(inputs, axis=1, epsilon=1e-10)
# out = self.scale.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(
# out) * out
out = self.scale.unsqueeze(0).unsqueeze(2).unsqueeze(3) * out
return out
@register
@serializable
class VGG(nn.Layer):
def __init__(self,
depth=16,
normalizations=[20., -1, -1, -1, -1, -1],
extra_block_filters=[[256, 512, 1, 2, 3], [128, 256, 1, 2, 3],
[128, 256, 0, 1, 3],
[128, 256, 0, 1, 3]]):
super(VGG, self).__init__()
        assert depth in [16, 19], \
            "depth must be 16 or 19, but got {}".format(depth)
self.depth = depth
self.groups = VGG_cfg[depth]
self.normalizations = normalizations
self.extra_block_filters = extra_block_filters
self._out_channels = []
self.conv_block_0 = ConvBlock(
3, 64, self.groups[0], 2, 2, 0, name="conv1_")
self.conv_block_1 = ConvBlock(
64, 128, self.groups[1], 2, 2, 0, name="conv2_")
self.conv_block_2 = ConvBlock(
128, 256, self.groups[2], 2, 2, 0, name="conv3_")
self.conv_block_3 = ConvBlock(
256, 512, self.groups[3], 2, 2, 0, name="conv4_")
self.conv_block_4 = ConvBlock(
512, 512, self.groups[4], 3, 1, 1, name="conv5_")
self._out_channels.append(512)
self.fc6 = Conv2D(
in_channels=512,
out_channels=1024,
kernel_size=3,
stride=1,
padding=6,
dilation=6)
self.fc7 = Conv2D(
in_channels=1024,
out_channels=1024,
kernel_size=1,
stride=1,
padding=0)
self._out_channels.append(1024)
# extra block
self.extra_convs = []
last_channels = 1024
for i, v in enumerate(self.extra_block_filters):
            assert len(v) == 5, "each extra_block_filters entry must have 5 elements"
extra_conv = self.add_sublayer("conv{}".format(6 + i),
ExtraBlock(last_channels, v[0], v[1],
v[2], v[3], v[4]))
last_channels = v[1]
self.extra_convs.append(extra_conv)
self._out_channels.append(last_channels)
self.norms = []
for i, n in enumerate(self.normalizations):
if n != -1:
norm = self.add_sublayer("norm{}".format(i),
L2NormScale(
self.extra_block_filters[i][1], n))
else:
norm = None
self.norms.append(norm)
def forward(self, inputs):
outputs = []
conv, pool = self.conv_block_0(inputs['image'])
conv, pool = self.conv_block_1(pool)
conv, pool = self.conv_block_2(pool)
conv, pool = self.conv_block_3(pool)
outputs.append(conv)
conv, pool = self.conv_block_4(pool)
out = self.fc6(pool)
out = F.relu(out)
out = self.fc7(out)
out = F.relu(out)
outputs.append(out)
if not self.extra_block_filters:
return outputs
# extra block
for extra_conv in self.extra_convs:
out = extra_conv(out)
outputs.append(out)
for i, n in enumerate(self.normalizations):
if n != -1:
outputs[i] = self.norms[i](outputs[i])
return outputs
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self._out_channels]
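# Minimal usage sketch (assumed SSD-style 300x300 input): the default config
# yields six feature maps -- conv4_3, fc7 and the four extra blocks.
def _vgg_demo():
    model = VGG(depth=16)
    feats = model({'image': paddle.rand([1, 3, 300, 300])})
    print([f.shape for f in feats])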
================================================
FILE: ppdet/modeling/backbones/vision_transformer.py
================================================
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import numpy as np
from paddle.nn.initializer import Constant
from ppdet.modeling.shape_spec import ShapeSpec
from ppdet.core.workspace import register, serializable
from .transformer_utils import zeros_, DropPath, Identity
class Mlp(nn.Layer):
def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class Attention(nn.Layer):
def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.,
window_size=None):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim**-0.5
self.qkv = nn.Linear(dim, dim * 3, bias_attr=False)
if qkv_bias:
self.q_bias = self.create_parameter(
shape=([dim]), default_initializer=zeros_)
self.v_bias = self.create_parameter(
shape=([dim]), default_initializer=zeros_)
else:
self.q_bias = None
self.v_bias = None
if window_size:
self.window_size = window_size
self.num_relative_distance = (2 * window_size[0] - 1) * (
2 * window_size[1] - 1) + 3
self.relative_position_bias_table = self.create_parameter(
shape=(self.num_relative_distance, num_heads),
default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH
# cls to token & token 2 cls & cls to cls
# get pair-wise relative position index for each token inside the window
coords_h = paddle.arange(window_size[0])
coords_w = paddle.arange(window_size[1])
coords = paddle.stack(paddle.meshgrid(
[coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww
coords_flatten_1 = paddle.unsqueeze(coords_flatten, 2)
coords_flatten_2 = paddle.unsqueeze(coords_flatten, 1)
relative_coords = coords_flatten_1.clone() - coords_flatten_2.clone(
)
#relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Wh
relative_coords = relative_coords.transpose(
(1, 2, 0)) #.contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += window_size[
0] - 1 # shift to start from 0
relative_coords[:, :, 1] += window_size[1] - 1
relative_coords[:, :, 0] *= 2 * window_size[1] - 1
relative_position_index = \
paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
relative_position_index[1:, 1:] = relative_coords.sum(
-1) # Wh*Ww, Wh*Ww
relative_position_index[0, 0:] = self.num_relative_distance - 3
relative_position_index[0:, 0] = self.num_relative_distance - 2
relative_position_index[0, 0] = self.num_relative_distance - 1
self.register_buffer("relative_position_index",
relative_position_index)
# trunc_normal_(self.relative_position_bias_table, std=.0)
else:
self.window_size = None
self.relative_position_bias_table = None
self.relative_position_index = None
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x, rel_pos_bias=None):
x_shape = x.shape
N, C = x_shape[1], x_shape[2]
qkv_bias = None
if self.q_bias is not None:
qkv_bias = paddle.concat(
(self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)
qkv = qkv.reshape((-1, N, 3, self.num_heads,
C // self.num_heads)).transpose((2, 0, 3, 1, 4))
q, k, v = qkv[0], qkv[1], qkv[2]
attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale
if self.relative_position_bias_table is not None:
relative_position_bias = self.relative_position_bias_table[
self.relative_position_index.reshape([-1])].reshape([
self.window_size[0] * self.window_size[1] + 1,
self.window_size[0] * self.window_size[1] + 1, -1
]) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.transpose(
(2, 0, 1)) #.contiguous() # nH, Wh*Ww, Wh*Ww
attn = attn + relative_position_bias.unsqueeze(0)
if rel_pos_bias is not None:
attn = attn + rel_pos_bias
attn = nn.functional.softmax(attn, axis=-1)
attn = self.attn_drop(attn)
x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))
x = self.proj(x)
x = self.proj_drop(x)
return x
class Block(nn.Layer):
def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
window_size=None,
init_values=None,
act_layer=nn.GELU,
norm_layer='nn.LayerNorm',
epsilon=1e-5):
super().__init__()
self.norm1 = nn.LayerNorm(dim, epsilon=1e-6)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop,
window_size=window_size)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)
if init_values is not None:
self.gamma_1 = self.create_parameter(
shape=([dim]), default_initializer=Constant(value=init_values))
self.gamma_2 = self.create_parameter(
shape=([dim]), default_initializer=Constant(value=init_values))
else:
self.gamma_1, self.gamma_2 = None, None
def forward(self, x, rel_pos_bias=None):
if self.gamma_1 is None:
x = x + self.drop_path(
self.attn(
self.norm1(x), rel_pos_bias=rel_pos_bias))
x = x + self.drop_path(self.mlp(self.norm2(x)))
else:
x = x + self.drop_path(self.gamma_1 * self.attn(
self.norm1(x), rel_pos_bias=rel_pos_bias))
x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
return x
class PatchEmbed(nn.Layer):
""" Image to Patch Embedding
"""
def __init__(self,
img_size=[224, 224],
patch_size=16,
in_chans=3,
embed_dim=768):
super().__init__()
self.num_patches_w = img_size[0] // patch_size
self.num_patches_h = img_size[1] // patch_size
num_patches = self.num_patches_w * self.num_patches_h
self.patch_shape = (img_size[0] // patch_size,
img_size[1] // patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.num_patches = num_patches
self.proj = nn.Conv2D(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
@property
def num_patches_in_h(self):
return self.img_size[1] // self.patch_size
@property
def num_patches_in_w(self):
return self.img_size[0] // self.patch_size
def forward(self, x, mask=None):
B, C, H, W = x.shape
return self.proj(x)
class RelativePositionBias(nn.Layer):
def __init__(self, window_size, num_heads):
super().__init__()
self.window_size = window_size
self.num_relative_distance = (2 * window_size[0] - 1) * (
2 * window_size[1] - 1) + 3
        self.relative_position_bias_table = self.create_parameter(
            shape=(self.num_relative_distance, num_heads),
            default_initializer=zeros_)
# cls to token & token 2 cls & cls to cls
# get pair-wise relative position index for each token inside the window
coords_h = paddle.arange(window_size[0])
coords_w = paddle.arange(window_size[1])
coords = paddle.stack(paddle.meshgrid(
[coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = coords.flatten(1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :,
None] - coords_flatten[:,
None, :] # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.transpose(
            (1, 2, 0))  # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
relative_coords[:, :, 1] += window_size[1] - 1
relative_coords[:, :, 0] *= 2 * window_size[1] - 1
        relative_position_index = \
            paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
relative_position_index[1:, 1:] = relative_coords.sum(
-1) # Wh*Ww, Wh*Ww
relative_position_index[0, 0:] = self.num_relative_distance - 3
relative_position_index[0:, 0] = self.num_relative_distance - 2
relative_position_index[0, 0] = self.num_relative_distance - 1
self.register_buffer("relative_position_index", relative_position_index)
def forward(self):
relative_position_bias = \
self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([
self.window_size[0] * self.window_size[1] + 1,
self.window_size[0] * self.window_size[1] + 1, -1]) # Wh*Ww,Wh*Ww,nH
return relative_position_bias.transpose((2, 0, 1)) # nH, Wh*Ww, Wh*Ww
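# Illustrative sketch for a 2x2 window: the table holds
# (2*2 - 1) * (2*2 - 1) + 3 = 12 entries (9 spatial offsets plus the three
# cls-token slots) and the bias has shape [num_heads, Wh*Ww + 1, Wh*Ww + 1].
def _rel_pos_bias_demo():
    bias = RelativePositionBias(window_size=(2, 2), num_heads=1)()
    print(bias.shape)  # [1, 5, 5]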
def get_sinusoid_encoding_table(n_position, d_hid, token=False):
''' Sinusoid position encoding table '''
def get_position_angle_vec(position):
return [
position / np.power(10000, 2 * (hid_j // 2) / d_hid)
for hid_j in range(d_hid)
]
sinusoid_table = np.array(
[get_position_angle_vec(pos_i) for pos_i in range(n_position)])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
if token:
        sinusoid_table = np.concatenate(
            [sinusoid_table, np.zeros([1, d_hid])], axis=0)
return paddle.to_tensor(sinusoid_table, dtype=paddle.float32).unsqueeze(0)
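# Illustrative sketch: a [1, n_position, d_hid] table with sin components on
# even dimensions and cos components on odd dimensions.
def _sinusoid_table_demo():
    table = get_sinusoid_encoding_table(n_position=4, d_hid=6)
    print(table.shape)  # [1, 4, 6]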
@register
@serializable
class VisionTransformer(nn.Layer):
""" Vision Transformer with support for patch input
"""
def __init__(self,
img_size=[672, 1092],
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
qkv_bias=False,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
norm_layer='nn.LayerNorm',
init_values=None,
use_rel_pos_bias=False,
use_shared_rel_pos_bias=False,
epsilon=1e-5,
final_norm=False,
pretrained=None,
out_indices=[3, 5, 7, 11],
use_abs_pos_emb=False,
use_sincos_pos_emb=True,
with_fpn=True,
num_fpn_levels=4,
use_checkpoint=False,
**args):
super().__init__()
self.img_size = img_size
self.embed_dim = embed_dim
self.with_fpn = with_fpn
self.use_checkpoint = use_checkpoint
self.use_sincos_pos_emb = use_sincos_pos_emb
self.use_rel_pos_bias = use_rel_pos_bias
self.final_norm = final_norm
self.out_indices = out_indices
self.num_fpn_levels = num_fpn_levels
if use_checkpoint:
paddle.seed(0)
self.patch_embed = PatchEmbed(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim)
self.pos_w = self.patch_embed.num_patches_in_w
self.pos_h = self.patch_embed.num_patches_in_h
self.cls_token = self.create_parameter(
shape=(1, 1, embed_dim),
default_initializer=paddle.nn.initializer.Constant(value=0.))
if use_abs_pos_emb:
self.pos_embed = self.create_parameter(
shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
default_initializer=paddle.nn.initializer.TruncatedNormal(
std=.02))
elif use_sincos_pos_emb:
pos_embed = self.build_2d_sincos_position_embedding(embed_dim)
            self.pos_embed = self.create_parameter(shape=pos_embed.shape)
self.pos_embed.set_value(pos_embed.numpy())
self.pos_embed.stop_gradient = True
else:
self.pos_embed = None
self.pos_drop = nn.Dropout(p=drop_rate)
if use_shared_rel_pos_bias:
self.rel_pos_bias = RelativePositionBias(
window_size=self.patch_embed.patch_shape, num_heads=num_heads)
else:
self.rel_pos_bias = None
dpr = np.linspace(0, drop_path_rate, depth)
self.blocks = nn.LayerList([
Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
init_values=init_values,
window_size=self.patch_embed.patch_shape
if use_rel_pos_bias else None,
epsilon=epsilon) for i in range(depth)
])
self.pretrained = pretrained
self.init_weight()
        assert len(out_indices) <= 4, 'out_indices out of bound'
self.out_indices = out_indices
self.out_channels = [embed_dim for _ in range(num_fpn_levels)]
self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [
patch_size for _ in range(len(out_indices))
]
self.norm = Identity()
if self.with_fpn:
            assert num_fpn_levels <= 4, 'num_fpn_levels out of bound'
self.init_fpn(
embed_dim=embed_dim,
patch_size=patch_size, )
def init_weight(self):
pretrained = self.pretrained
if pretrained:
if 'http' in pretrained: #URL
path = paddle.utils.download.get_weights_path_from_url(
pretrained)
else: #model in local path
path = pretrained
load_state_dict = paddle.load(path)
model_state_dict = self.state_dict()
pos_embed_name = "pos_embed"
if pos_embed_name in load_state_dict.keys():
load_pos_embed = paddle.to_tensor(
load_state_dict[pos_embed_name], dtype="float32")
if self.pos_embed.shape != load_pos_embed.shape:
pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
model_state_dict[pos_embed_name] = self.resize_pos_embed(
load_pos_embed, (pos_size, pos_size),
(self.pos_h, self.pos_w))
# self.set_state_dict(model_state_dict)
load_state_dict[pos_embed_name] = model_state_dict[
pos_embed_name]
print("Load pos_embed and resize it from {} to {} .".format(
load_pos_embed.shape, self.pos_embed.shape))
self.set_state_dict(load_state_dict)
print("Load load_state_dict....")
def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
if patch_size == 16:
self.fpn1 = nn.Sequential(
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2),
nn.BatchNorm2D(embed_dim),
nn.GELU(),
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2), )
self.fpn2 = nn.Sequential(
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2), )
self.fpn3 = Identity()
self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
elif patch_size == 8:
self.fpn1 = nn.Sequential(
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2), )
self.fpn2 = Identity()
self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )
self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )
if not out_with_norm:
self.norm = Identity()
else:
self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6)
def interpolate_pos_encoding(self, x, w, h):
npatch = x.shape[1] - 1
N = self.pos_embed.shape[1] - 1
w0 = w // self.patch_embed.patch_size
h0 = h // self.patch_embed.patch_size
if npatch == N and w0 == self.patch_embed.num_patches_w and h0 == self.patch_embed.num_patches_h:
return self.pos_embed
class_pos_embed = self.pos_embed[:, 0]
patch_pos_embed = self.pos_embed[:, 1:]
dim = x.shape[-1]
# we add a small number to avoid floating point error in the interpolation
# see discussion at https://github.com/facebookresearch/dino/issues/8
# w0, h0 = w0 + 0.1, h0 + 0.1
# patch_pos_embed = nn.functional.interpolate(
# patch_pos_embed.reshape([
# 1, self.patch_embed.num_patches_w,
# self.patch_embed.num_patches_h, dim
# ]).transpose((0, 3, 1, 2)),
# scale_factor=(w0 / self.patch_embed.num_patches_w,
# h0 / self.patch_embed.num_patches_h),
# mode='bicubic', )
patch_pos_embed = nn.functional.interpolate(
patch_pos_embed.reshape([
1, self.patch_embed.num_patches_w,
self.patch_embed.num_patches_h, dim
]).transpose((0, 3, 1, 2)),
(w0, h0),
mode='bicubic', )
assert int(w0) == patch_pos_embed.shape[-2] and int(
h0) == patch_pos_embed.shape[-1]
patch_pos_embed = patch_pos_embed.transpose(
(0, 2, 3, 1)).reshape([1, -1, dim])
return paddle.concat(
(class_pos_embed.unsqueeze(0), patch_pos_embed), axis=1)
def resize_pos_embed(self, pos_embed, old_hw, new_hw):
"""
Resize pos_embed weight.
Args:
pos_embed (Tensor): the pos_embed weight
old_hw (list[int]): the height and width of old pos_embed
new_hw (list[int]): the height and width of new pos_embed
Returns:
Tensor: the resized pos_embed weight
"""
cls_pos_embed = pos_embed[:, :1, :]
pos_embed = pos_embed[:, 1:, :]
pos_embed = pos_embed.transpose([0, 2, 1])
pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]])
pos_embed = F.interpolate(
pos_embed, new_hw, mode='bicubic', align_corners=False)
pos_embed = pos_embed.flatten(2).transpose([0, 2, 1])
pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1)
return pos_embed
def build_2d_sincos_position_embedding(
self,
embed_dim=768,
temperature=10000., ):
h, w = self.patch_embed.patch_shape
grid_w = paddle.arange(w, dtype=paddle.float32)
grid_h = paddle.arange(h, dtype=paddle.float32)
grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)
assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
pos_dim = embed_dim // 4
omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
omega = 1. / (temperature**omega)
out_w = grid_w.flatten()[..., None] @omega[None]
out_h = grid_h.flatten()[..., None] @omega[None]
pos_emb = paddle.concat(
[
paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
paddle.cos(out_h)
],
axis=1)[None, :, :]
pe_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32)
pos_embed = paddle.concat([pe_token, pos_emb], axis=1)
# pos_embed.stop_gradient = True
return pos_embed
def forward(self, x):
x = x['image'] if isinstance(x, dict) else x
_, _, h, w = x.shape
x = self.patch_embed(x)
B, D, Hp, Wp = x.shape # b * c * h * w
cls_tokens = self.cls_token.expand(
(B, self.cls_token.shape[-2], self.cls_token.shape[-1]))
x = x.flatten(2).transpose([0, 2, 1]) # b * hw * c
x = paddle.concat([cls_tokens, x], axis=1)
if self.pos_embed is not None:
# x = x + self.interpolate_pos_encoding(x, w, h)
x = x + self.interpolate_pos_encoding(x, h, w)
x = self.pos_drop(x)
rel_pos_bias = self.rel_pos_bias(
) if self.rel_pos_bias is not None else None
feats = []
for idx, blk in enumerate(self.blocks):
if self.use_checkpoint and self.training:
x = paddle.distributed.fleet.utils.recompute(
blk, x, rel_pos_bias, **{"preserve_rng_state": True})
else:
x = blk(x, rel_pos_bias)
if idx in self.out_indices:
xp = paddle.reshape(
paddle.transpose(
self.norm(x[:, 1:, :]), perm=[0, 2, 1]),
shape=[B, D, Hp, Wp])
feats.append(xp)
if self.with_fpn:
fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][
-self.num_fpn_levels:]
            assert len(fpns) == len(feats) or len(feats) == 1, \
                'feats must match the number of FPN levels or contain a single map'
outputs = []
for i, m in enumerate(fpns):
outputs.append(
m(feats[i] if len(feats) == len(fpns) else feats[-1]))
return outputs
return feats
@property
def num_layers(self):
return len(self.blocks)
@property
def no_weight_decay(self):
return {'pos_embed', 'cls_token'}
@property
def out_shape(self):
return [
ShapeSpec(
channels=c, stride=s)
for c, s in zip(self.out_channels, self.out_strides)
]
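# Minimal usage sketch (assumed tiny, non-pretrained config, purely to show
# the I/O contract; real configs use embed_dim=768, depth=12 and FPN outputs).
def _vision_transformer_demo():
    vit = VisionTransformer(
        img_size=[64, 64],
        patch_size=16,
        embed_dim=64,
        depth=2,
        num_heads=2,
        out_indices=[1],
        with_fpn=False,
        use_sincos_pos_emb=True)
    feats = vit({'image': paddle.rand([1, 3, 64, 64])})
    print([f.shape for f in feats])  # [[1, 64, 4, 4]]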
================================================
FILE: ppdet/modeling/backbones/vit_mae.py
================================================
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import numpy as np
import math
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Constant, TruncatedNormal
from ppdet.modeling.shape_spec import ShapeSpec
from ppdet.core.workspace import register, serializable
from .transformer_utils import (zeros_, DropPath, Identity, window_partition,
window_unpartition)
from ..initializer import linear_init_
__all__ = ['VisionTransformer2D', 'SimpleFeaturePyramid']
class Mlp(nn.Layer):
def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer='nn.GELU',
drop=0.,
lr_factor=1.0):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(
in_features,
hidden_features,
weight_attr=ParamAttr(learning_rate=lr_factor),
bias_attr=ParamAttr(learning_rate=lr_factor))
self.act = eval(act_layer)()
self.fc2 = nn.Linear(
hidden_features,
out_features,
weight_attr=ParamAttr(learning_rate=lr_factor),
bias_attr=ParamAttr(learning_rate=lr_factor))
self.drop = nn.Dropout(drop)
self._init_weights()
def _init_weights(self):
linear_init_(self.fc1)
linear_init_(self.fc2)
def forward(self, x):
x = self.drop(self.act(self.fc1(x)))
x = self.drop(self.fc2(x))
return x
class Attention(nn.Layer):
def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
attn_bias=False,
attn_drop=0.,
proj_drop=0.,
use_rel_pos=False,
rel_pos_zero_init=True,
window_size=None,
input_size=None,
qk_scale=None,
lr_factor=1.0):
super().__init__()
self.num_heads = num_heads
self.head_dim = dim // num_heads
self.scale = qk_scale or self.head_dim**-0.5
self.use_rel_pos = use_rel_pos
self.input_size = input_size
self.rel_pos_zero_init = rel_pos_zero_init
self.window_size = window_size
self.lr_factor = lr_factor
self.qkv = nn.Linear(
dim,
dim * 3,
weight_attr=ParamAttr(learning_rate=lr_factor),
bias_attr=ParamAttr(learning_rate=lr_factor)
if attn_bias else False)
if qkv_bias:
self.q_bias = self.create_parameter(
shape=([dim]), default_initializer=zeros_)
self.v_bias = self.create_parameter(
shape=([dim]), default_initializer=zeros_)
else:
self.q_bias = None
self.v_bias = None
self.proj = nn.Linear(
dim,
dim,
weight_attr=ParamAttr(learning_rate=lr_factor),
bias_attr=ParamAttr(learning_rate=lr_factor))
self.attn_drop = nn.Dropout(attn_drop)
if window_size is None:
self.window_size = self.input_size[0]
self._init_weights()
def _init_weights(self):
linear_init_(self.qkv)
linear_init_(self.proj)
if self.use_rel_pos:
self.rel_pos_h = self.create_parameter(
[2 * self.window_size - 1, self.head_dim],
attr=ParamAttr(learning_rate=self.lr_factor),
default_initializer=Constant(value=0.))
self.rel_pos_w = self.create_parameter(
[2 * self.window_size - 1, self.head_dim],
attr=ParamAttr(learning_rate=self.lr_factor),
default_initializer=Constant(value=0.))
            if not self.rel_pos_zero_init:
                # TruncatedNormal is an initializer class, not an in-place
                # function; build it once and apply it to each parameter
                TruncatedNormal(std=0.02)(self.rel_pos_h)
                TruncatedNormal(std=0.02)(self.rel_pos_w)
def get_rel_pos(self, seq_size, rel_pos):
max_rel_dist = int(2 * seq_size - 1)
# Interpolate rel pos if needed.
if rel_pos.shape[0] != max_rel_dist:
# Interpolate rel pos.
rel_pos = rel_pos.reshape([1, rel_pos.shape[0], -1])
rel_pos = rel_pos.transpose([0, 2, 1])
rel_pos_resized = F.interpolate(
rel_pos,
size=(max_rel_dist, ),
mode="linear",
data_format='NCW')
rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist])
rel_pos_resized = rel_pos_resized.transpose([1, 0])
else:
rel_pos_resized = rel_pos
coords = paddle.arange(seq_size, dtype='float32')
relative_coords = coords.unsqueeze(-1) - coords.unsqueeze(0)
relative_coords += (seq_size - 1)
relative_coords = relative_coords.astype('int64').flatten()
return paddle.index_select(rel_pos_resized, relative_coords).reshape(
[seq_size, seq_size, self.head_dim])
def add_decomposed_rel_pos(self, attn, q, h, w):
"""
Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
Args:
attn (Tensor): attention map.
q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
Returns:
attn (Tensor): attention map with added relative positional embeddings.
"""
Rh = self.get_rel_pos(h, self.rel_pos_h)
Rw = self.get_rel_pos(w, self.rel_pos_w)
B, _, dim = q.shape
r_q = q.reshape([B, h, w, dim])
# bhwc, hch->bhwh1
# bwhc, wcw->bhw1w
rel_h = paddle.einsum("bhwc,hkc->bhwk", r_q, Rh).unsqueeze(-1)
rel_w = paddle.einsum("bhwc,wkc->bhwk", r_q, Rw).unsqueeze(-2)
attn = attn.reshape([B, h, w, h, w]) + rel_h + rel_w
return attn.reshape([B, h * w, h * w])
def forward(self, x):
B, H, W, C = x.shape
        if self.q_bias is not None:
            qkv_bias = paddle.concat(
                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
            qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)
        else:
            qkv = self.qkv(x)
        # [B, H, W, 3*C] -> [3, B*num_heads, H*W, head_dim] for both branches
        qkv = qkv.reshape(
            [B, H * W, 3, self.num_heads, self.head_dim]).transpose(
                [2, 0, 3, 1, 4]).reshape(
                    [3, B * self.num_heads, H * W, self.head_dim])
        q, k, v = qkv[0], qkv[1], qkv[2]
attn = q.matmul(k.transpose([0, 2, 1])) * self.scale
if self.use_rel_pos:
attn = self.add_decomposed_rel_pos(attn, q, H, W)
attn = F.softmax(attn, axis=-1)
attn = self.attn_drop(attn)
x = attn.matmul(v).reshape(
[B, self.num_heads, H * W, self.head_dim]).transpose(
[0, 2, 1, 3]).reshape([B, H, W, C])
x = self.proj(x)
return x
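# Illustrative sketch (assumed toy shapes): decomposed relative position bias
# adds per-axis terms to the attention map while preserving the I/O shape.
def _decomposed_rel_pos_demo():
    attn = Attention(dim=8, num_heads=2, use_rel_pos=True, input_size=[4, 4])
    y = attn(paddle.rand([1, 4, 4, 8]))
    print(y.shape)  # [1, 4, 4, 8]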
class Block(nn.Layer):
def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
attn_bias=False,
qk_scale=None,
init_values=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
use_rel_pos=True,
rel_pos_zero_init=True,
window_size=None,
input_size=None,
act_layer='nn.GELU',
norm_layer='nn.LayerNorm',
lr_factor=1.0,
epsilon=1e-5):
super().__init__()
self.window_size = window_size
self.norm1 = eval(norm_layer)(dim,
weight_attr=ParamAttr(
learning_rate=lr_factor,
regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(
learning_rate=lr_factor,
regularizer=L2Decay(0.0)),
epsilon=epsilon)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
attn_bias=attn_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop,
use_rel_pos=use_rel_pos,
rel_pos_zero_init=rel_pos_zero_init,
window_size=window_size,
input_size=input_size,
lr_factor=lr_factor)
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
self.norm2 = eval(norm_layer)(dim,
weight_attr=ParamAttr(
learning_rate=lr_factor,
regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(
learning_rate=lr_factor,
regularizer=L2Decay(0.0)),
epsilon=epsilon)
self.mlp = Mlp(in_features=dim,
hidden_features=int(dim * mlp_ratio),
act_layer=act_layer,
drop=drop,
lr_factor=lr_factor)
if init_values is not None:
self.gamma_1 = self.create_parameter(
shape=([dim]), default_initializer=Constant(value=init_values))
self.gamma_2 = self.create_parameter(
shape=([dim]), default_initializer=Constant(value=init_values))
else:
self.gamma_1, self.gamma_2 = None, None
def forward(self, x):
y = self.norm1(x)
if self.window_size is not None:
y, pad_hw, num_hw = window_partition(y, self.window_size)
y = self.attn(y)
if self.gamma_1 is not None:
y = self.gamma_1 * y
if self.window_size is not None:
y = window_unpartition(y, pad_hw, num_hw, (x.shape[1], x.shape[2]))
x = x + self.drop_path(y)
if self.gamma_2 is None:
x = x + self.drop_path(self.mlp(self.norm2(x)))
else:
x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
return x
class PatchEmbed(nn.Layer):
""" Image to Patch Embedding
"""
def __init__(self,
img_size=(224, 224),
patch_size=16,
in_chans=3,
embed_dim=768,
lr_factor=0.01):
super().__init__()
self.img_size = img_size
self.patch_size = patch_size
self.proj = nn.Conv2D(
in_chans,
embed_dim,
kernel_size=patch_size,
stride=patch_size,
weight_attr=ParamAttr(learning_rate=lr_factor),
bias_attr=ParamAttr(learning_rate=lr_factor))
@property
def num_patches_in_h(self):
return self.img_size[1] // self.patch_size
@property
def num_patches_in_w(self):
return self.img_size[0] // self.patch_size
def forward(self, x):
out = self.proj(x)
return out
@register
@serializable
class VisionTransformer2D(nn.Layer):
""" Vision Transformer with support for patch input
"""
def __init__(self,
img_size=(1024, 1024),
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
qkv_bias=False,
attn_bias=False,
qk_scale=None,
init_values=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
act_layer='nn.GELU',
norm_layer='nn.LayerNorm',
lr_decay_rate=1.0,
global_attn_indexes=(2, 5, 8, 11),
use_abs_pos=False,
use_rel_pos=False,
use_abs_pos_emb=False,
use_sincos_pos_emb=False,
rel_pos_zero_init=True,
epsilon=1e-5,
final_norm=False,
pretrained=None,
window_size=None,
out_indices=(11, ),
with_fpn=False,
use_checkpoint=False,
*args,
**kwargs):
super().__init__()
self.img_size = img_size
self.patch_size = patch_size
self.embed_dim = embed_dim
self.num_heads = num_heads
self.depth = depth
self.global_attn_indexes = global_attn_indexes
self.epsilon = epsilon
self.with_fpn = with_fpn
self.use_checkpoint = use_checkpoint
self.patch_h = img_size[0] // patch_size
self.patch_w = img_size[1] // patch_size
self.num_patches = self.patch_h * self.patch_w
self.use_abs_pos = use_abs_pos
self.use_abs_pos_emb = use_abs_pos_emb
self.patch_embed = PatchEmbed(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim)
dpr = np.linspace(0, drop_path_rate, depth)
if use_checkpoint:
paddle.seed(0)
if use_abs_pos_emb:
self.pos_w = self.patch_embed.num_patches_in_w
self.pos_h = self.patch_embed.num_patches_in_h
self.pos_embed = self.create_parameter(
shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
default_initializer=paddle.nn.initializer.TruncatedNormal(
std=.02))
elif use_sincos_pos_emb:
pos_embed = self.get_2d_sincos_position_embedding(self.patch_h,
self.patch_w)
            self.pos_embed = self.create_parameter(shape=pos_embed.shape)
self.pos_embed.set_value(pos_embed.numpy())
self.pos_embed.stop_gradient = True
else:
self.pos_embed = None
self.blocks = nn.LayerList([
Block(
embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
attn_bias=attn_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
use_rel_pos=use_rel_pos,
rel_pos_zero_init=rel_pos_zero_init,
window_size=None
if i in self.global_attn_indexes else window_size,
input_size=[self.patch_h, self.patch_w],
act_layer=act_layer,
lr_factor=self.get_vit_lr_decay_rate(i, lr_decay_rate),
norm_layer=norm_layer,
init_values=init_values,
epsilon=epsilon) for i in range(depth)
])
assert len(out_indices) <= 4, 'out_indices out of bound'
self.out_indices = out_indices
self.pretrained = pretrained
self.init_weight()
self.out_channels = [embed_dim for _ in range(len(out_indices))]
self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [
patch_size for _ in range(len(out_indices))
]
self.norm = Identity()
if self.with_fpn:
self.init_fpn(
embed_dim=embed_dim,
patch_size=patch_size,
out_with_norm=final_norm)
def get_vit_lr_decay_rate(self, layer_id, lr_decay_rate):
return lr_decay_rate**(self.depth - layer_id)
def init_weight(self):
pretrained = self.pretrained
if pretrained:
if 'http' in pretrained:
path = paddle.utils.download.get_weights_path_from_url(
pretrained)
else:
path = pretrained
load_state_dict = paddle.load(path)
model_state_dict = self.state_dict()
pos_embed_name = "pos_embed"
if pos_embed_name in load_state_dict.keys(
) and self.use_abs_pos_emb:
load_pos_embed = paddle.to_tensor(
load_state_dict[pos_embed_name], dtype="float32")
if self.pos_embed.shape != load_pos_embed.shape:
pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
model_state_dict[pos_embed_name] = self.resize_pos_embed(
load_pos_embed, (pos_size, pos_size),
(self.pos_h, self.pos_w))
# self.set_state_dict(model_state_dict)
load_state_dict[pos_embed_name] = model_state_dict[
pos_embed_name]
print("Load pos_embed and resize it from {} to {} .".format(
load_pos_embed.shape, self.pos_embed.shape))
self.set_state_dict(load_state_dict)
print("Load load_state_dict....")
def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
if patch_size == 16:
self.fpn1 = nn.Sequential(
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2),
nn.BatchNorm2D(embed_dim),
nn.GELU(),
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2), )
self.fpn2 = nn.Sequential(
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2), )
self.fpn3 = Identity()
self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
elif patch_size == 8:
self.fpn1 = nn.Sequential(
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2), )
self.fpn2 = Identity()
self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )
self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )
if not out_with_norm:
self.norm = Identity()
else:
self.norm = nn.LayerNorm(embed_dim, epsilon=self.epsilon)
def resize_pos_embed(self, pos_embed, old_hw, new_hw):
"""
Resize pos_embed weight.
Args:
pos_embed (Tensor): the pos_embed weight
old_hw (list[int]): the height and width of old pos_embed
new_hw (list[int]): the height and width of new pos_embed
Returns:
Tensor: the resized pos_embed weight
"""
cls_pos_embed = pos_embed[:, :1, :]
pos_embed = pos_embed[:, 1:, :]
pos_embed = pos_embed.transpose([0, 2, 1])
pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]])
pos_embed = F.interpolate(
pos_embed, new_hw, mode='bicubic', align_corners=False)
pos_embed = pos_embed.flatten(2).transpose([0, 2, 1])
pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1)
return pos_embed
def get_2d_sincos_position_embedding(self, h, w, temperature=10000.):
grid_y, grid_x = paddle.meshgrid(
paddle.arange(
h, dtype=paddle.float32),
paddle.arange(
w, dtype=paddle.float32))
assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
pos_dim = self.embed_dim // 4
omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
omega = (1. / (temperature**omega)).unsqueeze(0)
out_x = grid_x.reshape([-1, 1]).matmul(omega)
out_y = grid_y.reshape([-1, 1]).matmul(omega)
pos_emb = paddle.concat(
[
paddle.sin(out_y), paddle.cos(out_y), paddle.sin(out_x),
paddle.cos(out_x)
],
axis=1)
return pos_emb.reshape([1, h, w, self.embed_dim])
def forward(self, inputs):
x = self.patch_embed(inputs['image']).transpose([0, 2, 3, 1])
B, Hp, Wp, _ = x.shape
if self.use_abs_pos:
x = x + self.get_2d_sincos_position_embedding(Hp, Wp)
if self.use_abs_pos_emb:
x = x + self.resize_pos_embed(self.pos_embed,
(self.pos_h, self.pos_w), (Hp, Wp))
feats = []
for idx, blk in enumerate(self.blocks):
if self.use_checkpoint and self.training:
x = paddle.distributed.fleet.utils.recompute(
blk, x, **{"preserve_rng_state": True})
else:
x = blk(x)
if idx in self.out_indices:
feats.append(self.norm(x.transpose([0, 3, 1, 2])))
if self.with_fpn:
fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
for i in range(len(feats)):
feats[i] = fpns[i](feats[i])
return feats
@property
def num_layers(self):
return len(self.blocks)
@property
def no_weight_decay(self):
return {'pos_embed', 'cls_token'}
@property
def out_shape(self):
return [
ShapeSpec(
channels=c, stride=s)
for c, s in zip(self.out_channels, self.out_strides)
]
class LayerNorm(nn.Layer):
"""
A LayerNorm variant, popularized by Transformers, that performs point-wise mean and
variance normalization over the channel dimension for inputs that have shape
(batch_size, channels, height, width).
    Note that this modified LayerNorm is used in ResBlock and SimpleFeaturePyramid;
    the ViT backbone itself uses nn.LayerNorm.
"""
def __init__(self, normalized_shape, eps=1e-6):
super().__init__()
self.weight = self.create_parameter([normalized_shape])
self.bias = self.create_parameter([normalized_shape])
self.eps = eps
self.normalized_shape = (normalized_shape, )
def forward(self, x):
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
x = (x - u) / paddle.sqrt(s + self.eps)
x = self.weight[:, None, None] * x + self.bias[:, None, None]
return x
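# Illustrative sketch: with unit weight and zero bias, every spatial position
# is normalized to roughly zero mean and unit variance across channels.
def _channels_first_layernorm_demo():
    ln = LayerNorm(8)
    ln.weight.set_value(paddle.ones([8]))
    ln.bias.set_value(paddle.zeros([8]))
    y = ln(paddle.rand([2, 8, 4, 4]))
    print(float(y.mean(axis=1).abs().max()))  # close to 0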
@register
@serializable
class SimpleFeaturePyramid(nn.Layer):
def __init__(self,
in_channels,
out_channels,
spatial_scales,
num_levels=4,
use_bias=False):
"""
Args:
in_channels (list[int]): input channels of each level which can be
derived from the output shape of backbone by from_config
            out_channels (int): output channel of each level.
spatial_scales (list[float]): list of scaling factors to upsample or downsample
the input features for creating pyramid features which can be derived from
the output shape of backbone by from_config
num_levels (int): number of levels of output features.
            use_bias (bool): whether to use bias or not.
"""
super(SimpleFeaturePyramid, self).__init__()
self.in_channels = in_channels[0]
self.out_channels = out_channels
self.num_levels = num_levels
self.stages = []
dim = self.in_channels
if num_levels == 4:
scale_factors = [2.0, 1.0, 0.5]
elif num_levels == 5:
scale_factors = [4.0, 2.0, 1.0, 0.5]
else:
raise NotImplementedError(
f"num_levels={num_levels} is not supported yet.")
dim = in_channels[0]
for idx, scale in enumerate(scale_factors):
out_dim = dim
if scale == 4.0:
layers = [
nn.Conv2DTranspose(
dim, dim // 2, kernel_size=2, stride=2),
                    # channel-first LayerNorm (defined above); nn.LayerNorm
                    # would normalize the last (width) axis of this NCHW tensor
                    LayerNorm(dim // 2),
nn.GELU(),
nn.Conv2DTranspose(
dim // 2, dim // 4, kernel_size=2, stride=2),
]
out_dim = dim // 4
elif scale == 2.0:
layers = [
nn.Conv2DTranspose(
dim, dim // 2, kernel_size=2, stride=2)
]
out_dim = dim // 2
elif scale == 1.0:
layers = []
elif scale == 0.5:
layers = [nn.MaxPool2D(kernel_size=2, stride=2)]
layers.extend([
nn.Conv2D(
out_dim,
out_channels,
kernel_size=1,
bias_attr=use_bias, ), LayerNorm(out_channels), nn.Conv2D(
out_channels,
out_channels,
kernel_size=3,
padding=1,
bias_attr=use_bias, ), LayerNorm(out_channels)
])
layers = nn.Sequential(*layers)
stage = -int(math.log2(spatial_scales[0] * scale_factors[idx]))
self.add_sublayer(f"simfp_{stage}", layers)
self.stages.append(layers)
# top block output feature maps.
self.top_block = nn.Sequential(
nn.MaxPool2D(
kernel_size=1, stride=2, padding=0))
@classmethod
def from_config(cls, cfg, input_shape):
return {
'in_channels': [i.channels for i in input_shape],
'spatial_scales': [1.0 / i.stride for i in input_shape],
}
@property
def out_shape(self):
return [
ShapeSpec(channels=self.out_channels)
for _ in range(self.num_levels)
]
def forward(self, feats):
"""
Args:
            feats (list[Tensor]): input features; the first, of shape
                (N, C, H, W), is used to build the pyramid.
"""
features = feats[0]
results = []
for stage in self.stages:
results.append(stage(features))
top_block_in_feature = results[-1]
results.append(self.top_block(top_block_in_feature))
assert self.num_levels == len(results)
return results
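# Illustrative usage sketch (assumed ViT-Base-like sizes, not part of the
# original file): one stride-16 feature map is turned into num_levels=4
# pyramid levels (strides 8, 16, 32, plus a max-pooled stride-64 level).
def _simple_feature_pyramid_demo():
    fpn = SimpleFeaturePyramid(
        in_channels=[768], out_channels=256, spatial_scales=[1.0 / 16])
    feats = [paddle.randn([1, 768, 40, 40])]
    outs = fpn(feats)  # 4 tensors: 80x80, 40x40, 20x20 and 10x10, all 256-ch
    return outs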
================================================
FILE: ppdet/modeling/backbones/vitpose.py
================================================
# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
# reference: https://arxiv.org/abs/2010.11929
from collections.abc import Callable
import numpy as np
import paddle
import paddle.nn as nn
from paddle.nn.initializer import TruncatedNormal, Constant, Normal
from ppdet.core.workspace import register, serializable
trunc_normal_ = TruncatedNormal(std=.02)
def to_2tuple(x):
if isinstance(x, (list, tuple)):
return x
return tuple([x] * 2)
def drop_path(x, drop_prob=0., training=False):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
"""
if drop_prob == 0. or not training:
return x
keep_prob = paddle.to_tensor(1.0 - drop_prob).astype(x.dtype)
shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype)
random_tensor = paddle.floor(random_tensor) # binarize
output = x.divide(keep_prob) * random_tensor
return output
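# Illustrative sketch (assumed values, not part of the original file): with
# drop_prob=0.2 in training mode, each sample's residual path is zeroed with
# probability 0.2, and kept samples are rescaled by 1 / 0.8 = 1.25 so the
# expected output matches the input.
def _drop_path_demo():
    x = paddle.ones([8, 4])
    y = drop_path(x, drop_prob=0.2, training=True)
    return y  # each row is either all zeros or all 1.25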
class DropPath(nn.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""
def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
def forward(self, x):
return drop_path(x, self.drop_prob, self.training)
class Identity(nn.Layer):
def __init__(self):
super(Identity, self).__init__()
def forward(self, input):
return input
class Mlp(nn.Layer):
def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.fc2(x)
x = self.drop(x)
return x
class Attention(nn.Layer):
def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim**-0.5
self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x):
N, C = x.shape[1:]
qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C //
self.num_heads)).transpose((2, 0, 3, 1, 4))
q, k, v = qkv[0], qkv[1], qkv[2]
attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale
attn = nn.functional.softmax(attn, axis=-1)
attn = self.attn_drop(attn)
x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))
x = self.proj(x)
x = self.proj_drop(x)
return x
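# Illustrative shape walk-through (assumed ViT-Base sizes, not part of the
# original file): qkv projects to 3 * dim, the heads attend over all tokens,
# and the output keeps the input shape.
def _attention_shape_demo():
    attn = Attention(dim=768, num_heads=12, qkv_bias=True)
    tokens = paddle.randn([2, 197, 768])  # (batch, 196 patches + cls, dim)
    out = attn(tokens)                    # [2, 197, 768]
    return out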
class Block(nn.Layer):
def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer='nn.LayerNorm',
epsilon=1e-5):
super().__init__()
if isinstance(norm_layer, str):
self.norm1 = eval(norm_layer)(dim, epsilon=epsilon)
elif isinstance(norm_layer, Callable):
self.norm1 = norm_layer(dim)
else:
raise TypeError(
"The norm_layer must be str or paddle.nn.layer.Layer class")
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
if isinstance(norm_layer, str):
self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
elif isinstance(norm_layer, Callable):
self.norm2 = norm_layer(dim)
else:
raise TypeError(
"The norm_layer must be str or paddle.nn.layer.Layer class")
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)
def forward(self, x):
x = x + self.drop_path(self.attn(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
class PatchEmbed(nn.Layer):
""" Image to Patch Embedding
"""
def __init__(self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
ratio=1):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
num_patches = (img_size[1] // patch_size[1]) * (
img_size[0] // patch_size[0]) * (ratio**2)
self.img_size = img_size
self.patch_size = patch_size
self.num_patches = num_patches
self.proj = nn.Conv2D(
in_chans,
embed_dim,
kernel_size=patch_size,
stride=(patch_size[0] // ratio),
padding=(4 + 2 * (ratio // 2 - 1), 4 + 2 * (ratio // 2 - 1)))
def forward(self, x):
B, C, H, W = x.shape
assert H == self.img_size[0] and W == self.img_size[1], \
f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
x = self.proj(x)
return x
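# Illustrative sketch (assumed default sizes, not part of the original file):
# with ratio=1 the projection uses stride 16 and padding 2, so a 224x224
# image yields a 14x14 grid, matching num_patches = (224 // 16) ** 2 = 196.
def _patch_embed_demo():
    pe = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
    x = paddle.randn([1, 3, 224, 224])
    feat = pe(x)  # [1, 768, 14, 14]; pe.num_patches == 196
    return feat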
@register
@serializable
class ViT(nn.Layer):
""" Vision Transformer with support for patch input
    This module differs from ppdet's VisionTransformer (in
    ppdet/modeling/backbones/vision_transformer.py) in four ways:
    1. PatchEmbed.proj sets padding=(4 + 2 * (ratio // 2 - 1), 4 + 2 * (ratio // 2 - 1)),
       while VisionTransformer does not;
    2. the Attention module uses a standard qkv projection, while
       VisionTransformer provides more options;
    3. the Mlp module applies Dropout once, while VisionTransformer applies it twice;
    4. VisionTransformer provides FPN layers, while this module does not.
"""
def __init__(self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
qkv_bias=False,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
norm_layer='nn.LayerNorm',
epsilon=1e-5,
ratio=1,
pretrained=None,
**kwargs):
super().__init__()
self.pretrained = pretrained
self.num_features = self.embed_dim = embed_dim
self.patch_embed = PatchEmbed(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
ratio=ratio)
num_patches = self.patch_embed.num_patches
self.pos_embed = self.create_parameter(
shape=(1, num_patches + 1, embed_dim),
default_initializer=trunc_normal_)
self.add_parameter("pos_embed", self.pos_embed)
dpr = np.linspace(0, drop_path_rate, depth, dtype='float32')
self.blocks = nn.LayerList([
Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
epsilon=epsilon) for i in range(depth)
])
self.last_norm = eval(norm_layer)(embed_dim, epsilon=epsilon)
trunc_normal_(self.pos_embed)
self._init_weights()
def _init_weights(self):
pretrained = self.pretrained
if pretrained:
if 'http' in pretrained: #URL
path = paddle.utils.download.get_weights_path_from_url(
pretrained)
else: #model in local path
path = pretrained
load_state_dict = paddle.load(path)
self.set_state_dict(load_state_dict)
print("Load load_state_dict:", path)
def forward_features(self, x):
B = x.shape[0]
x = self.patch_embed(x)
B, D, Hp, Wp = x.shape
x = x.flatten(2).transpose([0, 2, 1])
x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1]
for blk in self.blocks:
x = blk(x)
x = self.last_norm(x)
xp = paddle.reshape(
paddle.transpose(
x, perm=[0, 2, 1]), shape=[B, -1, Hp, Wp])
return xp
================================================
FILE: ppdet/modeling/bbox_utils.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
import numpy as np
def bbox2delta(src_boxes, tgt_boxes, weights=[1.0, 1.0, 1.0, 1.0]):
"""Encode bboxes to deltas.
"""
src_w = src_boxes[:, 2] - src_boxes[:, 0]
src_h = src_boxes[:, 3] - src_boxes[:, 1]
src_ctr_x = src_boxes[:, 0] + 0.5 * src_w
src_ctr_y = src_boxes[:, 1] + 0.5 * src_h
tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0]
tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1]
tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w
tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h
wx, wy, ww, wh = weights
dx = wx * (tgt_ctr_x - src_ctr_x) / src_w
dy = wy * (tgt_ctr_y - src_ctr_y) / src_h
dw = ww * paddle.log(tgt_w / src_w)
dh = wh * paddle.log(tgt_h / src_h)
deltas = paddle.stack((dx, dy, dw, dh), axis=1)
return deltas
def delta2bbox(deltas, boxes, weights=[1.0, 1.0, 1.0, 1.0], max_shape=None):
"""Decode deltas to boxes. Used in RCNNBox,CascadeHead,RCNNHead,RetinaHead.
Note: return tensor shape [n,1,4]
If you want to add a reshape, please add after the calling code instead of here.
"""
clip_scale = math.log(1000.0 / 16)
widths = boxes[:, 2] - boxes[:, 0]
heights = boxes[:, 3] - boxes[:, 1]
ctr_x = boxes[:, 0] + 0.5 * widths
ctr_y = boxes[:, 1] + 0.5 * heights
wx, wy, ww, wh = weights
dx = deltas[:, 0::4] / wx
dy = deltas[:, 1::4] / wy
dw = deltas[:, 2::4] / ww
dh = deltas[:, 3::4] / wh
# Prevent sending too large values into paddle.exp()
dw = paddle.clip(dw, max=clip_scale)
dh = paddle.clip(dh, max=clip_scale)
pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1)
pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1)
pred_w = paddle.exp(dw) * widths.unsqueeze(1)
pred_h = paddle.exp(dh) * heights.unsqueeze(1)
pred_boxes = []
pred_boxes.append(pred_ctr_x - 0.5 * pred_w)
pred_boxes.append(pred_ctr_y - 0.5 * pred_h)
pred_boxes.append(pred_ctr_x + 0.5 * pred_w)
pred_boxes.append(pred_ctr_y + 0.5 * pred_h)
pred_boxes = paddle.stack(pred_boxes, axis=-1)
if max_shape is not None:
pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip(
min=0, max=max_shape[1])
pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip(
min=0, max=max_shape[0])
return pred_boxes
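# Illustrative round trip (assumed boxes, not part of the original file):
# with matching weights, delta2bbox() inverts bbox2delta().
def _delta_round_trip_demo():
    src = paddle.to_tensor([[0., 0., 10., 10.]])
    tgt = paddle.to_tensor([[2., 2., 12., 14.]])
    deltas = bbox2delta(src, tgt)    # [1, 4]
    boxes = delta2bbox(deltas, src)  # [1, 1, 4], recovers tgt
    return boxes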
def bbox2delta_v2(src_boxes,
tgt_boxes,
delta_mean=[0.0, 0.0, 0.0, 0.0],
delta_std=[1.0, 1.0, 1.0, 1.0]):
"""Encode bboxes to deltas.
    Modified from bbox2delta(), which only multiplies deltas by weight
    parameters; this version normalizes deltas with delta_mean and delta_std.
"""
src_w = src_boxes[:, 2] - src_boxes[:, 0]
src_h = src_boxes[:, 3] - src_boxes[:, 1]
src_ctr_x = src_boxes[:, 0] + 0.5 * src_w
src_ctr_y = src_boxes[:, 1] + 0.5 * src_h
tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0]
tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1]
tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w
tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h
dx = (tgt_ctr_x - src_ctr_x) / src_w
dy = (tgt_ctr_y - src_ctr_y) / src_h
dw = paddle.log(tgt_w / src_w)
dh = paddle.log(tgt_h / src_h)
deltas = paddle.stack((dx, dy, dw, dh), axis=1)
deltas = (
deltas - paddle.to_tensor(delta_mean)) / paddle.to_tensor(delta_std)
return deltas
def delta2bbox_v2(deltas,
boxes,
delta_mean=[0.0, 0.0, 0.0, 0.0],
delta_std=[1.0, 1.0, 1.0, 1.0],
max_shape=None,
ctr_clip=32.0):
"""Decode deltas to bboxes.
    Modified from delta2bbox(), which only divides deltas by weight parameters;
    this version de-normalizes deltas with delta_mean and delta_std.
    Used in YOLOFHead.
Note: return tensor shape [n,1,4]
If you want to add a reshape, please add after the calling code instead of here.
"""
clip_scale = math.log(1000.0 / 16)
widths = boxes[:, 2] - boxes[:, 0]
heights = boxes[:, 3] - boxes[:, 1]
ctr_x = boxes[:, 0] + 0.5 * widths
ctr_y = boxes[:, 1] + 0.5 * heights
deltas = deltas * paddle.to_tensor(delta_std) + paddle.to_tensor(delta_mean)
dx = deltas[:, 0::4]
dy = deltas[:, 1::4]
dw = deltas[:, 2::4]
dh = deltas[:, 3::4]
# Prevent sending too large values into paddle.exp()
dx = dx * widths.unsqueeze(1)
dy = dy * heights.unsqueeze(1)
if ctr_clip is not None:
dx = paddle.clip(dx, max=ctr_clip, min=-ctr_clip)
dy = paddle.clip(dy, max=ctr_clip, min=-ctr_clip)
dw = paddle.clip(dw, max=clip_scale)
dh = paddle.clip(dh, max=clip_scale)
else:
dw = dw.clip(min=-clip_scale, max=clip_scale)
dh = dh.clip(min=-clip_scale, max=clip_scale)
pred_ctr_x = dx + ctr_x.unsqueeze(1)
pred_ctr_y = dy + ctr_y.unsqueeze(1)
pred_w = paddle.exp(dw) * widths.unsqueeze(1)
pred_h = paddle.exp(dh) * heights.unsqueeze(1)
pred_boxes = []
pred_boxes.append(pred_ctr_x - 0.5 * pred_w)
pred_boxes.append(pred_ctr_y - 0.5 * pred_h)
pred_boxes.append(pred_ctr_x + 0.5 * pred_w)
pred_boxes.append(pred_ctr_y + 0.5 * pred_h)
pred_boxes = paddle.stack(pred_boxes, axis=-1)
if max_shape is not None:
pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip(
min=0, max=max_shape[1])
pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip(
min=0, max=max_shape[0])
return pred_boxes
def expand_bbox(bboxes, scale):
w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5
h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5
x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5
y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5
w_half *= scale
h_half *= scale
bboxes_exp = np.zeros(bboxes.shape, dtype=np.float32)
bboxes_exp[:, 0] = x_c - w_half
bboxes_exp[:, 2] = x_c + w_half
bboxes_exp[:, 1] = y_c - h_half
bboxes_exp[:, 3] = y_c + h_half
return bboxes_exp
def clip_bbox(boxes, im_shape):
h, w = im_shape[0], im_shape[1]
x1 = boxes[:, 0].clip(0, w)
y1 = boxes[:, 1].clip(0, h)
x2 = boxes[:, 2].clip(0, w)
y2 = boxes[:, 3].clip(0, h)
return paddle.stack([x1, y1, x2, y2], axis=1)
def nonempty_bbox(boxes, min_size=0, return_mask=False):
w = boxes[:, 2] - boxes[:, 0]
h = boxes[:, 3] - boxes[:, 1]
mask = paddle.logical_and(h > min_size, w > min_size)
if return_mask:
return mask
keep = paddle.nonzero(mask).flatten()
return keep
def bbox_area(boxes):
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
def bbox_overlaps(boxes1, boxes2):
"""
Calculate overlaps between boxes1 and boxes2
Args:
boxes1 (Tensor): boxes with shape [M, 4]
boxes2 (Tensor): boxes with shape [N, 4]
Return:
overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N]
"""
M = boxes1.shape[0]
N = boxes2.shape[0]
if M * N == 0:
return paddle.zeros([M, N], dtype='float32')
area1 = bbox_area(boxes1)
area2 = bbox_area(boxes2)
xy_max = paddle.minimum(
paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:])
xy_min = paddle.maximum(
paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2])
width_height = xy_max - xy_min
width_height = width_height.clip(min=0)
inter = width_height.prod(axis=2)
overlaps = paddle.where(inter > 0, inter /
(paddle.unsqueeze(area1, 1) + area2 - inter),
paddle.zeros_like(inter))
return overlaps
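# Illustrative sketch (assumed boxes, not part of the original file): two
# unit boxes overlapping over half their area give IoU = 0.5 / 1.5 = 1/3.
def _bbox_overlaps_demo():
    a = paddle.to_tensor([[0., 0., 1., 1.]])
    b = paddle.to_tensor([[0.5, 0., 1.5, 1.]])
    return bbox_overlaps(a, b)  # [[0.3333]]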
def batch_bbox_overlaps(bboxes1,
bboxes2,
mode='iou',
is_aligned=False,
eps=1e-6):
"""Calculate overlap between two set of bboxes.
    If ``is_aligned`` is ``False``, calculate the overlaps between each bbox
    of bboxes1 and bboxes2, otherwise calculate the overlaps between each
    aligned pair of bboxes1 and bboxes2.
Args:
        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
B indicates the batch dim, in shape (B1, B2, ..., Bn).
            If ``is_aligned`` is ``True``, then m and n must be equal.
mode (str): "iou" (intersection over union) or "iof" (intersection over
foreground).
is_aligned (bool, optional): If True, then m and n must be equal.
Default False.
eps (float, optional): A value added to the denominator for numerical
stability. Default 1e-6.
Returns:
        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
"""
assert mode in ['iou', 'iof', 'giou'], 'Unsupported mode {}'.format(mode)
    # Either the boxes are empty or the last dimension of boxes is 4
assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)
# Batch dim must be the same
# Batch dim: (B1, B2, ... Bn)
assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
batch_shape = bboxes1.shape[:-2]
rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0
cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0
if is_aligned:
assert rows == cols
if rows * cols == 0:
if is_aligned:
return paddle.full(batch_shape + (rows, ), 1)
else:
return paddle.full(batch_shape + (rows, cols), 1)
area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])
if is_aligned:
lt = paddle.maximum(bboxes1[:, :2], bboxes2[:, :2]) # [B, rows, 2]
rb = paddle.minimum(bboxes1[:, 2:], bboxes2[:, 2:]) # [B, rows, 2]
wh = (rb - lt).clip(min=0) # [B, rows, 2]
overlap = wh[:, 0] * wh[:, 1]
if mode in ['iou', 'giou']:
union = area1 + area2 - overlap
else:
union = area1
if mode == 'giou':
enclosed_lt = paddle.minimum(bboxes1[:, :2], bboxes2[:, :2])
enclosed_rb = paddle.maximum(bboxes1[:, 2:], bboxes2[:, 2:])
else:
lt = paddle.maximum(bboxes1[:, :2].reshape([rows, 1, 2]),
bboxes2[:, :2]) # [B, rows, cols, 2]
rb = paddle.minimum(bboxes1[:, 2:].reshape([rows, 1, 2]),
bboxes2[:, 2:]) # [B, rows, cols, 2]
wh = (rb - lt).clip(min=0) # [B, rows, cols, 2]
overlap = wh[:, :, 0] * wh[:, :, 1]
if mode in ['iou', 'giou']:
union = area1.reshape([rows,1]) \
+ area2.reshape([1,cols]) - overlap
else:
union = area1[:, None]
if mode == 'giou':
enclosed_lt = paddle.minimum(bboxes1[:, :2].reshape([rows, 1, 2]),
bboxes2[:, :2])
enclosed_rb = paddle.maximum(bboxes1[:, 2:].reshape([rows, 1, 2]),
bboxes2[:, 2:])
eps = paddle.to_tensor([eps])
union = paddle.maximum(union, eps)
ious = overlap / union
if mode in ['iou', 'iof']:
return ious
# calculate gious
enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
enclose_area = enclose_wh[:, :, 0] * enclose_wh[:, :, 1]
enclose_area = paddle.maximum(enclose_area, eps)
gious = ious - (enclose_area - union) / enclose_area
return 1 - gious
def xywh2xyxy(box):
x, y, w, h = box
x1 = x - w * 0.5
y1 = y - h * 0.5
x2 = x + w * 0.5
y2 = y + h * 0.5
return [x1, y1, x2, y2]
def make_grid(h, w, dtype):
yv, xv = paddle.meshgrid([paddle.arange(h), paddle.arange(w)])
return paddle.stack((xv, yv), 2).cast(dtype=dtype)
def decode_yolo(box, anchor, downsample_ratio):
"""decode yolo box
Args:
        box (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
        anchor (list): anchor with the shape [na, 2]
        downsample_ratio (int): downsample ratio of the feature map, e.g. 32
Return:
box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1]
"""
x, y, w, h = box
na, grid_h, grid_w = x.shape[1:4]
grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2))
x1 = (x + grid[:, :, :, :, 0:1]) / grid_w
y1 = (y + grid[:, :, :, :, 1:2]) / grid_h
anchor = paddle.to_tensor(anchor, dtype=x.dtype)
anchor = anchor.reshape((1, na, 1, 1, 2))
w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w)
h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h)
return [x1, y1, w1, h1]
def batch_iou_similarity(box1, box2, eps=1e-9):
"""Calculate iou of box1 and box2 in batch
Args:
box1 (Tensor): box with the shape [N, M1, 4]
box2 (Tensor): box with the shape [N, M2, 4]
Return:
iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2]
"""
box1 = box1.unsqueeze(2) # [N, M1, 4] -> [N, M1, 1, 4]
box2 = box2.unsqueeze(1) # [N, M2, 4] -> [N, 1, M2, 4]
px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4]
gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4]
x1y1 = paddle.maximum(px1y1, gx1y1)
x2y2 = paddle.minimum(px2y2, gx2y2)
overlap = (x2y2 - x1y1).clip(0).prod(-1)
area1 = (px2y2 - px1y1).clip(0).prod(-1)
area2 = (gx2y2 - gx1y1).clip(0).prod(-1)
union = area1 + area2 - overlap + eps
return overlap / union
def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9):
"""calculate the iou of box1 and box2
Args:
box1 (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
box2 (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
giou (bool): whether use giou or not, default False
diou (bool): whether use diou or not, default False
ciou (bool): whether use ciou or not, default False
eps (float): epsilon to avoid divide by zero
Return:
        iou (Tensor): iou of box1 and box2, with the shape [b, na, h, w, 1]
"""
px1, py1, px2, py2 = box1
gx1, gy1, gx2, gy2 = box2
x1 = paddle.maximum(px1, gx1)
y1 = paddle.maximum(py1, gy1)
x2 = paddle.minimum(px2, gx2)
y2 = paddle.minimum(py2, gy2)
overlap = ((x2 - x1).clip(0)) * ((y2 - y1).clip(0))
area1 = (px2 - px1) * (py2 - py1)
area1 = area1.clip(0)
area2 = (gx2 - gx1) * (gy2 - gy1)
area2 = area2.clip(0)
union = area1 + area2 - overlap + eps
iou = overlap / union
if giou or ciou or diou:
# convex w, h
cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1)
ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1)
if giou:
c_area = cw * ch + eps
return iou - (c_area - union) / c_area
else:
# convex diagonal squared
c2 = cw**2 + ch**2 + eps
# center distance
rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4
if diou:
return iou - rho2 / c2
else:
w1, h1 = px2 - px1, py2 - py1 + eps
w2, h2 = gx2 - gx1, gy2 - gy1 + eps
delta = paddle.atan(w1 / h1) - paddle.atan(w2 / h2)
v = (4 / math.pi**2) * paddle.pow(delta, 2)
alpha = v / (1 + eps - iou + v)
alpha.stop_gradient = True
return iou - (rho2 / c2 + v * alpha)
else:
return iou
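# Illustrative sketch (assumed single-element tensors, not part of the
# original file): every coordinate carries the [b, na, h, w, 1] layout the
# docstring requires; GIoU subtracts the enclosing-box penalty from IoU.
def _bbox_iou_demo():
    p = [paddle.full([1, 1, 1, 1, 1], v) for v in (0., 0., 2., 2.)]
    g = [paddle.full([1, 1, 1, 1, 1], v) for v in (1., 1., 3., 3.)]
    iou = bbox_iou(p, g)              # overlap 1 / union 7 ~= 0.143
    giou = bbox_iou(p, g, giou=True)  # 1/7 - (9 - 7)/9 ~= -0.079
    return iou, giou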
def bbox_iou_np_expand(box1, box2, x1y1x2y2=True, eps=1e-16):
"""
Calculate the iou of box1 and box2 with numpy.
Args:
box1 (ndarray): [N, 4]
box2 (ndarray): [M, 4], usually N != M
        x1y1x2y2 (bool): whether boxes are in x1y1x2y2 style, default True
eps (float): epsilon to avoid divide by zero
Return:
iou (ndarray): iou of box1 and box2, [N, M]
"""
N, M = len(box1), len(box2) # usually N != M
if x1y1x2y2:
b1_x1, b1_y1 = box1[:, 0], box1[:, 1]
b1_x2, b1_y2 = box1[:, 2], box1[:, 3]
b2_x1, b2_y1 = box2[:, 0], box2[:, 1]
b2_x2, b2_y2 = box2[:, 2], box2[:, 3]
else:
# cxcywh style
# Transform from center and width to exact coordinates
b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
# get the coordinates of the intersection rectangle
inter_rect_x1 = np.zeros((N, M), dtype=np.float32)
inter_rect_y1 = np.zeros((N, M), dtype=np.float32)
inter_rect_x2 = np.zeros((N, M), dtype=np.float32)
inter_rect_y2 = np.zeros((N, M), dtype=np.float32)
for i in range(len(box2)):
inter_rect_x1[:, i] = np.maximum(b1_x1, b2_x1[i])
inter_rect_y1[:, i] = np.maximum(b1_y1, b2_y1[i])
inter_rect_x2[:, i] = np.minimum(b1_x2, b2_x2[i])
inter_rect_y2[:, i] = np.minimum(b1_y2, b2_y2[i])
# Intersection area
inter_area = np.maximum(inter_rect_x2 - inter_rect_x1, 0) * np.maximum(
inter_rect_y2 - inter_rect_y1, 0)
# Union Area
b1_area = np.repeat(
((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).reshape(-1, 1), M, axis=-1)
b2_area = np.repeat(
((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).reshape(1, -1), N, axis=0)
ious = inter_area / (b1_area + b2_area - inter_area + eps)
return ious
def bbox2distance(points, bbox, max_dis=None, eps=0.1):
"""Decode bounding box based on distances.
Args:
points (Tensor): Shape (n, 2), [x, y].
bbox (Tensor): Shape (n, 4), "xyxy" format
max_dis (float): Upper bound of the distance.
        eps (float): a small value to ensure target < max_dis rather than <=
    Returns:
        Tensor: Encoded distances (left, top, right, bottom).
"""
left = points[:, 0] - bbox[:, 0]
top = points[:, 1] - bbox[:, 1]
right = bbox[:, 2] - points[:, 0]
bottom = bbox[:, 3] - points[:, 1]
if max_dis is not None:
left = left.clip(min=0, max=max_dis - eps)
top = top.clip(min=0, max=max_dis - eps)
right = right.clip(min=0, max=max_dis - eps)
bottom = bottom.clip(min=0, max=max_dis - eps)
return paddle.stack([left, top, right, bottom], -1)
def distance2bbox(points, distance, max_shape=None):
"""Decode distance prediction to bounding box.
Args:
points (Tensor): Shape (n, 2), [x, y].
distance (Tensor): Distance from the given point to 4
boundaries (left, top, right, bottom).
max_shape (tuple): Shape of the image.
Returns:
Tensor: Decoded bboxes.
"""
x1 = points[:, 0] - distance[:, 0]
y1 = points[:, 1] - distance[:, 1]
x2 = points[:, 0] + distance[:, 2]
y2 = points[:, 1] + distance[:, 3]
if max_shape is not None:
x1 = x1.clip(min=0, max=max_shape[1])
y1 = y1.clip(min=0, max=max_shape[0])
x2 = x2.clip(min=0, max=max_shape[1])
y2 = y2.clip(min=0, max=max_shape[0])
return paddle.stack([x1, y1, x2, y2], -1)
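# Illustrative round trip (assumed points, not part of the original file):
# distance2bbox() inverts bbox2distance() when no clipping is applied.
def _distance_round_trip_demo():
    points = paddle.to_tensor([[5., 5.]])
    bbox = paddle.to_tensor([[2., 3., 9., 8.]])
    dist = bbox2distance(points, bbox)  # [[3., 2., 4., 3.]] (l, t, r, b)
    return distance2bbox(points, dist)  # recovers [[2., 3., 9., 8.]]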
def bbox_center(boxes):
"""Get bbox centers from boxes.
Args:
boxes (Tensor): boxes with shape (..., 4), "xmin, ymin, xmax, ymax" format.
Returns:
Tensor: boxes centers with shape (..., 2), "cx, cy" format.
"""
boxes_cx = (boxes[..., 0] + boxes[..., 2]) / 2
boxes_cy = (boxes[..., 1] + boxes[..., 3]) / 2
return paddle.stack([boxes_cx, boxes_cy], axis=-1)
def batch_distance2bbox(points, distance, max_shapes=None):
"""Decode distance prediction to bounding box for batch.
Args:
points (Tensor): [B, ..., 2], "xy" format
distance (Tensor): [B, ..., 4], "ltrb" format
max_shapes (Tensor): [B, 2], "h,w" format, Shape of the image.
Returns:
Tensor: Decoded bboxes, "x1y1x2y2" format.
"""
lt, rb = paddle.split(distance, 2, -1)
    # when adding a Tensor and a parameter, the parameter should come second
x1y1 = -lt + points
x2y2 = rb + points
out_bbox = paddle.concat([x1y1, x2y2], -1)
if max_shapes is not None:
max_shapes = max_shapes.flip(-1).tile([1, 2])
delta_dim = out_bbox.ndim - max_shapes.ndim
for _ in range(delta_dim):
max_shapes.unsqueeze_(1)
out_bbox = paddle.where(out_bbox < max_shapes, out_bbox, max_shapes)
out_bbox = paddle.where(out_bbox > 0, out_bbox,
paddle.zeros_like(out_bbox))
return out_bbox
def iou_similarity(box1, box2, eps=1e-10):
"""Calculate iou of box1 and box2
Args:
box1 (Tensor): box with the shape [M1, 4]
box2 (Tensor): box with the shape [M2, 4]
Return:
iou (Tensor): iou between box1 and box2 with the shape [M1, M2]
"""
box1 = box1.unsqueeze(1) # [M1, 4] -> [M1, 1, 4]
box2 = box2.unsqueeze(0) # [M2, 4] -> [1, M2, 4]
px1y1, px2y2 = box1[:, :, 0:2], box1[:, :, 2:4]
gx1y1, gx2y2 = box2[:, :, 0:2], box2[:, :, 2:4]
x1y1 = paddle.maximum(px1y1, gx1y1)
x2y2 = paddle.minimum(px2y2, gx2y2)
overlap = (x2y2 - x1y1).clip(0).prod(-1)
area1 = (px2y2 - px1y1).clip(0).prod(-1)
area2 = (gx2y2 - gx1y1).clip(0).prod(-1)
union = area1 + area2 - overlap + eps
return overlap / union
================================================
FILE: ppdet/modeling/clrnet_utils.py
================================================
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.modeling.initializer import constant_
from paddle.nn.initializer import KaimingNormal
class ConvModule(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=False,
norm_type='bn',
wtih_act=True):
super(ConvModule, self).__init__()
assert norm_type in ['bn', 'sync_bn', 'gn', None]
self.with_norm = norm_type is not None
self.wtih_act = wtih_act
self.conv = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias_attr=bias,
weight_attr=KaimingNormal())
if self.with_norm:
if norm_type == 'bn':
self.bn = nn.BatchNorm2D(out_channels)
elif norm_type == 'gn':
self.bn = nn.GroupNorm(out_channels, out_channels)
if self.wtih_act:
self.act = nn.ReLU()
def forward(self, inputs):
x = self.conv(inputs)
if self.with_norm:
x = self.bn(x)
if self.wtih_act:
x = self.act(x)
return x
def LinearModule(hidden_dim):
return nn.LayerList(
[nn.Linear(
hidden_dim, hidden_dim, bias_attr=True), nn.ReLU()])
class FeatureResize(nn.Layer):
def __init__(self, size=(10, 25)):
super(FeatureResize, self).__init__()
self.size = size
def forward(self, x):
x = F.interpolate(x, self.size)
return x.flatten(2)
class ROIGather(nn.Layer):
'''
    ROIGather module for gathering global information
    Args:
        in_channels: prior feature channels
        num_priors: number of predefined priors
        sample_points: number of points sampled when extracting features from a line
        fc_hidden_dim: the fc output channel
        refine_layers: the total number of refine layers
'''
def __init__(self,
in_channels,
num_priors,
sample_points,
fc_hidden_dim,
refine_layers,
mid_channels=48):
super(ROIGather, self).__init__()
self.in_channels = in_channels
self.num_priors = num_priors
self.f_key = ConvModule(
in_channels=self.in_channels,
out_channels=self.in_channels,
kernel_size=1,
stride=1,
padding=0,
norm_type='bn')
self.f_query = nn.Sequential(
nn.Conv1D(
in_channels=num_priors,
out_channels=num_priors,
kernel_size=1,
stride=1,
padding=0,
groups=num_priors),
nn.ReLU(), )
self.f_value = nn.Conv2D(
in_channels=self.in_channels,
out_channels=self.in_channels,
kernel_size=1,
stride=1,
padding=0)
self.W = nn.Conv1D(
in_channels=num_priors,
out_channels=num_priors,
kernel_size=1,
stride=1,
padding=0,
groups=num_priors)
self.resize = FeatureResize()
constant_(self.W.weight, 0)
constant_(self.W.bias, 0)
self.convs = nn.LayerList()
self.catconv = nn.LayerList()
for i in range(refine_layers):
self.convs.append(
ConvModule(
in_channels,
mid_channels, (9, 1),
padding=(4, 0),
bias=False,
norm_type='bn'))
self.catconv.append(
ConvModule(
mid_channels * (i + 1),
in_channels, (9, 1),
padding=(4, 0),
bias=False,
norm_type='bn'))
self.fc = nn.Linear(
sample_points * fc_hidden_dim, fc_hidden_dim, bias_attr=True)
self.fc_norm = nn.LayerNorm(fc_hidden_dim)
def roi_fea(self, x, layer_index):
feats = []
for i, feature in enumerate(x):
feat_trans = self.convs[i](feature)
feats.append(feat_trans)
cat_feat = paddle.concat(feats, axis=1)
cat_feat = self.catconv[layer_index](cat_feat)
return cat_feat
def forward(self, roi_features, x, layer_index):
'''
Args:
roi_features: prior feature, shape: (Batch * num_priors, prior_feat_channel, sample_point, 1)
x: feature map
layer_index: currently on which layer to refine
Return:
roi: prior features with gathered global information, shape: (Batch, num_priors, fc_hidden_dim)
'''
        roi = self.roi_fea(roi_features, layer_index)
        bs = x.shape[0]
        # flatten per-prior features: (bs * num_priors, sample_points * C)
        roi = roi.reshape([bs * self.num_priors, -1])
        roi = self.fc(roi)
        roi = F.relu(self.fc_norm(roi))
        # restore the batch dim: (bs, num_priors, fc_hidden_dim)
        roi = roi.reshape([bs, self.num_priors, -1])
query = roi
value = self.resize(self.f_value(x)) # (B, C, N) global feature
query = self.f_query(
query) # (B, N, 1) sample context feature from prior roi
key = self.f_key(x)
value = value.transpose(perm=[0, 2, 1])
key = self.resize(key) # (B, C, N) global feature
sim_map = paddle.matmul(query, key)
sim_map = (self.in_channels**-.5) * sim_map
sim_map = F.softmax(sim_map, axis=-1)
context = paddle.matmul(sim_map, value)
context = self.W(context)
roi = roi + F.dropout(context, p=0.1, training=self.training)
return roi
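# Illustrative usage sketch (assumed CLRNet-like sizes, not part of the
# original file): 2 images, 192 lane priors, 36 sample points and 64
# channels; one RoI feature tensor is passed per refine layer.
def _roi_gather_demo():
    gather = ROIGather(in_channels=64, num_priors=192, sample_points=36,
                       fc_hidden_dim=64, refine_layers=3)
    roi_feats = [paddle.randn([2 * 192, 64, 36, 1]) for _ in range(3)]
    feature_map = paddle.randn([2, 64, 20, 50])
    return gather(roi_feats, feature_map, layer_index=2)  # [2, 192, 64]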
class SegDecoder(nn.Layer):
'''
    Optional segmentation decoder
'''
def __init__(self,
image_height,
image_width,
num_class,
prior_feat_channels=64,
refine_layers=3):
super().__init__()
self.dropout = nn.Dropout2D(0.1)
self.conv = nn.Conv2D(prior_feat_channels * refine_layers, num_class, 1)
self.image_height = image_height
self.image_width = image_width
def forward(self, x):
x = self.dropout(x)
x = self.conv(x)
x = F.interpolate(
x,
size=[self.image_height, self.image_width],
mode='bilinear',
align_corners=False)
return x
def accuracy(pred, target, topk=1, thresh=None):
"""Calculate accuracy according to the prediction and target.
Args:
        pred (paddle.Tensor): The model prediction, shape (N, num_class)
        target (paddle.Tensor): The target of each prediction, shape (N, )
topk (int | tuple[int], optional): If the predictions in ``topk``
matches the target, the predictions will be regarded as
correct ones. Defaults to 1.
thresh (float, optional): If not None, predictions with scores under
this threshold are considered incorrect. Default to None.
Returns:
float | tuple[float]: If the input ``topk`` is a single integer,
the function will return a single float as accuracy. If
``topk`` is a tuple containing multiple integers, the
function will return a tuple containing accuracies of
each ``topk`` number.
"""
assert isinstance(topk, (int, tuple))
if isinstance(topk, int):
topk = (topk, )
return_single = True
else:
return_single = False
maxk = max(topk)
if pred.shape[0] == 0:
        accu = [paddle.zeros([1], dtype=pred.dtype) for _ in range(len(topk))]
return accu[0] if return_single else accu
assert pred.ndim == 2 and target.ndim == 1
assert pred.shape[0] == target.shape[0]
assert maxk <= pred.shape[1], \
f'maxk {maxk} exceeds pred dimension {pred.shape[1]}'
pred_value, pred_label = pred.topk(maxk, axis=1)
pred_label = pred_label.t() # transpose to shape (maxk, N)
correct = pred_label.equal(target.reshape([1, -1]).expand_as(pred_label))
if thresh is not None:
# Only prediction values larger than thresh are counted as correct
correct = correct & (pred_value > thresh).t()
res = []
for k in topk:
correct_k = correct[:k].reshape([-1]).cast("float32").sum(0,
keepdim=True)
correct_k = correct_k * (100.0 / pred.shape[0])
res.append(correct_k)
return res[0] if return_single else res
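# Illustrative sketch (assumed logits, not part of the original file): the
# top-1 prediction is correct for one of the two samples, so accuracy is 50%.
def _accuracy_demo():
    pred = paddle.to_tensor([[0.1, 0.9], [0.8, 0.2]])
    target = paddle.to_tensor([1, 1])
    return accuracy(pred, target, topk=1)  # Tensor([50.])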
class Accuracy(nn.Layer):
def __init__(self, topk=(1, ), thresh=None):
"""Module to calculate the accuracy.
Args:
topk (tuple, optional): The criterion used to calculate the
accuracy. Defaults to (1,).
thresh (float, optional): If not None, predictions with scores
under this threshold are considered incorrect. Default to None.
"""
super().__init__()
self.topk = topk
self.thresh = thresh
def forward(self, pred, target):
"""Forward function to calculate accuracy.
Args:
            pred (paddle.Tensor): Prediction of models.
            target (paddle.Tensor): Target for each prediction.
Returns:
tuple[float]: The accuracies under different topk criterions.
"""
return accuracy(pred, target, self.topk, self.thresh)
================================================
FILE: ppdet/modeling/cls_utils.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def _get_class_default_kwargs(cls, *args, **kwargs):
"""
    Get the default arguments of a class's __init__ in dict format. If args or
    kwargs are specified, they replace the corresponding default arguments.
"""
varnames = cls.__init__.__code__.co_varnames
argcount = cls.__init__.__code__.co_argcount
keys = varnames[:argcount]
assert keys[0] == 'self'
keys = keys[1:]
values = list(cls.__init__.__defaults__)
assert len(values) == len(keys)
if len(args) > 0:
for i, arg in enumerate(args):
values[i] = arg
default_kwargs = dict(zip(keys, values))
if len(kwargs) > 0:
for k, v in kwargs.items():
default_kwargs[k] = v
return default_kwargs
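# Illustrative sketch (hypothetical class, not part of the original file):
# defaults are read from __init__, then overridden by args / kwargs.
class _DemoOp:
    def __init__(self, alpha=1, beta=2):
        self.alpha, self.beta = alpha, beta
def _default_kwargs_demo():
    return _get_class_default_kwargs(_DemoOp, beta=5)  # {'alpha': 1, 'beta': 5}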
================================================
FILE: ppdet/modeling/heads/__init__.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import bbox_head
from . import mask_head
from . import yolo_head
from . import roi_extractor
from . import ssd_head
from . import fcos_head
from . import solov2_head
from . import ttf_head
from . import cascade_head
from . import face_head
from . import s2anet_head
from . import keypoint_hrhrnet_head
from . import centernet_head
from . import gfl_head
from . import simota_head
from . import pico_head
from . import detr_head
from . import sparsercnn_head
from . import tood_head
from . import retina_head
from . import ppyoloe_head
from . import fcosr_head
from . import ppyoloe_r_head
from . import yolof_head
from . import ppyoloe_contrast_head
from . import centertrack_head
from . import sparse_roi_head
from . import vitpose_head
from . import clrnet_head
from . import ppyoloe_ins_head
from .bbox_head import *
from .mask_head import *
from .yolo_head import *
from .roi_extractor import *
from .ssd_head import *
from .fcos_head import *
from .solov2_head import *
from .ttf_head import *
from .cascade_head import *
from .face_head import *
from .s2anet_head import *
from .keypoint_hrhrnet_head import *
from .centernet_head import *
from .gfl_head import *
from .simota_head import *
from .pico_head import *
from .detr_head import *
from .sparsercnn_head import *
from .tood_head import *
from .retina_head import *
from .ppyoloe_head import *
from .fcosr_head import *
from .ppyoloe_r_head import *
from .yolof_head import *
from .ppyoloe_contrast_head import *
from .centertrack_head import *
from .sparse_roi_head import *
from .petr_head import *
from .vitpose_head import *
from .clrnet_head import *
from .ppyoloe_ins_head import PPYOLOEInsHead
================================================
FILE: ppdet/modeling/heads/bbox_head.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import Normal, XavierUniform, KaimingNormal
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, create
from .roi_extractor import RoIAlign
from ..shape_spec import ShapeSpec
from ..bbox_utils import bbox2delta
from ..cls_utils import _get_class_default_kwargs
from ppdet.modeling.layers import ConvNormLayer
__all__ = ['TwoFCHead', 'XConvNormHead', 'BBoxHead']
@register
class TwoFCHead(nn.Layer):
"""
    RCNN bbox head with two FC layers to extract features
Args:
in_channel (int): Input channel which can be derived by from_config
out_channel (int): Output channel
resolution (int): Resolution of input feature map, default 7
"""
def __init__(self, in_channel=256, out_channel=1024, resolution=7):
super(TwoFCHead, self).__init__()
self.in_channel = in_channel
self.out_channel = out_channel
fan = in_channel * resolution * resolution
self.fc6 = nn.Linear(
in_channel * resolution * resolution,
out_channel,
weight_attr=paddle.ParamAttr(
initializer=XavierUniform(fan_out=fan)))
self.fc6.skip_quant = True
self.fc7 = nn.Linear(
out_channel,
out_channel,
weight_attr=paddle.ParamAttr(initializer=XavierUniform()))
self.fc7.skip_quant = True
@classmethod
def from_config(cls, cfg, input_shape):
s = input_shape
s = s[0] if isinstance(s, (list, tuple)) else s
return {'in_channel': s.channels}
@property
def out_shape(self):
return [ShapeSpec(channels=self.out_channel, )]
def forward(self, rois_feat):
rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1)
fc6 = self.fc6(rois_feat)
fc6 = F.relu(fc6)
fc7 = self.fc7(fc6)
fc7 = F.relu(fc7)
return fc7
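# Illustrative usage sketch (assumed FPN sizes, not part of the original
# file): 7x7 RoI features with 256 channels are flattened and mapped to a
# 1024-d vector by the two FC layers.
def _two_fc_head_demo():
    head = TwoFCHead(in_channel=256, out_channel=1024, resolution=7)
    rois_feat = paddle.randn([8, 256, 7, 7])  # 8 RoIs
    return head(rois_feat)  # [8, 1024]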
@register
class XConvNormHead(nn.Layer):
__shared__ = ['norm_type', 'freeze_norm']
"""
    RCNN bbox head with several convolution layers
Args:
in_channel (int): Input channels which can be derived by from_config
num_convs (int): The number of conv layers
conv_dim (int): The number of channels for the conv layers
out_channel (int): Output channels
resolution (int): Resolution of input feature map
norm_type (string): Norm type, bn, gn, sync_bn are available,
default `gn`
freeze_norm (bool): Whether to freeze the norm
stage_name (string): Prefix name for conv layer, '' by default
"""
def __init__(self,
in_channel=256,
num_convs=4,
conv_dim=256,
out_channel=1024,
resolution=7,
norm_type='gn',
freeze_norm=False,
stage_name=''):
super(XConvNormHead, self).__init__()
self.in_channel = in_channel
self.num_convs = num_convs
self.conv_dim = conv_dim
self.out_channel = out_channel
self.norm_type = norm_type
self.freeze_norm = freeze_norm
self.bbox_head_convs = []
fan = conv_dim * 3 * 3
initializer = KaimingNormal(fan_in=fan)
for i in range(self.num_convs):
in_c = in_channel if i == 0 else conv_dim
head_conv_name = stage_name + 'bbox_head_conv{}'.format(i)
head_conv = self.add_sublayer(
head_conv_name,
ConvNormLayer(
ch_in=in_c,
ch_out=conv_dim,
filter_size=3,
stride=1,
norm_type=self.norm_type,
freeze_norm=self.freeze_norm,
initializer=initializer))
self.bbox_head_convs.append(head_conv)
fan = conv_dim * resolution * resolution
self.fc6 = nn.Linear(
conv_dim * resolution * resolution,
out_channel,
weight_attr=paddle.ParamAttr(
initializer=XavierUniform(fan_out=fan)),
bias_attr=paddle.ParamAttr(
learning_rate=2., regularizer=L2Decay(0.)))
@classmethod
def from_config(cls, cfg, input_shape):
s = input_shape
s = s[0] if isinstance(s, (list, tuple)) else s
return {'in_channel': s.channels}
@property
def out_shape(self):
return [ShapeSpec(channels=self.out_channel, )]
def forward(self, rois_feat):
for i in range(self.num_convs):
rois_feat = F.relu(self.bbox_head_convs[i](rois_feat))
rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1)
fc6 = F.relu(self.fc6(rois_feat))
return fc6
@register
class BBoxHead(nn.Layer):
__shared__ = ['num_classes', 'use_cot']
__inject__ = ['bbox_assigner', 'bbox_loss', 'loss_cot']
"""
RCNN bbox head
Args:
head (nn.Layer): Extract feature in bbox head
in_channel (int): Input channel after RoI extractor
roi_extractor (object): The module of RoI Extractor
bbox_assigner (object): The module of Box Assigner, label and sample the
box.
with_pool (bool): Whether to use pooling for the RoI feature.
num_classes (int): The number of classes
bbox_weight (List[float]): The weight to get the decode box
cot_classes (int): The number of base classes
        loss_cot (object): The module of the label co-tuning loss
        use_cot (bool): whether to use label co-tuning
"""
def __init__(self,
head,
in_channel,
roi_extractor=_get_class_default_kwargs(RoIAlign),
bbox_assigner='BboxAssigner',
with_pool=False,
num_classes=80,
bbox_weight=[10., 10., 5., 5.],
bbox_loss=None,
loss_normalize_pos=False,
cot_classes=None,
loss_cot='COTLoss',
use_cot=False):
super(BBoxHead, self).__init__()
self.head = head
self.roi_extractor = roi_extractor
if isinstance(roi_extractor, dict):
self.roi_extractor = RoIAlign(**roi_extractor)
self.bbox_assigner = bbox_assigner
self.with_pool = with_pool
self.num_classes = num_classes
self.bbox_weight = bbox_weight
self.bbox_loss = bbox_loss
self.loss_normalize_pos = loss_normalize_pos
self.loss_cot = loss_cot
self.cot_relation = None
self.cot_classes = cot_classes
self.use_cot = use_cot
if use_cot:
self.cot_bbox_score = nn.Linear(
in_channel,
self.num_classes + 1,
weight_attr=paddle.ParamAttr(initializer=Normal(
mean=0.0, std=0.01)))
self.bbox_score = nn.Linear(
in_channel,
self.cot_classes + 1,
weight_attr=paddle.ParamAttr(initializer=Normal(
mean=0.0, std=0.01)))
self.cot_bbox_score.skip_quant = True
else:
self.bbox_score = nn.Linear(
in_channel,
self.num_classes + 1,
weight_attr=paddle.ParamAttr(initializer=Normal(
mean=0.0, std=0.01)))
self.bbox_score.skip_quant = True
self.bbox_delta = nn.Linear(
in_channel,
4 * self.num_classes,
weight_attr=paddle.ParamAttr(initializer=Normal(
mean=0.0, std=0.001)))
self.bbox_delta.skip_quant = True
self.assigned_label = None
self.assigned_rois = None
def init_cot_head(self, relationship):
self.cot_relation = relationship
@classmethod
def from_config(cls, cfg, input_shape):
roi_pooler = cfg['roi_extractor']
assert isinstance(roi_pooler, dict)
kwargs = RoIAlign.from_config(cfg, input_shape)
roi_pooler.update(kwargs)
kwargs = {'input_shape': input_shape}
head = create(cfg['head'], **kwargs)
return {
'roi_extractor': roi_pooler,
'head': head,
'in_channel': head.out_shape[0].channels
}
def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None, cot=False):
"""
body_feats (list[Tensor]): Feature maps from backbone
rois (list[Tensor]): RoIs generated from RPN module
rois_num (Tensor): The number of RoIs in each image
inputs (dict{Tensor}): The ground-truth of image
"""
if self.training:
rois, rois_num, targets = self.bbox_assigner(rois, rois_num, inputs)
self.assigned_rois = (rois, rois_num)
self.assigned_targets = targets
rois_feat = self.roi_extractor(body_feats, rois, rois_num)
bbox_feat = self.head(rois_feat)
if self.with_pool:
feat = F.adaptive_avg_pool2d(bbox_feat, output_size=1)
feat = paddle.squeeze(feat, axis=[2, 3])
else:
feat = bbox_feat
if self.use_cot:
scores = self.cot_bbox_score(feat)
cot_scores = self.bbox_score(feat)
else:
scores = self.bbox_score(feat)
deltas = self.bbox_delta(feat)
if self.training:
loss = self.get_loss(
scores,
deltas,
targets,
rois,
self.bbox_weight,
loss_normalize_pos=self.loss_normalize_pos)
if self.cot_relation is not None:
loss_cot = self.loss_cot(cot_scores, targets, self.cot_relation)
loss.update(loss_cot)
return loss, bbox_feat
else:
if cot:
pred = self.get_prediction(cot_scores, deltas)
else:
pred = self.get_prediction(scores, deltas)
return pred, self.head
def get_loss(self,
scores,
deltas,
targets,
rois,
bbox_weight,
loss_normalize_pos=False):
"""
scores (Tensor): scores from bbox head outputs
deltas (Tensor): deltas from bbox head outputs
targets (list[List[Tensor]]): bbox targets containing tgt_labels, tgt_bboxes and tgt_gt_inds
rois (List[Tensor]): RoIs generated in each batch
"""
cls_name = 'loss_bbox_cls'
reg_name = 'loss_bbox_reg'
loss_bbox = {}
# TODO: better pass args
tgt_labels, tgt_bboxes, tgt_gt_inds = targets
# bbox cls
tgt_labels = paddle.concat(tgt_labels) if len(
tgt_labels) > 1 else tgt_labels[0]
valid_inds = paddle.nonzero(tgt_labels >= 0).flatten()
if valid_inds.shape[0] == 0:
loss_bbox[cls_name] = paddle.zeros([1], dtype='float32')
else:
tgt_labels = tgt_labels.cast('int64')
tgt_labels.stop_gradient = True
if not loss_normalize_pos:
loss_bbox_cls = F.cross_entropy(
input=scores, label=tgt_labels, reduction='mean')
else:
loss_bbox_cls = F.cross_entropy(
input=scores, label=tgt_labels,
reduction='none').sum() / (tgt_labels.shape[0] + 1e-7)
loss_bbox[cls_name] = loss_bbox_cls
# bbox reg
cls_agnostic_bbox_reg = deltas.shape[1] == 4
fg_inds = paddle.nonzero(
paddle.logical_and(tgt_labels >= 0, tgt_labels <
self.num_classes)).flatten()
if fg_inds.numel() == 0:
# loss_bbox[reg_name] = paddle.zeros([1], dtype='float32')
loss_bbox[reg_name] = scores.mean() * 0. + deltas.mean() * 0.
return loss_bbox
if cls_agnostic_bbox_reg:
reg_delta = paddle.gather(deltas, fg_inds)
else:
fg_gt_classes = paddle.gather(tgt_labels, fg_inds)
reg_row_inds = paddle.arange(fg_gt_classes.shape[0]).unsqueeze(1)
reg_row_inds = paddle.tile(reg_row_inds, [1, 4]).reshape([-1, 1])
reg_col_inds = 4 * fg_gt_classes.unsqueeze(1) + paddle.arange(4)
reg_col_inds = reg_col_inds.reshape([-1, 1])
reg_inds = paddle.concat([reg_row_inds, reg_col_inds], axis=1)
reg_delta = paddle.gather(deltas, fg_inds)
reg_delta = paddle.gather_nd(reg_delta, reg_inds).reshape([-1, 4])
rois = paddle.concat(rois) if len(rois) > 1 else rois[0]
tgt_bboxes = paddle.concat(tgt_bboxes) if len(
tgt_bboxes) > 1 else tgt_bboxes[0]
reg_target = bbox2delta(rois, tgt_bboxes, bbox_weight)
reg_target = paddle.gather(reg_target, fg_inds)
reg_target.stop_gradient = True
if self.bbox_loss is not None:
reg_delta = self.bbox_transform(reg_delta)
reg_target = self.bbox_transform(reg_target)
if not loss_normalize_pos:
loss_bbox_reg = self.bbox_loss(
reg_delta, reg_target).sum() / tgt_labels.shape[0]
loss_bbox_reg *= self.num_classes
else:
loss_bbox_reg = self.bbox_loss(
reg_delta, reg_target).sum() / (tgt_labels.shape[0] + 1e-7)
else:
loss_bbox_reg = paddle.abs(reg_delta - reg_target).sum(
) / tgt_labels.shape[0]
loss_bbox[reg_name] = loss_bbox_reg
return loss_bbox
def bbox_transform(self, deltas, weights=[0.1, 0.1, 0.2, 0.2]):
wx, wy, ww, wh = weights
deltas = paddle.reshape(deltas, shape=(0, -1, 4))
dx = paddle.slice(deltas, axes=[2], starts=[0], ends=[1]) * wx
dy = paddle.slice(deltas, axes=[2], starts=[1], ends=[2]) * wy
dw = paddle.slice(deltas, axes=[2], starts=[2], ends=[3]) * ww
dh = paddle.slice(deltas, axes=[2], starts=[3], ends=[4]) * wh
dw = paddle.clip(dw, -1.e10, np.log(1000. / 16))
dh = paddle.clip(dh, -1.e10, np.log(1000. / 16))
pred_ctr_x = dx
pred_ctr_y = dy
pred_w = paddle.exp(dw)
pred_h = paddle.exp(dh)
x1 = pred_ctr_x - 0.5 * pred_w
y1 = pred_ctr_y - 0.5 * pred_h
x2 = pred_ctr_x + 0.5 * pred_w
y2 = pred_ctr_y + 0.5 * pred_h
x1 = paddle.reshape(x1, shape=(-1, ))
y1 = paddle.reshape(y1, shape=(-1, ))
x2 = paddle.reshape(x2, shape=(-1, ))
y2 = paddle.reshape(y2, shape=(-1, ))
return paddle.concat([x1, y1, x2, y2])
def get_prediction(self, score, delta):
bbox_prob = F.softmax(score)
return delta, bbox_prob
def get_head(self, ):
return self.head
def get_assigned_targets(self, ):
return self.assigned_targets
def get_assigned_rois(self, ):
return self.assigned_rois
================================================
FILE: ppdet/modeling/heads/cascade_head.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import Normal
from ppdet.core.workspace import register
from .bbox_head import BBoxHead, TwoFCHead, XConvNormHead
from .roi_extractor import RoIAlign
from ..shape_spec import ShapeSpec
from ..bbox_utils import delta2bbox, clip_bbox, nonempty_bbox
from ..cls_utils import _get_class_default_kwargs
__all__ = ['CascadeTwoFCHead', 'CascadeXConvNormHead', 'CascadeHead']
@register
class CascadeTwoFCHead(nn.Layer):
__shared__ = ['num_cascade_stage']
"""
Cascade RCNN bbox head with Two fc layers to extract feature
Args:
in_channel (int): Input channel which can be derived by from_config
out_channel (int): Output channel
resolution (int): Resolution of input feature map, default 7
num_cascade_stage (int): The number of cascade stage, default 3
"""
def __init__(self,
in_channel=256,
out_channel=1024,
resolution=7,
num_cascade_stage=3):
super(CascadeTwoFCHead, self).__init__()
self.in_channel = in_channel
self.out_channel = out_channel
self.head_list = []
for stage in range(num_cascade_stage):
head_per_stage = self.add_sublayer(
str(stage), TwoFCHead(in_channel, out_channel, resolution))
self.head_list.append(head_per_stage)
@classmethod
def from_config(cls, cfg, input_shape):
s = input_shape
s = s[0] if isinstance(s, (list, tuple)) else s
return {'in_channel': s.channels}
@property
def out_shape(self):
return [ShapeSpec(channels=self.out_channel, )]
def forward(self, rois_feat, stage=0):
out = self.head_list[stage](rois_feat)
return out
@register
class CascadeXConvNormHead(nn.Layer):
__shared__ = ['norm_type', 'freeze_norm', 'num_cascade_stage']
"""
    Cascade RCNN bbox head with several convolution layers
Args:
in_channel (int): Input channels which can be derived by from_config
num_convs (int): The number of conv layers
conv_dim (int): The number of channels for the conv layers
out_channel (int): Output channels
resolution (int): Resolution of input feature map
norm_type (string): Norm type, bn, gn, sync_bn are available,
default `gn`
freeze_norm (bool): Whether to freeze the norm
num_cascade_stage (int): The number of cascade stage, default 3
"""
def __init__(self,
in_channel=256,
num_convs=4,
conv_dim=256,
out_channel=1024,
resolution=7,
norm_type='gn',
freeze_norm=False,
num_cascade_stage=3):
super(CascadeXConvNormHead, self).__init__()
self.in_channel = in_channel
self.out_channel = out_channel
self.head_list = []
for stage in range(num_cascade_stage):
head_per_stage = self.add_sublayer(
str(stage),
XConvNormHead(
in_channel,
num_convs,
conv_dim,
out_channel,
resolution,
norm_type,
freeze_norm,
stage_name='stage{}_'.format(stage)))
self.head_list.append(head_per_stage)
@classmethod
def from_config(cls, cfg, input_shape):
s = input_shape
s = s[0] if isinstance(s, (list, tuple)) else s
return {'in_channel': s.channels}
@property
def out_shape(self):
return [ShapeSpec(channels=self.out_channel, )]
def forward(self, rois_feat, stage=0):
out = self.head_list[stage](rois_feat)
return out
@register
class CascadeHead(BBoxHead):
__shared__ = ['num_classes', 'num_cascade_stages']
__inject__ = ['bbox_assigner', 'bbox_loss']
"""
Cascade RCNN bbox head
Args:
head (nn.Layer): Extract feature in bbox head
in_channel (int): Input channel after RoI extractor
roi_extractor (object): The module of RoI Extractor
bbox_assigner (object): The module of Box Assigner, label and sample the
box.
num_classes (int): The number of classes
bbox_weight (List[List[float]]): The weight to get the decode box and the
length of weight is the number of cascade stage
        num_cascade_stages (int): The number of stages to refine the box
"""
def __init__(self,
head,
in_channel,
roi_extractor=_get_class_default_kwargs(RoIAlign),
bbox_assigner='BboxAssigner',
num_classes=80,
bbox_weight=[[10., 10., 5., 5.], [20.0, 20.0, 10.0, 10.0],
[30.0, 30.0, 15.0, 15.0]],
num_cascade_stages=3,
bbox_loss=None,
reg_class_agnostic=True,
stage_loss_weights=None,
loss_normalize_pos=False,
add_gt_as_proposals=[True, False, False]):
nn.Layer.__init__(self, )
self.head = head
self.roi_extractor = roi_extractor
if isinstance(roi_extractor, dict):
self.roi_extractor = RoIAlign(**roi_extractor)
self.bbox_assigner = bbox_assigner
self.num_classes = num_classes
self.bbox_weight = bbox_weight
self.num_cascade_stages = num_cascade_stages
self.bbox_loss = bbox_loss
self.stage_loss_weights = [
1. / num_cascade_stages for _ in range(num_cascade_stages)
] if stage_loss_weights is None else stage_loss_weights
self.add_gt_as_proposals = add_gt_as_proposals
assert len(
self.stage_loss_weights
) == num_cascade_stages, f'stage_loss_weights({len(self.stage_loss_weights)}) do not equal to num_cascade_stages({num_cascade_stages})'
self.reg_class_agnostic = reg_class_agnostic
num_bbox_delta = 4 if reg_class_agnostic else 4 * num_classes
self.loss_normalize_pos = loss_normalize_pos
self.bbox_score_list = []
self.bbox_delta_list = []
for i in range(num_cascade_stages):
score_name = 'bbox_score_stage{}'.format(i)
delta_name = 'bbox_delta_stage{}'.format(i)
bbox_score = self.add_sublayer(
score_name,
nn.Linear(
in_channel,
self.num_classes + 1,
weight_attr=paddle.ParamAttr(initializer=Normal(
mean=0.0, std=0.01))))
bbox_delta = self.add_sublayer(
delta_name,
nn.Linear(
in_channel,
num_bbox_delta,
weight_attr=paddle.ParamAttr(initializer=Normal(
mean=0.0, std=0.001))))
self.bbox_score_list.append(bbox_score)
self.bbox_delta_list.append(bbox_delta)
self.assigned_label = None
self.assigned_rois = None
def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None):
"""
body_feats (list[Tensor]): Feature maps from backbone
rois (Tensor): RoIs generated from RPN module
rois_num (Tensor): The number of RoIs in each image
inputs (dict{Tensor}): The ground-truth of image
"""
targets = []
if self.training:
rois, rois_num, targets = self.bbox_assigner(
rois,
rois_num,
inputs,
add_gt_as_proposals=self.add_gt_as_proposals[0])
targets_list = [targets]
self.assigned_rois = (rois, rois_num)
self.assigned_targets = targets
pred_bbox = None
head_out_list = []
for i in range(self.num_cascade_stages):
if i > 0:
rois, rois_num = self._get_rois_from_boxes(pred_bbox,
inputs['im_shape'])
if self.training:
rois, rois_num, targets = self.bbox_assigner(
rois,
rois_num,
inputs,
i,
is_cascade=True,
add_gt_as_proposals=self.add_gt_as_proposals[i])
targets_list.append(targets)
rois_feat = self.roi_extractor(body_feats, rois, rois_num)
bbox_feat = self.head(rois_feat, i)
scores = self.bbox_score_list[i](bbox_feat)
deltas = self.bbox_delta_list[i](bbox_feat)
# TODO (lyuwenyu) Is it correct for only one class ?
if not self.reg_class_agnostic and i < self.num_cascade_stages - 1:
deltas = deltas.reshape([deltas.shape[0], self.num_classes, 4])
labels = scores[:, :-1].argmax(axis=-1)
if self.training:
deltas = deltas[paddle.arange(deltas.shape[0]), labels]
else:
deltas = deltas[((deltas + 10000) * F.one_hot(
labels, num_classes=self.num_classes).unsqueeze(-1) != 0
).nonzero(as_tuple=True)].reshape(
[deltas.shape[0], 4])
head_out_list.append([scores, deltas, rois])
pred_bbox = self._get_pred_bbox(deltas, rois, self.bbox_weight[i])
if self.training:
loss = {}
for stage, value in enumerate(zip(head_out_list, targets_list)):
(scores, deltas, rois), targets = value
loss_stage = self.get_loss(
scores,
deltas,
targets,
rois,
self.bbox_weight[stage],
loss_normalize_pos=self.loss_normalize_pos)
for k, v in loss_stage.items():
loss[k + "_stage{}".format(
stage)] = v * self.stage_loss_weights[stage]
return loss, bbox_feat
else:
scores, deltas, self.refined_rois = self.get_prediction(
head_out_list)
return (deltas, scores), self.head
def _get_rois_from_boxes(self, boxes, im_shape):
rois = []
for i, boxes_per_image in enumerate(boxes):
clip_box = clip_bbox(boxes_per_image, im_shape[i])
if self.training:
keep = nonempty_bbox(clip_box)
if keep.shape[0] == 0:
keep = paddle.zeros([1], dtype='int32')
clip_box = paddle.gather(clip_box, keep)
rois.append(clip_box)
rois_num = paddle.concat([paddle.shape(r)[0:1] for r in rois])
return rois, rois_num
def _get_pred_bbox(self, deltas, proposals, weights):
pred_proposals = paddle.concat(proposals) if len(
proposals) > 1 else proposals[0]
pred_bbox = delta2bbox(deltas, pred_proposals, weights)
pred_bbox = paddle.reshape(pred_bbox, [-1, deltas.shape[-1]])
num_prop = []
for p in proposals:
num_prop.append(p.shape[0])
        # NOTE(dev): num_prop will be tagged as LoDTensorArray because it
        # depends on batch_size under @to_static. However, the argument
        # num_or_sections in paddle.split does not support LoDTensorArray,
        # so we use [-1] to replace it if num_prop is not a list. This
        # modification keeps both dynamic and static graphs correct.
if not isinstance(num_prop, list):
num_prop = [-1]
return pred_bbox.split(num_prop)
def get_prediction(self, head_out_list):
"""
        head_out_list (list): each stage's [scores, deltas, rois]
"""
pred_list = []
scores_list = [F.softmax(head[0]) for head in head_out_list]
scores = paddle.add_n(scores_list) / self.num_cascade_stages
# Get deltas and rois from the last stage
_, deltas, rois = head_out_list[-1]
return scores, deltas, rois
def get_refined_rois(self, ):
return self.refined_rois
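# --- Illustrative sketch (not part of the upstream file) --------------------
# CascadeHead.get_prediction ensembles the stages by averaging their softmax
# class scores while keeping only the last stage's deltas and rois. A minimal
# reproduction on random tensors; all sizes below are hypothetical.
def _demo_cascade_score_ensemble():
    num_stages, num_rois, num_classes = 3, 8, 81  # 80 classes + background
    head_out_list = []
    for _ in range(num_stages):
        scores = paddle.randn([num_rois, num_classes])
        deltas = paddle.randn([num_rois, 4])
        rois = paddle.rand([num_rois, 4])
        head_out_list.append([scores, deltas, rois])
    # average the per-stage softmax scores, exactly as get_prediction does
    scores_list = [F.softmax(head[0], axis=-1) for head in head_out_list]
    avg_scores = paddle.add_n(scores_list) / num_stages
    # boxes come from the last refinement stage only
    _, last_deltas, last_rois = head_out_list[-1]
    return avg_scores, last_deltas, last_rois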
================================================
FILE: ppdet/modeling/heads/centernet_head.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import Constant, Uniform
from ppdet.core.workspace import register
from ppdet.modeling.losses import CTFocalLoss, GIoULoss
class ConvLayer(nn.Layer):
def __init__(self,
ch_in,
ch_out,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=False):
super(ConvLayer, self).__init__()
bias_attr = False
fan_in = ch_in * kernel_size**2
bound = 1 / math.sqrt(fan_in)
param_attr = paddle.ParamAttr(initializer=Uniform(-bound, bound))
if bias:
bias_attr = paddle.ParamAttr(initializer=Constant(0.))
self.conv = nn.Conv2D(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
weight_attr=param_attr,
bias_attr=bias_attr)
def forward(self, inputs):
out = self.conv(inputs)
return out
@register
class CenterNetHead(nn.Layer):
"""
Args:
in_channels (int): the channel number of input to CenterNetHead.
num_classes (int): the number of classes, 80 (COCO dataset) by default.
head_planes (int): the channel number in all head, 256 by default.
prior_bias (float): prior bias in heatmap head, -2.19 by default, -4.6 in CenterTrack
regress_ltrb (bool): whether to regress left/top/right/bottom or
width/height for a box, True by default.
size_loss (str): the type of size regression loss, 'L1' by default, can be 'giou'.
loss_weight (dict): the weight of each loss.
add_iou (bool): whether to add iou branch, False by default.
"""
__shared__ = ['num_classes']
def __init__(self,
in_channels,
num_classes=80,
head_planes=256,
prior_bias=-2.19,
regress_ltrb=True,
size_loss='L1',
loss_weight={
'heatmap': 1.0,
'size': 0.1,
'offset': 1.0,
'iou': 0.0,
},
add_iou=False):
super(CenterNetHead, self).__init__()
self.regress_ltrb = regress_ltrb
self.loss_weight = loss_weight
self.add_iou = add_iou
# heatmap head
self.heatmap = nn.Sequential(
ConvLayer(
in_channels, head_planes, kernel_size=3, padding=1, bias=True),
nn.ReLU(),
ConvLayer(
head_planes,
num_classes,
kernel_size=1,
stride=1,
padding=0,
bias=True))
with paddle.no_grad():
self.heatmap[2].conv.bias[:] = prior_bias
# size(ltrb or wh) head
self.size = nn.Sequential(
ConvLayer(
in_channels, head_planes, kernel_size=3, padding=1, bias=True),
nn.ReLU(),
ConvLayer(
head_planes,
4 if regress_ltrb else 2,
kernel_size=1,
stride=1,
padding=0,
bias=True))
self.size_loss = size_loss
# offset head
self.offset = nn.Sequential(
ConvLayer(
in_channels, head_planes, kernel_size=3, padding=1, bias=True),
nn.ReLU(),
ConvLayer(
head_planes, 2, kernel_size=1, stride=1, padding=0, bias=True))
        # iou head (optional)
if self.add_iou and 'iou' in self.loss_weight:
self.iou = nn.Sequential(
ConvLayer(
in_channels,
head_planes,
kernel_size=3,
padding=1,
bias=True),
nn.ReLU(),
ConvLayer(
head_planes,
4 if regress_ltrb else 2,
kernel_size=1,
stride=1,
padding=0,
bias=True))
@classmethod
def from_config(cls, cfg, input_shape):
if isinstance(input_shape, (list, tuple)):
input_shape = input_shape[0]
return {'in_channels': input_shape.channels}
def forward(self, feat, inputs):
heatmap = F.sigmoid(self.heatmap(feat))
size = self.size(feat)
offset = self.offset(feat)
head_outs = {'heatmap': heatmap, 'size': size, 'offset': offset}
if self.add_iou and 'iou' in self.loss_weight:
iou = self.iou(feat)
head_outs.update({'iou': iou})
if self.training:
losses = self.get_loss(inputs, self.loss_weight, head_outs)
return losses
else:
return head_outs
def get_loss(self, inputs, weights, head_outs):
# 1.heatmap(hm) head loss: CTFocalLoss
heatmap = head_outs['heatmap']
heatmap_target = inputs['heatmap']
heatmap = paddle.clip(heatmap, 1e-4, 1 - 1e-4)
ctfocal_loss = CTFocalLoss()
heatmap_loss = ctfocal_loss(heatmap, heatmap_target)
# 2.size(wh) head loss: L1 loss or GIoU loss
size = head_outs['size']
index = inputs['index']
mask = inputs['index_mask']
size = paddle.transpose(size, perm=[0, 2, 3, 1])
size_n, _, _, size_c = size.shape
size = paddle.reshape(size, shape=[size_n, -1, size_c])
index = paddle.unsqueeze(index, 2)
batch_inds = list()
for i in range(size_n):
batch_ind = paddle.full(
shape=[1, index.shape[1], 1], fill_value=i, dtype='int64')
batch_inds.append(batch_ind)
batch_inds = paddle.concat(batch_inds, axis=0)
index = paddle.concat(x=[batch_inds, index], axis=2)
pos_size = paddle.gather_nd(size, index=index)
mask = paddle.unsqueeze(mask, axis=2)
size_mask = paddle.expand_as(mask, pos_size)
size_mask = paddle.cast(size_mask, dtype=pos_size.dtype)
pos_num = size_mask.sum()
size_mask.stop_gradient = True
if self.size_loss == 'L1':
if self.regress_ltrb:
size_target = inputs['size']
# shape: [bs, max_per_img, 4]
else:
if inputs['size'].shape[-1] == 2:
# inputs['size'] is wh, and regress as wh
# shape: [bs, max_per_img, 2]
size_target = inputs['size']
else:
# inputs['size'] is ltrb, but regress as wh
# shape: [bs, max_per_img, 4]
size_target = inputs['size'][:, :, 0:2] + inputs[
'size'][:, :, 2:]
size_target.stop_gradient = True
size_loss = F.l1_loss(
pos_size * size_mask, size_target * size_mask, reduction='sum')
size_loss = size_loss / (pos_num + 1e-4)
elif self.size_loss == 'giou':
size_target = inputs['bbox_xys']
size_target.stop_gradient = True
centers_x = (size_target[:, :, 0:1] + size_target[:, :, 2:3]) / 2.0
centers_y = (size_target[:, :, 1:2] + size_target[:, :, 3:4]) / 2.0
x1 = centers_x - pos_size[:, :, 0:1]
y1 = centers_y - pos_size[:, :, 1:2]
x2 = centers_x + pos_size[:, :, 2:3]
y2 = centers_y + pos_size[:, :, 3:4]
pred_boxes = paddle.concat([x1, y1, x2, y2], axis=-1)
giou_loss = GIoULoss(reduction='sum')
size_loss = giou_loss(
pred_boxes * size_mask,
size_target * size_mask,
iou_weight=size_mask,
loc_reweight=None)
size_loss = size_loss / (pos_num + 1e-4)
# 3.offset(reg) head loss: L1 loss
offset = head_outs['offset']
offset_target = inputs['offset']
offset = paddle.transpose(offset, perm=[0, 2, 3, 1])
offset_n, _, _, offset_c = offset.shape
offset = paddle.reshape(offset, shape=[offset_n, -1, offset_c])
pos_offset = paddle.gather_nd(offset, index=index)
offset_mask = paddle.expand_as(mask, pos_offset)
offset_mask = paddle.cast(offset_mask, dtype=pos_offset.dtype)
pos_num = offset_mask.sum()
offset_mask.stop_gradient = True
offset_target.stop_gradient = True
offset_loss = F.l1_loss(
pos_offset * offset_mask,
offset_target * offset_mask,
reduction='sum')
offset_loss = offset_loss / (pos_num + 1e-4)
        # 4.iou head loss: GIoU loss (optional)
if self.add_iou and 'iou' in self.loss_weight:
iou = head_outs['iou']
iou = paddle.transpose(iou, perm=[0, 2, 3, 1])
iou_n, _, _, iou_c = iou.shape
iou = paddle.reshape(iou, shape=[iou_n, -1, iou_c])
pos_iou = paddle.gather_nd(iou, index=index)
iou_mask = paddle.expand_as(mask, pos_iou)
iou_mask = paddle.cast(iou_mask, dtype=pos_iou.dtype)
pos_num = iou_mask.sum()
iou_mask.stop_gradient = True
gt_bbox_xys = inputs['bbox_xys']
gt_bbox_xys.stop_gradient = True
centers_x = (gt_bbox_xys[:, :, 0:1] + gt_bbox_xys[:, :, 2:3]) / 2.0
centers_y = (gt_bbox_xys[:, :, 1:2] + gt_bbox_xys[:, :, 3:4]) / 2.0
x1 = centers_x - pos_size[:, :, 0:1]
y1 = centers_y - pos_size[:, :, 1:2]
x2 = centers_x + pos_size[:, :, 2:3]
y2 = centers_y + pos_size[:, :, 3:4]
pred_boxes = paddle.concat([x1, y1, x2, y2], axis=-1)
giou_loss = GIoULoss(reduction='sum')
iou_loss = giou_loss(
pred_boxes * iou_mask,
gt_bbox_xys * iou_mask,
iou_weight=iou_mask,
loc_reweight=None)
iou_loss = iou_loss / (pos_num + 1e-4)
losses = {
'heatmap_loss': heatmap_loss,
'size_loss': size_loss,
'offset_loss': offset_loss,
}
det_loss = weights['heatmap'] * heatmap_loss + weights[
'size'] * size_loss + weights['offset'] * offset_loss
if self.add_iou and 'iou' in self.loss_weight:
losses.update({'iou_loss': iou_loss})
det_loss += weights['iou'] * iou_loss
losses.update({'det_loss': det_loss})
return losses
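# --- Illustrative sketch (not part of the upstream file) --------------------
# get_loss gathers predictions at ground-truth center locations with
# paddle.gather_nd, pairing each flat H*W position index with its batch id.
# A minimal reproduction of that index construction; sizes are hypothetical.
def _demo_gather_positive_centers():
    bs, h, w, c, max_objs = 2, 4, 4, 6, 3
    feat = paddle.randn([bs, h * w, c])  # head output flattened to (N, HW, C)
    index = paddle.randint(0, h * w, [bs, max_objs, 1], dtype='int64')
    # prepend the batch id so gather_nd can address (batch, position) pairs
    batch_inds = [
        paddle.full([1, max_objs, 1], i, dtype='int64') for i in range(bs)
    ]
    index = paddle.concat([paddle.concat(batch_inds, axis=0), index], axis=2)
    pos_feat = paddle.gather_nd(feat, index=index)  # -> [bs, max_objs, c]
    return pos_feat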
================================================
FILE: ppdet/modeling/heads/centertrack_head.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from .centernet_head import ConvLayer
from ..keypoint_utils import get_affine_transform
__all__ = ['CenterTrackHead']
@register
class CenterTrackHead(nn.Layer):
"""
Args:
        in_channels (int): the channel number of input to CenterTrackHead.
num_classes (int): the number of classes, 1 (MOT17 dataset) by default.
head_planes (int): the channel number in all head, 256 by default.
task (str): the type of task for regression, 'tracking' by default.
loss_weight (dict): the weight of each loss.
        add_ltrb_amodal (bool): whether to add the ltrb_amodal branch, True by default.
"""
__shared__ = ['num_classes']
def __init__(self,
in_channels,
num_classes=1,
head_planes=256,
task='tracking',
loss_weight={
'tracking': 1.0,
'ltrb_amodal': 0.1,
},
add_ltrb_amodal=True):
super(CenterTrackHead, self).__init__()
self.task = task
self.loss_weight = loss_weight
self.add_ltrb_amodal = add_ltrb_amodal
# tracking head
self.tracking = nn.Sequential(
ConvLayer(
in_channels, head_planes, kernel_size=3, padding=1, bias=True),
nn.ReLU(),
ConvLayer(
head_planes, 2, kernel_size=1, stride=1, padding=0, bias=True))
# ltrb_amodal head
if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight:
self.ltrb_amodal = nn.Sequential(
ConvLayer(
in_channels,
head_planes,
kernel_size=3,
padding=1,
bias=True),
nn.ReLU(),
ConvLayer(
head_planes,
4,
kernel_size=1,
stride=1,
padding=0,
bias=True))
# TODO: add more tasks
@classmethod
def from_config(cls, cfg, input_shape):
if isinstance(input_shape, (list, tuple)):
input_shape = input_shape[0]
return {'in_channels': input_shape.channels}
def forward(self,
feat,
inputs,
bboxes=None,
bbox_inds=None,
topk_clses=None,
topk_ys=None,
topk_xs=None):
tracking = self.tracking(feat)
head_outs = {'tracking': tracking}
if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight:
ltrb_amodal = self.ltrb_amodal(feat)
head_outs.update({'ltrb_amodal': ltrb_amodal})
if self.training:
losses = self.get_loss(inputs, self.loss_weight, head_outs)
return losses
else:
ret = self.generic_decode(head_outs, bboxes, bbox_inds, topk_ys,
topk_xs)
return ret
def get_loss(self, inputs, weights, head_outs):
index = inputs['index'].unsqueeze(2)
mask = inputs['index_mask'].unsqueeze(2)
batch_inds = list()
for i in range(head_outs['tracking'].shape[0]):
batch_ind = paddle.full(
shape=[1, index.shape[1], 1], fill_value=i, dtype='int64')
batch_inds.append(batch_ind)
batch_inds = paddle.concat(batch_inds, axis=0)
index = paddle.concat(x=[batch_inds, index], axis=2)
# 1.tracking head loss: L1 loss
tracking = head_outs['tracking'].transpose([0, 2, 3, 1])
tracking_target = inputs['tracking']
bs, _, _, c = tracking.shape
tracking = tracking.reshape([bs, -1, c])
pos_tracking = paddle.gather_nd(tracking, index=index)
tracking_mask = paddle.cast(
paddle.expand_as(mask, pos_tracking), dtype=pos_tracking.dtype)
pos_num = tracking_mask.sum()
tracking_mask.stop_gradient = True
tracking_target.stop_gradient = True
tracking_loss = F.l1_loss(
pos_tracking * tracking_mask,
tracking_target * tracking_mask,
reduction='sum')
tracking_loss = tracking_loss / (pos_num + 1e-4)
        # 2.ltrb_amodal head loss (optional): L1 loss
if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight:
ltrb_amodal = head_outs['ltrb_amodal'].transpose([0, 2, 3, 1])
ltrb_amodal_target = inputs['ltrb_amodal']
bs, _, _, c = ltrb_amodal.shape
ltrb_amodal = ltrb_amodal.reshape([bs, -1, c])
pos_ltrb_amodal = paddle.gather_nd(ltrb_amodal, index=index)
ltrb_amodal_mask = paddle.cast(
paddle.expand_as(mask, pos_ltrb_amodal),
dtype=pos_ltrb_amodal.dtype)
pos_num = ltrb_amodal_mask.sum()
ltrb_amodal_mask.stop_gradient = True
ltrb_amodal_target.stop_gradient = True
ltrb_amodal_loss = F.l1_loss(
pos_ltrb_amodal * ltrb_amodal_mask,
ltrb_amodal_target * ltrb_amodal_mask,
reduction='sum')
ltrb_amodal_loss = ltrb_amodal_loss / (pos_num + 1e-4)
losses = {'tracking_loss': tracking_loss, }
plugin_loss = weights['tracking'] * tracking_loss
if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight:
losses.update({'ltrb_amodal_loss': ltrb_amodal_loss})
plugin_loss += weights['ltrb_amodal'] * ltrb_amodal_loss
losses.update({'plugin_loss': plugin_loss})
return losses
def generic_decode(self, head_outs, bboxes, bbox_inds, topk_ys, topk_xs):
        topk_ys = paddle.floor(topk_ys)  # note: flooring gives more accurate integer centers
topk_xs = paddle.floor(topk_xs)
cts = paddle.concat([topk_xs, topk_ys], 1)
ret = {'bboxes': bboxes, 'cts': cts}
regression_heads = ['tracking'] # todo: add more tasks
for head in regression_heads:
if head in head_outs:
ret[head] = _tranpose_and_gather_feat(head_outs[head],
bbox_inds)
if 'ltrb_amodal' in head_outs:
ltrb_amodal = head_outs['ltrb_amodal']
ltrb_amodal = _tranpose_and_gather_feat(ltrb_amodal, bbox_inds)
bboxes_amodal = paddle.concat(
[
topk_xs * 1.0 + ltrb_amodal[..., 0:1],
topk_ys * 1.0 + ltrb_amodal[..., 1:2],
topk_xs * 1.0 + ltrb_amodal[..., 2:3],
topk_ys * 1.0 + ltrb_amodal[..., 3:4]
],
axis=1)
ret['bboxes'] = paddle.concat([bboxes[:, 0:2], bboxes_amodal], 1)
# cls_id, score, x0, y0, x1, y1
return ret
def centertrack_post_process(self, dets, meta, out_thresh):
        if 'bboxes' not in dets:
return [{}]
preds = []
c, s = meta['center'].numpy(), meta['scale'].numpy()
h, w = meta['out_height'].numpy(), meta['out_width'].numpy()
trans = get_affine_transform(
center=c[0],
input_size=s[0],
rot=0,
output_size=[w[0], h[0]],
shift=(0., 0.),
inv=True).astype(np.float32)
for i, dets_bbox in enumerate(dets['bboxes']):
if dets_bbox[1] < out_thresh:
break
item = {}
item['score'] = dets_bbox[1]
item['class'] = int(dets_bbox[0]) + 1
item['ct'] = transform_preds_with_trans(
dets['cts'][i].reshape([1, 2]), trans).reshape(2)
if 'tracking' in dets:
tracking = transform_preds_with_trans(
(dets['tracking'][i] + dets['cts'][i]).reshape([1, 2]),
trans).reshape(2)
item['tracking'] = tracking - item['ct']
if 'bboxes' in dets:
bbox = transform_preds_with_trans(
dets_bbox[2:6].reshape([2, 2]), trans).reshape(4)
item['bbox'] = bbox
preds.append(item)
return preds
def transform_preds_with_trans(coords, trans):
target_coords = np.ones((coords.shape[0], 3), np.float32)
target_coords[:, :2] = coords
target_coords = np.dot(trans, target_coords.transpose()).transpose()
return target_coords[:, :2]
def _tranpose_and_gather_feat(feat, bbox_inds):
feat = feat.transpose([0, 2, 3, 1])
feat = feat.reshape([-1, feat.shape[3]])
feat = paddle.gather(feat, bbox_inds)
return feat
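# --- Illustrative sketch (not part of the upstream file) --------------------
# _tranpose_and_gather_feat flattens an NCHW map into N*H*W rows and picks the
# rows named by flat detection indices. A toy call with hypothetical indices:
def _demo_gather_topk_feat():
    n, c, h, w = 1, 8, 5, 5
    feat = paddle.randn([n, c, h, w])
    # flat indices of 3 detections in the N*H*W layout (hypothetical values)
    bbox_inds = paddle.to_tensor([3, 11, 24], dtype='int64')
    return _tranpose_and_gather_feat(feat, bbox_inds)  # -> [3, c]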
================================================
FILE: ppdet/modeling/heads/clrnet_head.py
================================================
import math
import paddle
import numpy as np
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ppdet.modeling.initializer import normal_
from ppdet.modeling.lane_utils import Lane
from ppdet.modeling.losses import line_iou
from ppdet.modeling.clrnet_utils import ROIGather, LinearModule, SegDecoder
__all__ = ['CLRHead']
@register
class CLRHead(nn.Layer):
__inject__ = ['loss']
__shared__ = [
'img_w', 'img_h', 'ori_img_h', 'num_classes', 'cut_height',
'num_points', "max_lanes"
]
def __init__(self,
num_points=72,
prior_feat_channels=64,
fc_hidden_dim=64,
num_priors=192,
img_w=800,
img_h=320,
ori_img_h=590,
cut_height=270,
num_classes=5,
num_fc=2,
refine_layers=3,
sample_points=36,
conf_threshold=0.4,
nms_thres=0.5,
max_lanes=4,
loss='CLRNetLoss'):
super(CLRHead, self).__init__()
self.img_w = img_w
self.img_h = img_h
self.n_strips = num_points - 1
self.n_offsets = num_points
self.num_priors = num_priors
self.sample_points = sample_points
self.refine_layers = refine_layers
self.num_classes = num_classes
self.fc_hidden_dim = fc_hidden_dim
self.ori_img_h = ori_img_h
self.cut_height = cut_height
self.conf_threshold = conf_threshold
self.nms_thres = nms_thres
self.max_lanes = max_lanes
self.prior_feat_channels = prior_feat_channels
self.loss = loss
self.register_buffer(
name='sample_x_indexs',
tensor=(paddle.linspace(
start=0, stop=1, num=self.sample_points,
dtype=paddle.float32) * self.n_strips).astype(dtype='int64'))
self.register_buffer(
name='prior_feat_ys',
tensor=paddle.flip(
x=(1 - self.sample_x_indexs.astype('float32') / self.n_strips),
axis=[-1]))
self.register_buffer(
name='prior_ys',
tensor=paddle.linspace(
start=1, stop=0, num=self.n_offsets).astype('float32'))
self.prior_feat_channels = prior_feat_channels
self._init_prior_embeddings()
init_priors, priors_on_featmap = self.generate_priors_from_embeddings()
self.register_buffer(name='priors', tensor=init_priors)
self.register_buffer(name='priors_on_featmap', tensor=priors_on_featmap)
self.seg_decoder = SegDecoder(self.img_h, self.img_w, self.num_classes,
self.prior_feat_channels,
self.refine_layers)
reg_modules = list()
cls_modules = list()
for _ in range(num_fc):
reg_modules += [*LinearModule(self.fc_hidden_dim)]
cls_modules += [*LinearModule(self.fc_hidden_dim)]
self.reg_modules = nn.LayerList(sublayers=reg_modules)
self.cls_modules = nn.LayerList(sublayers=cls_modules)
self.roi_gather = ROIGather(self.prior_feat_channels, self.num_priors,
self.sample_points, self.fc_hidden_dim,
self.refine_layers)
self.reg_layers = nn.Linear(
in_features=self.fc_hidden_dim,
out_features=self.n_offsets + 1 + 2 + 1,
bias_attr=True)
self.cls_layers = nn.Linear(
in_features=self.fc_hidden_dim, out_features=2, bias_attr=True)
self.init_weights()
def init_weights(self):
for m in self.cls_layers.parameters():
normal_(m, mean=0.0, std=0.001)
for m in self.reg_layers.parameters():
normal_(m, mean=0.0, std=0.001)
def pool_prior_features(self, batch_features, num_priors, prior_xs):
"""
pool prior feature from feature map.
Args:
batch_features (Tensor): Input feature maps, shape: (B, C, H, W)
"""
batch_size = batch_features.shape[0]
prior_xs = prior_xs.reshape([batch_size, num_priors, -1, 1])
prior_ys = self.prior_feat_ys.tile(repeat_times=[
batch_size * num_priors
]).reshape([batch_size, num_priors, -1, 1])
prior_xs = prior_xs * 2.0 - 1.0
prior_ys = prior_ys * 2.0 - 1.0
grid = paddle.concat(x=(prior_xs, prior_ys), axis=-1)
feature = F.grid_sample(
x=batch_features, grid=grid,
align_corners=True).transpose(perm=[0, 2, 1, 3])
feature = feature.reshape([
batch_size * num_priors, self.prior_feat_channels,
self.sample_points, 1
])
return feature
def generate_priors_from_embeddings(self):
predictions = self.prior_embeddings.weight
        # Each prior: 2 scores, 1 start_y, 1 start_x, 1 theta, 1 length and
        # 72 x coordinates; score[0] = negative prob, score[1] = positive prob
priors = paddle.zeros(
(self.num_priors, 2 + 2 + 2 + self.n_offsets),
dtype=predictions.dtype)
priors[:, 2:5] = predictions.clone()
priors[:, 6:] = (
priors[:, 3].unsqueeze(1).clone().tile([1, self.n_offsets]) *
(self.img_w - 1) +
((1 - self.prior_ys.tile([self.num_priors, 1]) -
priors[:, 2].unsqueeze(1).clone().tile([1, self.n_offsets])) *
self.img_h / paddle.tan(x=priors[:, 4].unsqueeze(1).clone().tile(
[1, self.n_offsets]) * math.pi + 1e-05))) / (self.img_w - 1)
priors_on_featmap = paddle.index_select(
priors, 6 + self.sample_x_indexs, axis=-1)
return priors, priors_on_featmap
def _init_prior_embeddings(self):
self.prior_embeddings = nn.Embedding(self.num_priors, 3)
bottom_priors_nums = self.num_priors * 3 // 4
left_priors_nums, _ = self.num_priors // 8, self.num_priors // 8
strip_size = 0.5 / (left_priors_nums // 2 - 1)
bottom_strip_size = 1 / (bottom_priors_nums // 4 + 1)
with paddle.no_grad():
for i in range(left_priors_nums):
self.prior_embeddings.weight[i, 0] = i // 2 * strip_size
self.prior_embeddings.weight[i, 1] = 0.0
self.prior_embeddings.weight[i,
2] = 0.16 if i % 2 == 0 else 0.32
for i in range(left_priors_nums,
left_priors_nums + bottom_priors_nums):
self.prior_embeddings.weight[i, 0] = 0.0
self.prior_embeddings.weight[i, 1] = (
(i - left_priors_nums) // 4 + 1) * bottom_strip_size
self.prior_embeddings.weight[i, 2] = 0.2 * (i % 4 + 1)
for i in range(left_priors_nums + bottom_priors_nums,
self.num_priors):
self.prior_embeddings.weight[i, 0] = (
i - left_priors_nums - bottom_priors_nums) // 2 * strip_size
self.prior_embeddings.weight[i, 1] = 1.0
self.prior_embeddings.weight[i,
2] = 0.68 if i % 2 == 0 else 0.84
def forward(self, x, inputs=None):
"""
Take pyramid features as input to perform Cross Layer Refinement and finally output the prediction lanes.
Each feature is a 4D tensor.
Args:
x: input features (list[Tensor])
Return:
prediction_list: each layer's prediction result
seg: segmentation result for auxiliary loss
"""
batch_features = list(x[len(x) - self.refine_layers:])
batch_features.reverse()
batch_size = batch_features[-1].shape[0]
if self.training:
self.priors, self.priors_on_featmap = self.generate_priors_from_embeddings(
)
priors, priors_on_featmap = self.priors.tile(
[batch_size, 1,
1]), self.priors_on_featmap.tile([batch_size, 1, 1])
predictions_lists = []
prior_features_stages = []
for stage in range(self.refine_layers):
num_priors = priors_on_featmap.shape[1]
prior_xs = paddle.flip(x=priors_on_featmap, axis=[2])
batch_prior_features = self.pool_prior_features(
batch_features[stage], num_priors, prior_xs)
prior_features_stages.append(batch_prior_features)
fc_features = self.roi_gather(prior_features_stages,
batch_features[stage], stage)
# return fc_features
fc_features = fc_features.reshape(
[num_priors, batch_size, -1]).reshape(
[batch_size * num_priors, self.fc_hidden_dim])
cls_features = fc_features.clone()
reg_features = fc_features.clone()
for cls_layer in self.cls_modules:
cls_features = cls_layer(cls_features)
# return cls_features
for reg_layer in self.reg_modules:
reg_features = reg_layer(reg_features)
cls_logits = self.cls_layers(cls_features)
reg = self.reg_layers(reg_features)
cls_logits = cls_logits.reshape(
[batch_size, -1, cls_logits.shape[1]])
reg = reg.reshape([batch_size, -1, reg.shape[1]])
predictions = priors.clone()
predictions[:, :, :2] = cls_logits
predictions[:, :, 2:5] += reg[:, :, :3]
predictions[:, :, 5] = reg[:, :, 3]
def tran_tensor(t):
return t.unsqueeze(axis=2).clone().tile([1, 1, self.n_offsets])
predictions[..., 6:] = (
tran_tensor(predictions[..., 3]) * (self.img_w - 1) +
((1 - self.prior_ys.tile([batch_size, num_priors, 1]) -
tran_tensor(predictions[..., 2])) * self.img_h / paddle.tan(
tran_tensor(predictions[..., 4]) * math.pi + 1e-05))) / (
self.img_w - 1)
prediction_lines = predictions.clone()
predictions[..., 6:] += reg[..., 4:]
predictions_lists.append(predictions)
if stage != self.refine_layers - 1:
priors = prediction_lines.detach().clone()
priors_on_featmap = priors.index_select(
6 + self.sample_x_indexs, axis=-1)
if self.training:
seg = None
seg_features = paddle.concat(
[
F.interpolate(
feature,
size=[
batch_features[-1].shape[2],
batch_features[-1].shape[3]
],
mode='bilinear',
align_corners=False) for feature in batch_features
],
axis=1)
seg = self.seg_decoder(seg_features)
output = {'predictions_lists': predictions_lists, 'seg': seg}
return self.loss(output, inputs)
return predictions_lists[-1]
def predictions_to_pred(self, predictions):
"""
Convert predictions to internal Lane structure for evaluation.
"""
self.prior_ys = paddle.to_tensor(self.prior_ys)
self.prior_ys = self.prior_ys.astype('float64')
lanes = []
for lane in predictions:
lane_xs = lane[6:].clone()
start = min(
max(0, int(round(lane[2].item() * self.n_strips))),
self.n_strips)
length = int(round(lane[5].item()))
end = start + length - 1
end = min(end, len(self.prior_ys) - 1)
if start > 0:
mask = ((lane_xs[:start] >= 0.) &
(lane_xs[:start] <= 1.)).cpu().detach().numpy()[::-1]
mask = ~((mask.cumprod()[::-1]).astype(np.bool_))
lane_xs[:start][mask] = -2
if end < len(self.prior_ys) - 1:
lane_xs[end + 1:] = -2
lane_ys = self.prior_ys[lane_xs >= 0].clone()
lane_xs = lane_xs[lane_xs >= 0]
lane_xs = lane_xs.flip(axis=0).astype('float64')
lane_ys = lane_ys.flip(axis=0)
lane_ys = (lane_ys *
(self.ori_img_h - self.cut_height) + self.cut_height
) / self.ori_img_h
if len(lane_xs) <= 1:
continue
points = paddle.stack(
x=(lane_xs.reshape([-1, 1]), lane_ys.reshape([-1, 1])),
axis=1).squeeze(axis=2)
lane = Lane(
points=points.cpu().numpy(),
metadata={
'start_x': lane[3],
'start_y': lane[2],
'conf': lane[1]
})
lanes.append(lane)
return lanes
def lane_nms(self, predictions, scores, nms_overlap_thresh, top_k):
"""
NMS for lane detection.
        predictions: paddle.Tensor [num_lanes, conf, y, x, length, 72 offsets], e.g. shape [12, 77]
scores: paddle.Tensor [num_lanes]
nms_overlap_thresh: float
top_k: int
"""
# sort by scores to get idx
idx = scores.argsort(descending=True)
keep = []
        candidates = predictions.clone()
        candidates = candidates.index_select(idx)
        while len(candidates) > 0:
            keep.append(idx[0])
            if len(keep) >= top_k or len(candidates) == 1:
                break
            ious = []
            for i in range(1, len(candidates)):
                ious.append(1 - line_iou(
                    candidates[i].unsqueeze(0),
                    candidates[0].unsqueeze(0),
                    img_w=self.img_w,
                    length=15))
            ious = paddle.to_tensor(ious)
            mask = ious <= nms_overlap_thresh
            id = paddle.where(mask == False)[0]
            if id.shape[0] == 0:
                break
            candidates = candidates[1:].index_select(id)
            idx = idx[1:].index_select(id)
keep = paddle.stack(keep)
return keep
def get_lanes(self, output, as_lanes=True):
"""
Convert model output to lanes.
"""
softmax = nn.Softmax(axis=1)
decoded = []
for predictions in output:
threshold = self.conf_threshold
scores = softmax(predictions[:, :2])[:, 1]
keep_inds = scores >= threshold
predictions = predictions[keep_inds]
scores = scores[keep_inds]
if predictions.shape[0] == 0:
decoded.append([])
continue
nms_predictions = predictions.detach().clone()
nms_predictions = paddle.concat(
x=[nms_predictions[..., :4], nms_predictions[..., 5:]], axis=-1)
nms_predictions[..., 4] = nms_predictions[..., 4] * self.n_strips
nms_predictions[..., 5:] = nms_predictions[..., 5:] * (
self.img_w - 1)
keep = self.lane_nms(
nms_predictions[..., 5:],
scores,
nms_overlap_thresh=self.nms_thres,
top_k=self.max_lanes)
predictions = predictions.index_select(keep)
if predictions.shape[0] == 0:
decoded.append([])
continue
predictions[:, 5] = paddle.round(predictions[:, 5] * self.n_strips)
if as_lanes:
pred = self.predictions_to_pred(predictions)
else:
pred = predictions
decoded.append(pred)
return decoded
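# --- Illustrative sketch (not part of the upstream file) --------------------
# A lane prior is (start_y, start_x, theta) in normalized units. Both
# generate_priors_from_embeddings and forward expand it into n_offsets x
# coordinates by ray-casting from the start point along theta over evenly
# spaced rows. A minimal sketch of that geometry with a hypothetical prior:
def _demo_prior_geometry():
    img_w, img_h, n_offsets = 800, 320, 72
    start_y, start_x, theta = 0.0, 0.5, 0.25  # hypothetical embedding values
    prior_ys = paddle.linspace(1, 0, n_offsets)
    xs = (start_x * (img_w - 1) +
          (1 - prior_ys - start_y) * img_h /
          math.tan(theta * math.pi + 1e-05)) / (img_w - 1)
    return xs  # normalized x per row; values outside [0, 1] are off-image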
================================================
FILE: ppdet/modeling/heads/detr_head.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
import pycocotools.mask as mask_util
from ..initializer import linear_init_, constant_
from ..transformers.utils import inverse_sigmoid
__all__ = ['DETRHead', 'DeformableDETRHead', 'DINOHead', 'MaskDINOHead', 'DINOv3Head']
class MLP(nn.Layer):
"""This code is based on
https://github.com/facebookresearch/detr/blob/main/models/detr.py
"""
def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
super().__init__()
self.num_layers = num_layers
h = [hidden_dim] * (num_layers - 1)
self.layers = nn.LayerList(
nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
self._reset_parameters()
def _reset_parameters(self):
for l in self.layers:
linear_init_(l)
def forward(self, x):
for i, layer in enumerate(self.layers):
x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
return x
class MultiHeadAttentionMap(nn.Layer):
"""This code is based on
https://github.com/facebookresearch/detr/blob/main/models/segmentation.py
This is a 2D attention module, which only returns the attention softmax (no multiplication by value)
"""
def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0,
bias=True):
super().__init__()
self.num_heads = num_heads
self.hidden_dim = hidden_dim
self.dropout = nn.Dropout(dropout)
weight_attr = paddle.ParamAttr(
initializer=paddle.nn.initializer.XavierUniform())
bias_attr = paddle.framework.ParamAttr(
initializer=paddle.nn.initializer.Constant()) if bias else False
self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr)
self.k_proj = nn.Conv2D(
query_dim,
hidden_dim,
1,
weight_attr=weight_attr,
bias_attr=bias_attr)
self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5
def forward(self, q, k, mask=None):
q = self.q_proj(q)
k = self.k_proj(k)
bs, num_queries, n, c, h, w = q.shape[0], q.shape[1], self.num_heads,\
self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]
qh = q.reshape([bs, num_queries, n, c])
kh = k.reshape([bs, n, c, h, w])
# weights = paddle.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh)
qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, c])
kh = kh.reshape([-1, c, h * w])
weights = paddle.bmm(qh * self.normalize_fact, kh).reshape(
[bs, n, num_queries, h, w]).transpose([0, 2, 1, 3, 4])
if mask is not None:
weights += mask
        # fix a potential bug: https://github.com/facebookresearch/detr/issues/247
weights = F.softmax(weights.flatten(3), axis=-1).reshape(weights.shape)
weights = self.dropout(weights)
return weights
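# --- Illustrative sketch (not part of the upstream file) --------------------
# The commented-out einsum above was replaced by transpose/reshape + bmm,
# which exports more reliably. This check shows both paths agree (it assumes
# paddle.einsum is available, i.e. Paddle >= 2.3).
def _demo_attention_map_einsum_equivalence():
    bs, q, n, c, h, w = 2, 3, 4, 8, 5, 6
    qh = paddle.randn([bs, q, n, c])
    kh = paddle.randn([bs, n, c, h, w])
    ref = paddle.einsum('bqnc,bnchw->bqnhw', qh, kh)
    lhs = qh.transpose([0, 2, 1, 3]).reshape([-1, q, c])  # [bs*n, q, c]
    rhs = kh.reshape([-1, c, h * w])                      # [bs*n, c, h*w]
    out = paddle.bmm(lhs, rhs).reshape([bs, n, q, h, w]).transpose(
        [0, 2, 1, 3, 4])
    return bool(paddle.allclose(ref, out, atol=1e-5))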
class MaskHeadFPNConv(nn.Layer):
"""This code is based on
https://github.com/facebookresearch/detr/blob/main/models/segmentation.py
Simple convolutional head, using group norm.
Upsampling is done using a FPN approach
"""
def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8):
super().__init__()
inter_dims = [input_dim,
] + [context_dim // (2**i) for i in range(1, 5)]
weight_attr = paddle.ParamAttr(
initializer=paddle.nn.initializer.KaimingUniform())
bias_attr = paddle.framework.ParamAttr(
initializer=paddle.nn.initializer.Constant())
self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups,
weight_attr, bias_attr)
self.conv_inter = nn.LayerList()
for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:]):
self.conv_inter.append(
self._make_layers(in_dims, out_dims, 3, num_groups, weight_attr,
bias_attr))
self.conv_out = nn.Conv2D(
inter_dims[-1],
1,
3,
padding=1,
weight_attr=weight_attr,
bias_attr=bias_attr)
self.adapter = nn.LayerList()
for i in range(len(fpn_dims)):
self.adapter.append(
nn.Conv2D(
fpn_dims[i],
inter_dims[i + 1],
1,
weight_attr=weight_attr,
bias_attr=bias_attr))
def _make_layers(self,
in_dims,
out_dims,
kernel_size,
num_groups,
weight_attr=None,
bias_attr=None):
return nn.Sequential(
nn.Conv2D(
in_dims,
out_dims,
kernel_size,
padding=kernel_size // 2,
weight_attr=weight_attr,
bias_attr=bias_attr),
nn.GroupNorm(num_groups, out_dims),
nn.ReLU())
def forward(self, x, bbox_attention_map, fpns):
x = paddle.concat([
x.tile([bbox_attention_map.shape[1], 1, 1, 1]),
bbox_attention_map.flatten(0, 1)
], 1)
x = self.conv0(x)
for inter_layer, adapter_layer, feat in zip(self.conv_inter[:-1],
self.adapter, fpns):
feat = adapter_layer(feat).tile(
[bbox_attention_map.shape[1], 1, 1, 1])
x = inter_layer(x)
x = feat + F.interpolate(x, size=feat.shape[-2:])
x = self.conv_inter[-1](x)
x = self.conv_out(x)
return x
@register
class DETRHead(nn.Layer):
__shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss']
__inject__ = ['loss']
def __init__(self,
num_classes=80,
hidden_dim=256,
nhead=8,
num_mlp_layers=3,
loss='DETRLoss',
fpn_dims=[1024, 512, 256],
with_mask_head=False,
use_focal_loss=False):
super(DETRHead, self).__init__()
# add background class
self.num_classes = num_classes if use_focal_loss else num_classes + 1
self.hidden_dim = hidden_dim
self.loss = loss
self.with_mask_head = with_mask_head
self.use_focal_loss = use_focal_loss
self.score_head = nn.Linear(hidden_dim, self.num_classes)
self.bbox_head = MLP(hidden_dim,
hidden_dim,
output_dim=4,
num_layers=num_mlp_layers)
if self.with_mask_head:
self.bbox_attention = MultiHeadAttentionMap(hidden_dim, hidden_dim,
nhead)
self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims,
hidden_dim)
self._reset_parameters()
def _reset_parameters(self):
linear_init_(self.score_head)
@classmethod
def from_config(cls, cfg, hidden_dim, nhead, input_shape):
return {
'hidden_dim': hidden_dim,
'nhead': nhead,
'fpn_dims': [i.channels for i in input_shape[::-1]][1:]
}
@staticmethod
def get_gt_mask_from_polygons(gt_poly, pad_mask):
out_gt_mask = []
for polygons, padding in zip(gt_poly, pad_mask):
height, width = int(padding[:, 0].sum()), int(padding[0, :].sum())
masks = []
for obj_poly in polygons:
rles = mask_util.frPyObjects(obj_poly, height, width)
rle = mask_util.merge(rles)
masks.append(
paddle.to_tensor(mask_util.decode(rle)).astype('float32'))
masks = paddle.stack(masks)
masks_pad = paddle.zeros(
[masks.shape[0], pad_mask.shape[1], pad_mask.shape[2]])
masks_pad[:, :height, :width] = masks
out_gt_mask.append(masks_pad)
return out_gt_mask
def forward(self, out_transformer, body_feats, inputs=None):
r"""
Args:
out_transformer (Tuple): (feats: [num_levels, batch_size,
num_queries, hidden_dim],
memory: [batch_size, hidden_dim, h, w],
src_proj: [batch_size, h*w, hidden_dim],
src_mask: [batch_size, 1, 1, h, w])
body_feats (List(Tensor)): list[[B, C, H, W]]
inputs (dict): dict(inputs)
"""
feats, memory, src_proj, src_mask = out_transformer
outputs_logit = self.score_head(feats)
outputs_bbox = F.sigmoid(self.bbox_head(feats))
outputs_seg = None
if self.with_mask_head:
bbox_attention_map = self.bbox_attention(feats[-1], memory,
src_mask)
fpn_feats = [a for a in body_feats[::-1]][1:]
outputs_seg = self.mask_head(src_proj, bbox_attention_map,
fpn_feats)
outputs_seg = outputs_seg.reshape([
feats.shape[1], feats.shape[2], outputs_seg.shape[-2],
outputs_seg.shape[-1]
])
if self.training:
assert inputs is not None
assert 'gt_bbox' in inputs and 'gt_class' in inputs
gt_mask = self.get_gt_mask_from_polygons(
inputs['gt_poly'],
inputs['pad_mask']) if 'gt_poly' in inputs else None
return self.loss(
outputs_bbox,
outputs_logit,
inputs['gt_bbox'],
inputs['gt_class'],
masks=outputs_seg,
gt_mask=gt_mask)
else:
return (outputs_bbox[-1], outputs_logit[-1], outputs_seg)
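# --- Illustrative sketch (not part of the upstream file) --------------------
# DETRHead appends a background slot when use_focal_loss is off (see
# __init__); downstream scoring then typically softmaxes and drops that slot,
# whereas the focal variant has no background class and uses a per-class
# sigmoid. A minimal sketch of the two conventions (the actual
# post-processing lives elsewhere in the repo):
def _demo_detr_score_activation():
    logits_softmax = paddle.randn([2, 100, 81])  # 80 classes + background
    probs = F.softmax(logits_softmax, axis=-1)[..., :-1]  # drop background
    logits_focal = paddle.randn([2, 100, 80])    # focal: no background slot
    probs_focal = F.sigmoid(logits_focal)
    return probs.shape, probs_focal.shape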
@register
class DeformableDETRHead(nn.Layer):
__shared__ = ['num_classes', 'hidden_dim']
__inject__ = ['loss']
def __init__(self,
num_classes=80,
hidden_dim=512,
nhead=8,
num_mlp_layers=3,
loss='DETRLoss'):
super(DeformableDETRHead, self).__init__()
self.num_classes = num_classes
self.hidden_dim = hidden_dim
self.nhead = nhead
self.loss = loss
self.score_head = nn.Linear(hidden_dim, self.num_classes)
self.bbox_head = MLP(hidden_dim,
hidden_dim,
output_dim=4,
num_layers=num_mlp_layers)
self._reset_parameters()
def _reset_parameters(self):
linear_init_(self.score_head)
constant_(self.score_head.bias, -4.595)
constant_(self.bbox_head.layers[-1].weight)
with paddle.no_grad():
bias = paddle.zeros_like(self.bbox_head.layers[-1].bias)
bias[2:] = -2.0
self.bbox_head.layers[-1].bias.set_value(bias)
@classmethod
def from_config(cls, cfg, hidden_dim, nhead, input_shape):
return {'hidden_dim': hidden_dim, 'nhead': nhead}
def forward(self, out_transformer, body_feats, inputs=None):
r"""
Args:
out_transformer (Tuple): (feats: [num_levels, batch_size,
num_queries, hidden_dim],
memory: [batch_size,
\sum_{l=0}^{L-1} H_l \cdot W_l, hidden_dim],
reference_points: [batch_size, num_queries, 2])
body_feats (List(Tensor)): list[[B, C, H, W]]
inputs (dict): dict(inputs)
"""
feats, memory, reference_points = out_transformer
reference_points = inverse_sigmoid(reference_points.unsqueeze(0))
outputs_bbox = self.bbox_head(feats)
        # Equivalent to "outputs_bbox[:, :, :, :2] += reference_points",
        # but the in-place sliced add produces wrong gradients in Paddle.
outputs_bbox = paddle.concat(
[
outputs_bbox[:, :, :, :2] + reference_points,
outputs_bbox[:, :, :, 2:]
],
axis=-1)
outputs_bbox = F.sigmoid(outputs_bbox)
outputs_logit = self.score_head(feats)
if self.training:
assert inputs is not None
assert 'gt_bbox' in inputs and 'gt_class' in inputs
return self.loss(outputs_bbox, outputs_logit, inputs['gt_bbox'],
inputs['gt_class'])
else:
return (outputs_bbox[-1], outputs_logit[-1], None)
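# --- Illustrative sketch (not part of the upstream file) --------------------
# DeformableDETRHead adds the (inverse-sigmoid) reference points to the first
# two bbox channels via concat rather than an in-place sliced add, per the
# comment in forward. A toy reproduction with hypothetical shapes:
def _demo_reference_point_offset():
    num_levels, bs, num_queries = 2, 2, 5
    outputs_bbox = paddle.randn([num_levels, bs, num_queries, 4])
    reference_points = paddle.randn([1, bs, num_queries, 2])  # broadcast over levels
    fused = paddle.concat(
        [outputs_bbox[:, :, :, :2] + reference_points,
         outputs_bbox[:, :, :, 2:]],
        axis=-1)
    return F.sigmoid(fused).shape  # [num_levels, bs, num_queries, 4]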
@register
class DINOHead(nn.Layer):
__inject__ = ['loss']
def __init__(self, loss='DINOLoss', eval_idx=-1):
super(DINOHead, self).__init__()
self.loss = loss
self.eval_idx = eval_idx
def forward(self, out_transformer, body_feats, inputs=None):
(dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits,
dn_meta) = out_transformer
if self.training:
assert inputs is not None
assert 'gt_bbox' in inputs and 'gt_class' in inputs
if dn_meta is not None:
if isinstance(dn_meta, list):
dual_groups = len(dn_meta) - 1
dec_out_bboxes = paddle.split(
dec_out_bboxes, dual_groups + 1, axis=2)
dec_out_logits = paddle.split(
dec_out_logits, dual_groups + 1, axis=2)
enc_topk_bboxes = paddle.split(
enc_topk_bboxes, dual_groups + 1, axis=1)
enc_topk_logits = paddle.split(
enc_topk_logits, dual_groups + 1, axis=1)
dec_out_bboxes_list = []
dec_out_logits_list = []
dn_out_bboxes_list = []
dn_out_logits_list = []
loss = {}
for g_id in range(dual_groups + 1):
if dn_meta[g_id] is not None:
dn_out_bboxes_gid, dec_out_bboxes_gid = paddle.split(
dec_out_bboxes[g_id],
dn_meta[g_id]['dn_num_split'],
axis=2)
dn_out_logits_gid, dec_out_logits_gid = paddle.split(
dec_out_logits[g_id],
dn_meta[g_id]['dn_num_split'],
axis=2)
else:
dn_out_bboxes_gid, dn_out_logits_gid = None, None
dec_out_bboxes_gid = dec_out_bboxes[g_id]
dec_out_logits_gid = dec_out_logits[g_id]
out_bboxes_gid = paddle.concat([
enc_topk_bboxes[g_id].unsqueeze(0),
dec_out_bboxes_gid
])
out_logits_gid = paddle.concat([
enc_topk_logits[g_id].unsqueeze(0),
dec_out_logits_gid
])
loss_gid = self.loss(
out_bboxes_gid,
out_logits_gid,
inputs['gt_bbox'],
inputs['gt_class'],
dn_out_bboxes=dn_out_bboxes_gid,
dn_out_logits=dn_out_logits_gid,
dn_meta=dn_meta[g_id])
# sum loss
for key, value in loss_gid.items():
loss.update({
key: loss.get(key, paddle.zeros([1])) + value
})
# average across (dual_groups + 1)
for key, value in loss.items():
loss.update({key: value / (dual_groups + 1)})
return loss
else:
dn_out_bboxes, dec_out_bboxes = paddle.split(
dec_out_bboxes, dn_meta['dn_num_split'], axis=2)
dn_out_logits, dec_out_logits = paddle.split(
dec_out_logits, dn_meta['dn_num_split'], axis=2)
else:
dn_out_bboxes, dn_out_logits = None, None
out_bboxes = paddle.concat(
[enc_topk_bboxes.unsqueeze(0), dec_out_bboxes])
out_logits = paddle.concat(
[enc_topk_logits.unsqueeze(0), dec_out_logits])
return self.loss(
out_bboxes,
out_logits,
inputs['gt_bbox'],
inputs['gt_class'],
dn_out_bboxes=dn_out_bboxes,
dn_out_logits=dn_out_logits,
dn_meta=dn_meta,
gt_score=inputs.get('gt_score', None))
else:
return (dec_out_bboxes[self.eval_idx],
dec_out_logits[self.eval_idx], None)
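# --- Illustrative sketch (not part of the upstream file) --------------------
# Under denoising training the query axis is laid out as
# [dn queries | matching queries]; dn_meta['dn_num_split'] stores both widths
# so DINOHead can split the decoder outputs back apart. Hypothetical sizes:
def _demo_dn_query_split():
    num_dec_layers, bs = 6, 2
    dn_meta = {'dn_num_split': [60, 300]}  # 60 dn + 300 matching queries
    dec_out_bboxes = paddle.randn([num_dec_layers, bs, 360, 4])
    dn_bboxes, match_bboxes = paddle.split(
        dec_out_bboxes, dn_meta['dn_num_split'], axis=2)
    return dn_bboxes.shape, match_bboxes.shape  # [6,2,60,4], [6,2,300,4]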
@register
class MaskDINOHead(nn.Layer):
__inject__ = ['loss']
def __init__(self, loss='DINOLoss'):
super(MaskDINOHead, self).__init__()
self.loss = loss
def forward(self, out_transformer, body_feats, inputs=None):
(dec_out_logits, dec_out_bboxes, dec_out_masks, enc_out, init_out,
dn_meta) = out_transformer
if self.training:
assert inputs is not None
assert 'gt_bbox' in inputs and 'gt_class' in inputs
assert 'gt_segm' in inputs
if dn_meta is not None:
dn_out_logits, dec_out_logits = paddle.split(
dec_out_logits, dn_meta['dn_num_split'], axis=2)
dn_out_bboxes, dec_out_bboxes = paddle.split(
dec_out_bboxes, dn_meta['dn_num_split'], axis=2)
dn_out_masks, dec_out_masks = paddle.split(
dec_out_masks, dn_meta['dn_num_split'], axis=2)
if init_out is not None:
init_out_logits, init_out_bboxes, init_out_masks = init_out
init_out_logits_dn, init_out_logits = paddle.split(
init_out_logits, dn_meta['dn_num_split'], axis=1)
init_out_bboxes_dn, init_out_bboxes = paddle.split(
init_out_bboxes, dn_meta['dn_num_split'], axis=1)
init_out_masks_dn, init_out_masks = paddle.split(
init_out_masks, dn_meta['dn_num_split'], axis=1)
dec_out_logits = paddle.concat(
[init_out_logits.unsqueeze(0), dec_out_logits])
dec_out_bboxes = paddle.concat(
[init_out_bboxes.unsqueeze(0), dec_out_bboxes])
dec_out_masks = paddle.concat(
[init_out_masks.unsqueeze(0), dec_out_masks])
dn_out_logits = paddle.concat(
[init_out_logits_dn.unsqueeze(0), dn_out_logits])
dn_out_bboxes = paddle.concat(
[init_out_bboxes_dn.unsqueeze(0), dn_out_bboxes])
dn_out_masks = paddle.concat(
[init_out_masks_dn.unsqueeze(0), dn_out_masks])
else:
dn_out_bboxes, dn_out_logits = None, None
dn_out_masks = None
enc_out_logits, enc_out_bboxes, enc_out_masks = enc_out
out_logits = paddle.concat(
[enc_out_logits.unsqueeze(0), dec_out_logits])
out_bboxes = paddle.concat(
[enc_out_bboxes.unsqueeze(0), dec_out_bboxes])
out_masks = paddle.concat(
[enc_out_masks.unsqueeze(0), dec_out_masks])
inputs['gt_segm'] = [gt_segm.astype(out_masks.dtype)
for gt_segm in inputs['gt_segm']]
return self.loss(
out_bboxes,
out_logits,
inputs['gt_bbox'],
inputs['gt_class'],
masks=out_masks,
gt_mask=inputs['gt_segm'],
dn_out_logits=dn_out_logits,
dn_out_bboxes=dn_out_bboxes,
dn_out_masks=dn_out_masks,
dn_meta=dn_meta)
else:
return (dec_out_bboxes[-1], dec_out_logits[-1], dec_out_masks[-1])
@register
class DINOv3Head(nn.Layer):
__inject__ = ['loss']
__shared__ = ['o2m_branch', 'num_queries_o2m']
def __init__(self, loss='DINOLoss', eval_idx=-1, o2m=4, o2m_branch=False, num_queries_o2m=450):
super(DINOv3Head, self).__init__()
self.loss = loss
self.eval_idx = eval_idx
self.o2m = o2m
self.o2m_branch = o2m_branch
self.num_queries_o2m = num_queries_o2m
def forward(self, out_transformer, body_feats, inputs=None):
(dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits,
dn_meta) = out_transformer
if self.training:
assert inputs is not None
assert 'gt_bbox' in inputs and 'gt_class' in inputs
if dn_meta is not None:
num_groups = len(dn_meta)
total_dec_queries = dec_out_bboxes.shape[2]
total_enc_queries = enc_topk_bboxes.shape[1]
loss = {}
if self.o2m_branch:
dec_out_bboxes, dec_out_bboxes_o2m = paddle.split(dec_out_bboxes, [total_dec_queries - self.num_queries_o2m, self.num_queries_o2m], axis=2)
dec_out_logits, dec_out_logits_o2m = paddle.split(dec_out_logits, [total_dec_queries - self.num_queries_o2m, self.num_queries_o2m], axis=2)
enc_topk_bboxes, enc_topk_bboxes_o2m = paddle.split(enc_topk_bboxes, [total_enc_queries - self.num_queries_o2m, self.num_queries_o2m], axis=1)
enc_topk_logits, enc_topk_logits_o2m = paddle.split(enc_topk_logits, [total_enc_queries - self.num_queries_o2m, self.num_queries_o2m], axis=1)
out_bboxes_o2m = paddle.concat([enc_topk_bboxes_o2m.unsqueeze(0), dec_out_bboxes_o2m])
out_logits_o2m = paddle.concat([enc_topk_logits_o2m.unsqueeze(0), dec_out_logits_o2m])
loss_o2m = self.loss(
out_bboxes_o2m,
out_logits_o2m,
inputs['gt_bbox'],
inputs['gt_class'],
dn_out_bboxes=None,
dn_out_logits=None,
dn_meta=None,
o2m=self.o2m)
for key, value in loss_o2m.items():
key = key + '_o2m_branch'
loss.update({
key: loss.get(key, paddle.zeros([1])) + value
})
split_dec_num = [sum(dn['dn_num_split']) for dn in dn_meta]
split_enc_num = [dn['dn_num_split'][1] for dn in dn_meta]
dec_out_bboxes = paddle.split(dec_out_bboxes, split_dec_num, axis=2)
dec_out_logits = paddle.split(dec_out_logits, split_dec_num, axis=2)
enc_topk_bboxes = paddle.split(enc_topk_bboxes, split_enc_num, axis=1)
enc_topk_logits = paddle.split(enc_topk_logits, split_enc_num, axis=1)
for g_id in range(num_groups):
dn_out_bboxes_gid, dec_out_bboxes_gid = paddle.split(
dec_out_bboxes[g_id], dn_meta[g_id]['dn_num_split'], axis=2)
dn_out_logits_gid, dec_out_logits_gid = paddle.split(
dec_out_logits[g_id], dn_meta[g_id]['dn_num_split'], axis=2)
out_bboxes_gid = paddle.concat([
enc_topk_bboxes[g_id].unsqueeze(0), dec_out_bboxes_gid])
out_logits_gid = paddle.concat([
enc_topk_logits[g_id].unsqueeze(0), dec_out_logits_gid])
loss_gid = self.loss(
out_bboxes_gid,
out_logits_gid,
inputs['gt_bbox'],
inputs['gt_class'],
dn_out_bboxes=dn_out_bboxes_gid,
dn_out_logits=dn_out_logits_gid,
dn_meta=dn_meta[g_id])
# sum loss
for key, value in loss_gid.items():
loss.update({
key: loss.get(key, paddle.zeros([1])) + value
})
# average across (dual_groups + 1)
for key, value in loss.items():
if '_o2m_branch' not in key:
loss.update({key: value / num_groups})
return loss
else:
dn_out_bboxes, dn_out_logits = None, None
out_bboxes = paddle.concat(
[enc_topk_bboxes.unsqueeze(0), dec_out_bboxes])
out_logits = paddle.concat(
[enc_topk_logits.unsqueeze(0), dec_out_logits])
return self.loss(
out_bboxes,
out_logits,
inputs['gt_bbox'],
inputs['gt_class'],
dn_out_bboxes=dn_out_bboxes,
dn_out_logits=dn_out_logits,
dn_meta=dn_meta,
gt_score=inputs.get('gt_score', None))
else:
return (dec_out_bboxes[self.eval_idx],
dec_out_logits[self.eval_idx], None)
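# --- Illustrative sketch (not part of the upstream file) --------------------
# With o2m_branch enabled, DINOv3Head lays the query axis out as
# [dn + one-to-one queries | num_queries_o2m one-to-many queries] and slices
# the trailing block off for the auxiliary o2m loss. Hypothetical sizes:
def _demo_o2m_query_split():
    num_dec_layers, bs, num_queries_o2m = 6, 2, 450
    total = 60 + 300 + num_queries_o2m  # dn + o2o + o2m (hypothetical)
    dec_out_bboxes = paddle.randn([num_dec_layers, bs, total, 4])
    main, o2m = paddle.split(
        dec_out_bboxes,
        [total - num_queries_o2m, num_queries_o2m],
        axis=2)
    return main.shape, o2m.shape  # [..., 360, 4] and [..., 450, 4]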
================================================
FILE: ppdet/modeling/heads/face_head.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
from ppdet.core.workspace import register
from ..layers import AnchorGeneratorSSD
from ..cls_utils import _get_class_default_kwargs
@register
class FaceHead(nn.Layer):
"""
Head block for Face detection network
Args:
num_classes (int): Number of output classes.
in_channels (int): Number of input channels.
        anchor_generator (object): instance of anchor generator method.
kernel_size (int): kernel size of Conv2D in FaceHead.
padding (int): padding of Conv2D in FaceHead.
        conv_decay (float): weight decay for conv layer weights.
loss (object): loss of face detection model.
"""
__shared__ = ['num_classes']
__inject__ = ['anchor_generator', 'loss']
def __init__(self,
num_classes=80,
in_channels=[96, 96],
anchor_generator=_get_class_default_kwargs(AnchorGeneratorSSD),
kernel_size=3,
padding=1,
conv_decay=0.,
loss='SSDLoss'):
super(FaceHead, self).__init__()
# add background class
self.num_classes = num_classes + 1
self.in_channels = in_channels
self.anchor_generator = anchor_generator
self.loss = loss
if isinstance(anchor_generator, dict):
self.anchor_generator = AnchorGeneratorSSD(**anchor_generator)
self.num_priors = self.anchor_generator.num_priors
self.box_convs = []
self.score_convs = []
for i, num_prior in enumerate(self.num_priors):
box_conv_name = "boxes{}".format(i)
box_conv = self.add_sublayer(
box_conv_name,
nn.Conv2D(
in_channels=self.in_channels[i],
out_channels=num_prior * 4,
kernel_size=kernel_size,
padding=padding))
self.box_convs.append(box_conv)
score_conv_name = "scores{}".format(i)
score_conv = self.add_sublayer(
score_conv_name,
nn.Conv2D(
in_channels=self.in_channels[i],
out_channels=num_prior * self.num_classes,
kernel_size=kernel_size,
padding=padding))
self.score_convs.append(score_conv)
@classmethod
def from_config(cls, cfg, input_shape):
return {'in_channels': [i.channels for i in input_shape], }
def forward(self, feats, image, gt_bbox=None, gt_class=None):
box_preds = []
cls_scores = []
prior_boxes = []
for feat, box_conv, score_conv in zip(feats, self.box_convs,
self.score_convs):
box_pred = box_conv(feat)
box_pred = paddle.transpose(box_pred, [0, 2, 3, 1])
box_pred = paddle.reshape(box_pred, [0, -1, 4])
box_preds.append(box_pred)
cls_score = score_conv(feat)
cls_score = paddle.transpose(cls_score, [0, 2, 3, 1])
cls_score = paddle.reshape(cls_score, [0, -1, self.num_classes])
cls_scores.append(cls_score)
prior_boxes = self.anchor_generator(feats, image)
if self.training:
return self.get_loss(box_preds, cls_scores, gt_bbox, gt_class,
prior_boxes)
else:
return (box_preds, cls_scores), prior_boxes
def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes):
return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes)
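# --- Illustrative sketch (not part of the upstream file) --------------------
# FaceHead reshapes with shape=[0, -1, 4]: in paddle.reshape a 0 means "copy
# this dimension from the input", so the batch size is kept without reading
# it dynamically. A minimal demonstration with hypothetical sizes:
def _demo_reshape_zero_dim():
    x = paddle.randn([2, 8, 8, 12])    # NHWC box map, 3 priors * 4 coords
    y = paddle.reshape(x, [0, -1, 4])  # -> [2, 8 * 8 * 3, 4]
    return y.shape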
================================================
FILE: ppdet/modeling/heads/fcos_head.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Normal, Constant
from ppdet.core.workspace import register
from ppdet.modeling.layers import ConvNormLayer, MultiClassNMS
__all__ = ['FCOSFeat', 'FCOSHead', 'FCOSHead_ARSL']
class ScaleReg(nn.Layer):
"""
Parameter for scaling the regression outputs.
"""
def __init__(self):
super(ScaleReg, self).__init__()
self.scale_reg = self.create_parameter(
shape=[1],
attr=ParamAttr(initializer=Constant(value=1.)),
dtype="float32")
def forward(self, inputs):
out = inputs * self.scale_reg
return out
@register
class FCOSFeat(nn.Layer):
"""
FCOSFeat of FCOS
Args:
feat_in (int): The channel number of input Tensor.
feat_out (int): The channel number of output Tensor.
num_convs (int): The convolution number of the FCOSFeat.
norm_type (str): Normalization type, 'bn'/'sync_bn'/'gn'.
use_dcn (bool): Whether to use dcn in tower or not.
"""
def __init__(self,
feat_in=256,
feat_out=256,
num_convs=4,
norm_type='bn',
use_dcn=False):
super(FCOSFeat, self).__init__()
self.feat_in = feat_in
self.feat_out = feat_out
self.num_convs = num_convs
self.norm_type = norm_type
self.cls_subnet_convs = []
self.reg_subnet_convs = []
for i in range(self.num_convs):
in_c = feat_in if i == 0 else feat_out
cls_conv_name = 'fcos_head_cls_tower_conv_{}'.format(i)
cls_conv = self.add_sublayer(
cls_conv_name,
ConvNormLayer(
ch_in=in_c,
ch_out=feat_out,
filter_size=3,
stride=1,
norm_type=norm_type,
use_dcn=use_dcn,
bias_on=True,
lr_scale=2.))
self.cls_subnet_convs.append(cls_conv)
reg_conv_name = 'fcos_head_reg_tower_conv_{}'.format(i)
reg_conv = self.add_sublayer(
reg_conv_name,
ConvNormLayer(
ch_in=in_c,
ch_out=feat_out,
filter_size=3,
stride=1,
norm_type=norm_type,
use_dcn=use_dcn,
bias_on=True,
lr_scale=2.))
self.reg_subnet_convs.append(reg_conv)
def forward(self, fpn_feat):
cls_feat = fpn_feat
reg_feat = fpn_feat
for i in range(self.num_convs):
cls_feat = F.relu(self.cls_subnet_convs[i](cls_feat))
reg_feat = F.relu(self.reg_subnet_convs[i](reg_feat))
return cls_feat, reg_feat
@register
class FCOSHead(nn.Layer):
"""
FCOSHead
Args:
num_classes (int): Number of classes
fcos_feat (object): Instance of 'FCOSFeat'
fpn_stride (list): The stride of each FPN Layer
prior_prob (float): Used to set the bias init for the class prediction layer
        norm_reg_targets (bool): Normalize the regression targets if true
        centerness_on_reg (bool): Whether to predict centerness on the regression branch rather than the classification branch
num_shift (float): Relative offset between the center of the first shift and the top-left corner of img
fcos_loss (object): Instance of 'FCOSLoss'
nms (object): Instance of 'MultiClassNMS'
trt (bool): Whether to enable TensorRT for NMS when deploying
"""
__inject__ = ['fcos_feat', 'fcos_loss', 'nms']
__shared__ = ['num_classes', 'trt']
def __init__(self,
num_classes=80,
fcos_feat='FCOSFeat',
fpn_stride=[8, 16, 32, 64, 128],
prior_prob=0.01,
multiply_strides_reg_targets=False,
norm_reg_targets=True,
centerness_on_reg=True,
num_shift=0.5,
sqrt_score=False,
fcos_loss='FCOSLoss',
nms='MultiClassNMS',
trt=False):
super(FCOSHead, self).__init__()
self.fcos_feat = fcos_feat
self.num_classes = num_classes
self.fpn_stride = fpn_stride
self.prior_prob = prior_prob
self.fcos_loss = fcos_loss
self.norm_reg_targets = norm_reg_targets
self.centerness_on_reg = centerness_on_reg
self.multiply_strides_reg_targets = multiply_strides_reg_targets
self.num_shift = num_shift
self.nms = nms
if isinstance(self.nms, MultiClassNMS) and trt:
self.nms.trt = trt
self.sqrt_score = sqrt_score
self.is_teacher = False
conv_cls_name = "fcos_head_cls"
bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob)
self.fcos_head_cls = self.add_sublayer(
conv_cls_name,
nn.Conv2D(
in_channels=256,
out_channels=self.num_classes,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(
initializer=Constant(value=bias_init_value))))
conv_reg_name = "fcos_head_reg"
self.fcos_head_reg = self.add_sublayer(
conv_reg_name,
nn.Conv2D(
in_channels=256,
out_channels=4,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=0))))
conv_centerness_name = "fcos_head_centerness"
self.fcos_head_centerness = self.add_sublayer(
conv_centerness_name,
nn.Conv2D(
in_channels=256,
out_channels=1,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=0))))
self.scales_regs = []
for i in range(len(self.fpn_stride)):
lvl = int(math.log(int(self.fpn_stride[i]), 2))
feat_name = 'p{}_feat'.format(lvl)
scale_reg = self.add_sublayer(feat_name, ScaleReg())
self.scales_regs.append(scale_reg)
def _compute_locations_by_level(self, fpn_stride, feature, num_shift=0.5):
"""
Compute locations of anchor points of each FPN layer
Args:
fpn_stride (int): The stride of current FPN feature map
feature (Tensor): Tensor of current FPN feature map
num_shift (float): Offset of the anchor point within each cell, as a fraction of the stride
Returns:
Anchor point locations of current FPN feature map
"""
h, w = feature.shape[2], feature.shape[3]
shift_x = paddle.arange(0, w * fpn_stride, fpn_stride)
shift_y = paddle.arange(0, h * fpn_stride, fpn_stride)
shift_x = paddle.unsqueeze(shift_x, axis=0)
shift_y = paddle.unsqueeze(shift_y, axis=1)
shift_x = paddle.expand(shift_x, shape=[h, w])
shift_y = paddle.expand(shift_y, shape=[h, w])
shift_x = paddle.reshape(shift_x, shape=[-1])
shift_y = paddle.reshape(shift_y, shape=[-1])
location = paddle.stack(
[shift_x, shift_y], axis=-1) + float(fpn_stride * num_shift)
return location
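# A worked example of the grid above (values assumed, not from the original
# file): with fpn_stride=8, a 2x2 feature map and num_shift=0.5, the shifts
# are x, y in {0, 8}, and adding 8 * 0.5 = 4 yields the cell centres
#   [[4., 4.], [12., 4.], [4., 12.], [12., 12.]]
# in input-image coordinates.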
def forward(self, fpn_feats, targets=None):
assert len(fpn_feats) == len(
self.fpn_stride
), "The size of fpn_feats is not equal to size of fpn_stride"
cls_logits_list = []
bboxes_reg_list = []
centerness_list = []
for scale_reg, fpn_stride, fpn_feat in zip(self.scales_regs,
self.fpn_stride, fpn_feats):
fcos_cls_feat, fcos_reg_feat = self.fcos_feat(fpn_feat)
cls_logits = self.fcos_head_cls(fcos_cls_feat)
bbox_reg = scale_reg(self.fcos_head_reg(fcos_reg_feat))
if self.centerness_on_reg:
centerness = self.fcos_head_centerness(fcos_reg_feat)
else:
centerness = self.fcos_head_centerness(fcos_cls_feat)
if self.norm_reg_targets:
bbox_reg = F.relu(bbox_reg)
if self.multiply_strides_reg_targets:
bbox_reg = bbox_reg * fpn_stride
else:
if not self.training or targets.get(
'get_data',
False) or targets.get('is_teacher', False):
bbox_reg = bbox_reg * fpn_stride
else:
bbox_reg = paddle.exp(bbox_reg)
cls_logits_list.append(cls_logits)
bboxes_reg_list.append(bbox_reg)
centerness_list.append(centerness)
if targets is not None:
self.is_teacher = targets.get('is_teacher', False)
if self.is_teacher:
return [cls_logits_list, bboxes_reg_list, centerness_list]
if self.training and targets is not None:
get_data = targets.get('get_data', False)
if get_data:
return [cls_logits_list, bboxes_reg_list, centerness_list]
losses = {}
fcos_head_outs = [cls_logits_list, bboxes_reg_list, centerness_list]
losses_fcos = self.get_loss(fcos_head_outs, targets)
losses.update(losses_fcos)
total_loss = paddle.add_n(list(losses.values()))
losses.update({'loss': total_loss})
return losses
else:
# eval or infer
locations_list = []
for fpn_stride, feature in zip(self.fpn_stride, fpn_feats):
location = self._compute_locations_by_level(fpn_stride, feature,
self.num_shift)
locations_list.append(location)
fcos_head_outs = [
locations_list, cls_logits_list, bboxes_reg_list,
centerness_list
]
return fcos_head_outs
def get_loss(self, fcos_head_outs, targets):
cls_logits, bboxes_reg, centerness = fcos_head_outs
# gather labels, reg_target and centerness for each FPN level
tag_labels, tag_bboxes, tag_centerness = [], [], []
for i in range(len(self.fpn_stride)):
k_lbl = 'labels{}'.format(i)
if k_lbl in targets:
tag_labels.append(targets[k_lbl])
k_box = 'reg_target{}'.format(i)
if k_box in targets:
tag_bboxes.append(targets[k_box])
k_ctn = 'centerness{}'.format(i)
if k_ctn in targets:
tag_centerness.append(targets[k_ctn])
losses_fcos = self.fcos_loss(cls_logits, bboxes_reg, centerness,
tag_labels, tag_bboxes, tag_centerness)
return losses_fcos
def _post_process_by_level(self,
locations,
box_cls,
box_reg,
box_ctn,
sqrt_score=False):
box_scores = F.sigmoid(box_cls).flatten(2).transpose([0, 2, 1])
box_centerness = F.sigmoid(box_ctn).flatten(2).transpose([0, 2, 1])
pred_scores = box_scores * box_centerness
if sqrt_score:
pred_scores = paddle.sqrt(pred_scores)
box_reg_ch_last = box_reg.flatten(2).transpose([0, 2, 1])
box_reg_decoding = paddle.stack(
[
locations[:, 0] - box_reg_ch_last[:, :, 0],
locations[:, 1] - box_reg_ch_last[:, :, 1],
locations[:, 0] + box_reg_ch_last[:, :, 2],
locations[:, 1] + box_reg_ch_last[:, :, 3]
],
axis=1)
pred_boxes = box_reg_decoding.transpose([0, 2, 1])
return pred_scores, pred_boxes
def post_process(self, fcos_head_outs, scale_factor):
locations, cls_logits, bboxes_reg, centerness = fcos_head_outs
pred_bboxes, pred_scores = [], []
for pts, cls, reg, ctn in zip(locations, cls_logits, bboxes_reg,
centerness):
scores, boxes = self._post_process_by_level(pts, cls, reg, ctn,
self.sqrt_score)
pred_scores.append(scores)
pred_bboxes.append(boxes)
pred_bboxes = paddle.concat(pred_bboxes, axis=1)
pred_scores = paddle.concat(pred_scores, axis=1)
# scale bboxes back to the original image size
scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1)
scale_factor = paddle.concat(
[scale_x, scale_y, scale_x, scale_y], axis=-1).reshape([-1, 1, 4])
pred_bboxes /= scale_factor
pred_scores = pred_scores.transpose([0, 2, 1])
bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)
return bbox_pred, bbox_num
@register
class FCOSHead_ARSL(FCOSHead):
"""
FCOSHead of ARSL for semi-supervised detection (SSOD)
Args:
fcos_feat (object): Instance of 'FCOSFeat'
num_classes (int): Number of classes
fpn_stride (list): The stride of each FPN Layer
prior_prob (float): Used to set the bias init for the class prediction layer
fcos_loss (object): Instance of 'FCOSLoss'
norm_reg_targets (bool): Whether to normalize the regression targets by the FPN stride
centerness_on_reg (bool): Whether to predict centerness on the regression branch (True) or the classification branch (False)
nms (object): Instance of 'MultiClassNMS'
trt (bool): Whether to enable TensorRT for NMS when deploying
"""
__inject__ = ['fcos_feat', 'fcos_loss', 'nms']
__shared__ = ['num_classes', 'trt']
def __init__(self,
num_classes=80,
fcos_feat='FCOSFeat',
fpn_stride=[8, 16, 32, 64, 128],
prior_prob=0.01,
multiply_strides_reg_targets=False,
norm_reg_targets=True,
centerness_on_reg=True,
num_shift=0.5,
sqrt_score=False,
fcos_loss='FCOSLossMILC',
nms='MultiClassNMS',
trt=False):
super(FCOSHead_ARSL, self).__init__()
self.fcos_feat = fcos_feat
self.num_classes = num_classes
self.fpn_stride = fpn_stride
self.prior_prob = prior_prob
self.fcos_loss = fcos_loss
self.norm_reg_targets = norm_reg_targets
self.centerness_on_reg = centerness_on_reg
self.multiply_strides_reg_targets = multiply_strides_reg_targets
self.num_shift = num_shift
self.nms = nms
if isinstance(self.nms, MultiClassNMS) and trt:
self.nms.trt = trt
self.sqrt_score = sqrt_score
conv_cls_name = "fcos_head_cls"
bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob)
self.fcos_head_cls = self.add_sublayer(
conv_cls_name,
nn.Conv2D(
in_channels=256,
out_channels=self.num_classes,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(
initializer=Constant(value=bias_init_value))))
conv_reg_name = "fcos_head_reg"
self.fcos_head_reg = self.add_sublayer(
conv_reg_name,
nn.Conv2D(
in_channels=256,
out_channels=4,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=0))))
conv_centerness_name = "fcos_head_centerness"
self.fcos_head_centerness = self.add_sublayer(
conv_centerness_name,
nn.Conv2D(
in_channels=256,
out_channels=1,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=0))))
self.scales_regs = []
for i in range(len(self.fpn_stride)):
lvl = int(math.log(int(self.fpn_stride[i]), 2))
feat_name = 'p{}_feat'.format(lvl)
scale_reg = self.add_sublayer(feat_name, ScaleReg())
self.scales_regs.append(scale_reg)
def forward(self, fpn_feats, targets=None):
assert len(fpn_feats) == len(
self.fpn_stride
), "The size of fpn_feats is not equal to size of fpn_stride"
cls_logits_list = []
bboxes_reg_list = []
centerness_list = []
for scale_reg, fpn_stride, fpn_feat in zip(self.scales_regs,
self.fpn_stride, fpn_feats):
fcos_cls_feat, fcos_reg_feat = self.fcos_feat(fpn_feat)
cls_logits = self.fcos_head_cls(fcos_cls_feat)
bbox_reg = scale_reg(self.fcos_head_reg(fcos_reg_feat))
if self.centerness_on_reg:
centerness = self.fcos_head_centerness(fcos_reg_feat)
else:
centerness = self.fcos_head_centerness(fcos_cls_feat)
if self.norm_reg_targets:
bbox_reg = F.relu(bbox_reg)
if not self.training:
bbox_reg = bbox_reg * fpn_stride
else:
bbox_reg = paddle.exp(bbox_reg)
cls_logits_list.append(cls_logits)
bboxes_reg_list.append(bbox_reg)
centerness_list.append(centerness)
if not self.training:
locations_list = []
for fpn_stride, feature in zip(self.fpn_stride, fpn_feats):
location = self._compute_locations_by_level(fpn_stride, feature)
locations_list.append(location)
return locations_list, cls_logits_list, bboxes_reg_list, centerness_list
else:
return cls_logits_list, bboxes_reg_list, centerness_list
def get_loss(self, fcos_head_outs, tag_labels, tag_bboxes, tag_centerness):
cls_logits, bboxes_reg, centerness = fcos_head_outs
return self.fcos_loss(cls_logits, bboxes_reg, centerness, tag_labels,
tag_bboxes, tag_centerness)
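# A minimal smoke test for the FCOSFeat tower above (an illustrative sketch;
# the batch and spatial sizes are assumptions, not values mandated by this
# file):
if __name__ == '__main__':
feat_tower = FCOSFeat(feat_in=256, feat_out=256, num_convs=4, norm_type='bn')
x = paddle.rand([2, 256, 32, 32])  # one hypothetical FPN level
cls_feat, reg_feat = feat_tower(x)
assert cls_feat.shape == [2, 256, 32, 32] and reg_feat.shape == [2, 256, 32, 32]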
================================================
FILE: ppdet/modeling/heads/fcosr_head.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from .fcos_head import ScaleReg
from ..initializer import bias_init_with_prob, constant_, normal_
from ..ops import get_act_fn, anchor_generator
from ..rbox_utils import box2corners
from ..losses import ProbIoULoss
import numpy as np
__all__ = ['FCOSRHead']
def trunc_div(a, b):
"""Divide a by b and truncate the result toward zero (C-style)."""
ipt = paddle.divide(a, b)
sign_ipt = paddle.sign(ipt)
abs_ipt = paddle.abs(ipt)
abs_ipt = paddle.floor(abs_ipt)
out = paddle.multiply(sign_ipt, abs_ipt)
return out
def fmod(a, b):
"""C-style floating remainder: the result keeps the sign of `a`."""
return a - trunc_div(a, b) * b
def fmod_eval(a, b):
"""Export-friendly fmod variant: an int32 cast replaces sign/floor."""
return a - a.divide(b).cast(paddle.int32).cast(paddle.float32) * b
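# Commented usage sketch (values assumed; half_pi = pi/2 as a tensor):
#   fmod(paddle.to_tensor([2.0]), half_pi)   -> ~0.43   (2.0 - 1 * pi/2)
#   fmod(paddle.to_tensor([-2.0]), half_pi)  -> ~-0.43  (sign follows the input)
# The head uses this to wrap predicted angles into (-pi/2, pi/2); fmod_eval
# trades sign/floor for an int32 cast, presumably for cleaner static-graph
# export.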
class ConvBNLayer(nn.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size=3,
stride=1,
groups=1,
padding=0,
norm_cfg={'name': 'gn',
'num_groups': 32},
act=None):
super(ConvBNLayer, self).__init__()
self.conv = nn.Conv2D(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=filter_size,
stride=stride,
padding=padding,
groups=groups,
bias_attr=False)
norm_type = norm_cfg['name']
if norm_type in ['sync_bn', 'bn']:
self.norm = nn.BatchNorm2D(
ch_out,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
else:
groups = norm_cfg.get('num_groups', 1)
self.norm = nn.GroupNorm(
num_groups=groups,
num_channels=ch_out,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
self.act = get_act_fn(act) if act is None or isinstance(act, (
str, dict)) else act
def forward(self, x):
x = self.conv(x)
x = self.norm(x)
x = self.act(x)
return x
@register
class FCOSRHead(nn.Layer):
""" FCOSR Head, refer to https://arxiv.org/abs/2111.10780 for details """
__shared__ = ['num_classes', 'trt']
__inject__ = ['assigner', 'nms']
def __init__(self,
num_classes=15,
in_channels=256,
feat_channels=256,
stacked_convs=4,
act='relu',
fpn_strides=[4, 8, 16, 32, 64],
trt=False,
loss_weight={'class': 1.0,
'probiou': 1.0},
norm_cfg={'name': 'gn',
'num_groups': 32},
assigner='FCOSRAssigner',
nms='MultiClassNMS'):
super(FCOSRHead, self).__init__()
self.in_channels = in_channels
self.num_classes = num_classes
self.fpn_strides = fpn_strides
self.stacked_convs = stacked_convs
self.loss_weight = loss_weight
self.half_pi = paddle.to_tensor(
[1.5707963267948966], dtype=paddle.float32)
self.probiou_loss = ProbIoULoss(mode='l1')
act = get_act_fn(
act, trt=trt) if act is None or isinstance(act,
(str, dict)) else act
self.trt = trt
self.loss_weight = loss_weight
self.assigner = assigner
self.nms = nms
# stem
self.stem_cls = nn.LayerList()
self.stem_reg = nn.LayerList()
for i in range(self.stacked_convs):
self.stem_cls.append(
ConvBNLayer(
self.in_channels[i],
feat_channels,
filter_size=3,
stride=1,
padding=1,
norm_cfg=norm_cfg,
act=act))
self.stem_reg.append(
ConvBNLayer(
self.in_channels[i],
feat_channels,
filter_size=3,
stride=1,
padding=1,
norm_cfg=norm_cfg,
act=act))
self.scales = nn.LayerList(
[ScaleReg() for _ in range(len(fpn_strides))])
# prediction
self.pred_cls = nn.Conv2D(feat_channels, self.num_classes, 3, padding=1)
self.pred_xy = nn.Conv2D(feat_channels, 2, 3, padding=1)
self.pred_wh = nn.Conv2D(feat_channels, 2, 3, padding=1)
self.pred_angle = nn.Conv2D(feat_channels, 1, 3, padding=1)
self._init_weights()
def _init_weights(self):
for cls_, reg_ in zip(self.stem_cls, self.stem_reg):
normal_(cls_.conv.weight, std=0.01)
normal_(reg_.conv.weight, std=0.01)
bias_cls = bias_init_with_prob(0.01)
normal_(self.pred_cls.weight, std=0.01)
constant_(self.pred_cls.bias, bias_cls)
normal_(self.pred_xy.weight, std=0.01)
normal_(self.pred_wh.weight, std=0.01)
normal_(self.pred_angle.weight, std=0.01)
@classmethod
def from_config(cls, cfg, input_shape):
return {'in_channels': [i.channels for i in input_shape], }
def _generate_anchors(self, feats):
if self.trt:
anchor_points = []
for feat, stride in zip(feats, self.fpn_strides):
_, _, h, w = feat.shape
anchor, _ = anchor_generator(
feat,
stride * 4,
1.0, [1.0, 1.0, 1.0, 1.0], [stride, stride],
offset=0.5)
x1, y1, x2, y2 = paddle.split(anchor, 4, axis=-1)
xc = (x1 + x2 + 1) / 2
yc = (y1 + y2 + 1) / 2
anchor_point = paddle.concat(
[xc, yc], axis=-1).reshape((1, h * w, 2))
anchor_points.append(anchor_point)
anchor_points = paddle.concat(anchor_points, axis=1)
return anchor_points, None, None
else:
anchor_points = []
stride_tensor = []
num_anchors_list = []
for feat, stride in zip(feats, self.fpn_strides):
_, _, h, w = feat.shape
shift_x = (paddle.arange(end=w) + 0.5) * stride
shift_y = (paddle.arange(end=h) + 0.5) * stride
shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)
anchor_point = paddle.cast(
paddle.stack(
[shift_x, shift_y], axis=-1), dtype='float32')
anchor_points.append(anchor_point.reshape([1, -1, 2]))
stride_tensor.append(
paddle.full(
[1, h * w, 1], stride, dtype='float32'))
num_anchors_list.append(h * w)
anchor_points = paddle.concat(anchor_points, axis=1)
stride_tensor = paddle.concat(stride_tensor, axis=1)
return anchor_points, stride_tensor, num_anchors_list
def forward(self, feats, target=None):
if self.training:
return self.forward_train(feats, target)
else:
return self.forward_eval(feats, target)
def forward_train(self, feats, target=None):
anchor_points, stride_tensor, num_anchors_list = self._generate_anchors(
feats)
cls_pred_list, reg_pred_list = [], []
for stride, feat, scale in zip(self.fpn_strides, feats, self.scales):
# cls
cls_feat = feat
for cls_layer in self.stem_cls:
cls_feat = cls_layer(cls_feat)
cls_pred = F.sigmoid(self.pred_cls(cls_feat))
cls_pred_list.append(cls_pred.flatten(2).transpose((0, 2, 1)))
# reg
reg_feat = feat
for reg_layer in self.stem_reg:
reg_feat = reg_layer(reg_feat)
reg_xy = scale(self.pred_xy(reg_feat)) * stride
reg_wh = F.elu(scale(self.pred_wh(reg_feat)) + 1.) * stride
reg_angle = self.pred_angle(reg_feat)
reg_angle = fmod(reg_angle, self.half_pi)
reg_pred = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1)
reg_pred_list.append(reg_pred.flatten(2).transpose((0, 2, 1)))
cls_pred_list = paddle.concat(cls_pred_list, axis=1)
reg_pred_list = paddle.concat(reg_pred_list, axis=1)
return self.get_loss([
cls_pred_list, reg_pred_list, anchor_points, stride_tensor,
num_anchors_list
], target)
def forward_eval(self, feats, target=None):
cls_pred_list, reg_pred_list = [], []
anchor_points, _, _ = self._generate_anchors(feats)
for stride, feat, scale in zip(self.fpn_strides, feats, self.scales):
b, _, h, w = feat.shape
# cls
cls_feat = feat
for cls_layer in self.stem_cls:
cls_feat = cls_layer(cls_feat)
cls_pred = F.sigmoid(self.pred_cls(cls_feat))
cls_pred_list.append(cls_pred.reshape([b, self.num_classes, h * w]))
# reg
reg_feat = feat
for reg_layer in self.stem_reg:
reg_feat = reg_layer(reg_feat)
reg_xy = scale(self.pred_xy(reg_feat)) * stride
reg_wh = F.elu(scale(self.pred_wh(reg_feat)) + 1.) * stride
reg_angle = self.pred_angle(reg_feat)
reg_angle = fmod_eval(reg_angle, self.half_pi)
reg_pred = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1)
reg_pred = reg_pred.reshape([b, 5, h * w]).transpose((0, 2, 1))
reg_pred_list.append(reg_pred)
cls_pred_list = paddle.concat(cls_pred_list, axis=2)
reg_pred_list = paddle.concat(reg_pred_list, axis=1)
reg_pred_list = self._bbox_decode(anchor_points, reg_pred_list)
return cls_pred_list, reg_pred_list
def _bbox_decode(self, points, reg_pred_list):
xy, wha = paddle.split(reg_pred_list, [2, 3], axis=-1)
xy = xy + points
return paddle.concat([xy, wha], axis=-1)
def _box2corners(self, pred_bboxes):
""" convert (x, y, w, h, angle) to (x1, y1, x2, y2, x3, y3, x4, y4)
Args:
pred_bboxes (Tensor): [B, N, 5]
Returns:
polys (Tensor): [B, N, 8]
"""
x, y, w, h, angle = paddle.split(pred_bboxes, 5, axis=-1)
cos_a_half = paddle.cos(angle) * 0.5
sin_a_half = paddle.sin(angle) * 0.5
w_x = cos_a_half * w
w_y = sin_a_half * w
h_x = -sin_a_half * h
h_y = cos_a_half * h
return paddle.concat(
[
x + w_x + h_x, y + w_y + h_y, x - w_x + h_x, y - w_y + h_y,
x - w_x - h_x, y - w_y - h_y, x + w_x - h_x, y + w_y - h_y
],
axis=-1)
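# A worked example of the decode above (values assumed): the axis-aligned box
# (x=0, y=0, w=4, h=2, angle=0) gives cos_a_half=0.5 and sin_a_half=0, hence
# corners (2, 1), (-2, 1), (-2, -1), (2, -1); each output channel pair is one
# corner of the rotated rectangle.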
def get_loss(self, head_outs, gt_meta):
cls_pred_list, reg_pred_list, anchor_points, stride_tensor, num_anchors_list = head_outs
gt_labels = gt_meta['gt_class']
gt_bboxes = gt_meta['gt_bbox']
gt_rboxes = gt_meta['gt_rbox']
pad_gt_mask = gt_meta['pad_gt_mask']
# decode
pred_rboxes = self._bbox_decode(anchor_points, reg_pred_list)
# label assignment
assigned_labels, assigned_rboxes, assigned_scores = \
self.assigner(
anchor_points,
stride_tensor,
num_anchors_list,
gt_labels,
gt_bboxes,
gt_rboxes,
pad_gt_mask,
self.num_classes,
pred_rboxes
)
# reg_loss
mask_positive = (assigned_labels != self.num_classes)
num_pos = mask_positive.sum().item()
if num_pos > 0:
bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 5])
pred_rboxes_pos = paddle.masked_select(pred_rboxes,
bbox_mask).reshape([-1, 5])
assigned_rboxes_pos = paddle.masked_select(
assigned_rboxes, bbox_mask).reshape([-1, 5])
bbox_weight = paddle.masked_select(
assigned_scores.sum(-1), mask_positive).reshape([-1])
avg_factor = bbox_weight.sum()
loss_probiou = self.probiou_loss(pred_rboxes_pos,
assigned_rboxes_pos)
loss_probiou = paddle.sum(loss_probiou * bbox_weight) / avg_factor
else:
loss_probiou = pred_rboxes.sum() * 0.
avg_factor = max(num_pos, 1.0)
# cls_loss
loss_cls = self._qfocal_loss(
cls_pred_list, assigned_scores, reduction='sum')
loss_cls = loss_cls / avg_factor
loss = self.loss_weight['class'] * loss_cls + \
self.loss_weight['probiou'] * loss_probiou
out_dict = {
'loss': loss,
'loss_probiou': loss_probiou,
'loss_cls': loss_cls
}
return out_dict
@staticmethod
def _qfocal_loss(score, label, gamma=2.0, reduction='sum'):
weight = (score - label).pow(gamma)
loss = F.binary_cross_entropy(
score, label, weight=weight, reduction=reduction)
return loss
def post_process(self, head_outs, scale_factor):
pred_scores, pred_rboxes = head_outs
# [B, N, 5] -> [B, N, 8] corner points
pred_rboxes = self._box2corners(pred_rboxes)
# scale bboxes back to the original image size
scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1)
scale_factor = paddle.concat(
[
scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x,
scale_y
],
axis=-1).reshape([-1, 1, 8])
pred_rboxes /= scale_factor
bbox_pred, bbox_num, before_nms_indexes = self.nms(pred_rboxes,
pred_scores)
return bbox_pred, bbox_num, before_nms_indexes
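# An illustrative smoke test for the remainder helpers defined above (the
# sample values are assumptions):
if __name__ == '__main__':
a = paddle.to_tensor([2.0, -2.0, 0.5])
b = paddle.to_tensor([1.5707963267948966])
assert np.allclose(fmod(a, b).numpy(), np.fmod(a.numpy(), b.numpy()), atol=1e-6)
assert np.allclose(fmod_eval(a, b).numpy(), np.fmod(a.numpy(), b.numpy()), atol=1e-6)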
================================================
FILE: ppdet/modeling/heads/gfl_head.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The code is based on:
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/gfl_head.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Normal, Constant
from ppdet.core.workspace import register
from ppdet.modeling.bbox_utils import distance2bbox, bbox2distance, batch_distance2bbox
from ppdet.data.transform.atss_assigner import bbox_overlaps
__all__ = ['GFLHead', 'LDGFLHead']
class ScaleReg(nn.Layer):
"""
Parameter for scaling the regression outputs.
"""
def __init__(self):
super(ScaleReg, self).__init__()
self.scale_reg = self.create_parameter(
shape=[1],
attr=ParamAttr(initializer=Constant(value=1.)),
dtype="float32")
def forward(self, inputs):
out = inputs * self.scale_reg
return out
class Integral(nn.Layer):
"""A fixed layer for calculating integral result from distribution.
This layer calculates the target location by :math: `sum{P(y_i) * y_i}`,
P(y_i) denotes the softmax vector that represents the discrete distribution
y_i denotes the discrete set, usually {0, 1, 2, ..., reg_max}
Args:
reg_max (int): The maximal value of the discrete set. Default: 16. You
may want to reset it according to your new dataset or related
settings.
"""
def __init__(self, reg_max=16):
super(Integral, self).__init__()
self.reg_max = reg_max
self.register_buffer('project',
paddle.linspace(0, self.reg_max, self.reg_max + 1))
def forward(self, x):
"""Forward feature from the regression head to get integral result of
bounding box location.
Args:
x (Tensor): Features of the regression head, shape (N, 4*(n+1)),
n is self.reg_max.
Returns:
x (Tensor): Integral result of box locations, i.e., distance
offsets from the box center in four directions, shape (N, 4).
"""
x = F.softmax(x.reshape([-1, self.reg_max + 1]), axis=1)
x = F.linear(x, self.project)
if self.training:
x = x.reshape([-1, 4])
return x
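# A worked example of the expectation decode (values assumed): with reg_max=2
# and per-side logits [0, 0, 4], softmax gives ~[0.018, 0.018, 0.964] and the
# projected distance is 0*0.018 + 1*0.018 + 2*0.964 ≈ 1.95 bins, i.e. the
# discrete distribution is reduced to one continuous offset per box side.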
@register
class DGQP(nn.Layer):
"""Distribution-Guided Quality Predictor of GFocal head
Args:
reg_topk (int): top-k statistics of distribution to guide LQE
reg_channels (int): hidden layer unit to generate LQE
add_mean (bool): Whether to calculate the mean of top-k statistics
"""
def __init__(self, reg_topk=4, reg_channels=64, add_mean=True):
super(DGQP, self).__init__()
self.reg_topk = reg_topk
self.reg_channels = reg_channels
self.add_mean = add_mean
self.total_dim = reg_topk
if add_mean:
self.total_dim += 1
self.reg_conv1 = self.add_sublayer(
'dgqp_reg_conv1',
nn.Conv2D(
in_channels=4 * self.total_dim,
out_channels=self.reg_channels,
kernel_size=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=0))))
self.reg_conv2 = self.add_sublayer(
'dgqp_reg_conv2',
nn.Conv2D(
in_channels=self.reg_channels,
out_channels=1,
kernel_size=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=0))))
def forward(self, x):
"""Forward feature from the regression head to get integral result of
bounding box location.
Args:
x (Tensor): Features of the regression head, shape (N, 4*(n+1)),
n is self.reg_max.
Returns:
x (Tensor): Integral result of box locations, i.e., distance
offsets from the box center in four directions, shape (N, 4).
"""
N, _, H, W = x.shape[:]
prob = F.softmax(x.reshape([N, 4, -1, H, W]), axis=2)
prob_topk, _ = prob.topk(self.reg_topk, axis=2)
if self.add_mean:
stat = paddle.concat(
[prob_topk, prob_topk.mean(
axis=2, keepdim=True)], axis=2)
else:
stat = prob_topk
y = F.relu(self.reg_conv1(stat.reshape([N, 4 * self.total_dim, H, W])))
y = F.sigmoid(self.reg_conv2(y))
return y
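# Commented usage sketch (shapes assumed): with reg_max=16 and reg_topk=4,
#   x = paddle.rand([2, 4 * 17, 20, 20])     # hypothetical DFL logits
#   y = DGQP(reg_topk=4, reg_channels=64)(x)
#   # y.shape == [2, 1, 20, 20], a per-location quality score in (0, 1)
# Only the top-k (plus mean) statistics of each side's distribution are used,
# the intuition being that sharper distributions indicate more reliable boxes.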
@register
class GFLHead(nn.Layer):
"""
GFLHead
Args:
conv_feat (object): Instance of 'FCOSFeat'
num_classes (int): Number of classes
fpn_stride (list): The stride of each FPN Layer
prior_prob (float): Used to set the bias init for the class prediction layer
loss_class (object): Instance of QualityFocalLoss.
loss_dfl (object): Instance of DistributionFocalLoss.
loss_bbox (object): Instance of bbox loss.
reg_max (int): Max value of the integral set :math:`{0, ..., reg_max}`
in QFL setting. Default: 16.
"""
__inject__ = [
'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', 'nms'
]
__shared__ = ['num_classes']
def __init__(self,
conv_feat='FCOSFeat',
dgqp_module=None,
num_classes=80,
fpn_stride=[8, 16, 32, 64, 128],
prior_prob=0.01,
loss_class='QualityFocalLoss',
loss_dfl='DistributionFocalLoss',
loss_bbox='GIoULoss',
reg_max=16,
feat_in_chan=256,
nms=None,
nms_pre=1000,
cell_offset=0):
super(GFLHead, self).__init__()
self.conv_feat = conv_feat
self.dgqp_module = dgqp_module
self.num_classes = num_classes
self.fpn_stride = fpn_stride
self.prior_prob = prior_prob
self.loss_qfl = loss_class
self.loss_dfl = loss_dfl
self.loss_bbox = loss_bbox
self.reg_max = reg_max
self.feat_in_chan = feat_in_chan
self.nms = nms
self.nms_pre = nms_pre
self.cell_offset = cell_offset
self.use_sigmoid = self.loss_qfl.use_sigmoid
if self.use_sigmoid:
self.cls_out_channels = self.num_classes
else:
self.cls_out_channels = self.num_classes + 1
conv_cls_name = "gfl_head_cls"
bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob)
self.gfl_head_cls = self.add_sublayer(
conv_cls_name,
nn.Conv2D(
in_channels=self.feat_in_chan,
out_channels=self.cls_out_channels,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(
initializer=Constant(value=bias_init_value))))
conv_reg_name = "gfl_head_reg"
self.gfl_head_reg = self.add_sublayer(
conv_reg_name,
nn.Conv2D(
in_channels=self.feat_in_chan,
out_channels=4 * (self.reg_max + 1),
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=0))))
self.scales_regs = []
for i in range(len(self.fpn_stride)):
lvl = int(math.log(int(self.fpn_stride[i]), 2))
feat_name = 'p{}_feat'.format(lvl)
scale_reg = self.add_sublayer(feat_name, ScaleReg())
self.scales_regs.append(scale_reg)
self.distribution_project = Integral(self.reg_max)
def forward(self, fpn_feats):
assert len(fpn_feats) == len(
self.fpn_stride
), "The size of fpn_feats is not equal to size of fpn_stride"
cls_logits_list = []
bboxes_reg_list = []
for stride, scale_reg, fpn_feat in zip(self.fpn_stride,
self.scales_regs, fpn_feats):
conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat)
cls_score = self.gfl_head_cls(conv_cls_feat)
bbox_pred = scale_reg(self.gfl_head_reg(conv_reg_feat))
if self.dgqp_module:
quality_score = self.dgqp_module(bbox_pred)
cls_score = F.sigmoid(cls_score) * quality_score
if not self.training:
cls_score = F.sigmoid(cls_score.transpose([0, 2, 3, 1]))
bbox_pred = bbox_pred.transpose([0, 2, 3, 1])
b, cell_h, cell_w, _ = cls_score.shape
y, x = self.get_single_level_center_point(
[cell_h, cell_w], stride, cell_offset=self.cell_offset)
center_points = paddle.stack([x, y], axis=-1)
cls_score = cls_score.reshape([b, -1, self.cls_out_channels])
bbox_pred = self.distribution_project(bbox_pred) * stride
bbox_pred = bbox_pred.reshape([-1, cell_h * cell_w, 4])
# NOTE: when keep_ratio=False and the input image shape is a
# multiple of 32, distance2bbox is called without max_shapes to
# speed up prediction. If clipping to the image is needed, pass
# inputs['im_shape'] as max_shapes.
bbox_pred = batch_distance2bbox(
center_points, bbox_pred, max_shapes=None)
cls_logits_list.append(cls_score)
bboxes_reg_list.append(bbox_pred)
return (cls_logits_list, bboxes_reg_list)
def _images_to_levels(self, target, num_level_anchors):
"""
Convert targets by image to targets by feature level.
"""
level_targets = []
start = 0
for n in num_level_anchors:
end = start + n
level_targets.append(target[:, start:end].squeeze(0))
start = end
return level_targets
def _grid_cells_to_center(self, grid_cells):
"""
Get center location of each grid cell
Args:
grid_cells: grid cells of a feature map
Returns:
center points
"""
cells_cx = (grid_cells[:, 2] + grid_cells[:, 0]) / 2
cells_cy = (grid_cells[:, 3] + grid_cells[:, 1]) / 2
return paddle.stack([cells_cx, cells_cy], axis=-1)
def get_loss(self, gfl_head_outs, gt_meta):
cls_logits, bboxes_reg = gfl_head_outs
num_level_anchors = [
featmap.shape[-2] * featmap.shape[-1] for featmap in cls_logits
]
grid_cells_list = self._images_to_levels(gt_meta['grid_cells'],
num_level_anchors)
labels_list = self._images_to_levels(gt_meta['labels'],
num_level_anchors)
label_weights_list = self._images_to_levels(gt_meta['label_weights'],
num_level_anchors)
bbox_targets_list = self._images_to_levels(gt_meta['bbox_targets'],
num_level_anchors)
num_total_pos = sum(gt_meta['pos_num'])
try:
paddle.distributed.all_reduce(num_total_pos)
num_total_pos = paddle.clip(
num_total_pos / paddle.distributed.get_world_size(), min=1)
except:
num_total_pos = max(num_total_pos, 1)
loss_bbox_list, loss_dfl_list, loss_qfl_list, avg_factor = [], [], [], []
for cls_score, bbox_pred, grid_cells, labels, label_weights, bbox_targets, stride in zip(
cls_logits, bboxes_reg, grid_cells_list, labels_list,
label_weights_list, bbox_targets_list, self.fpn_stride):
grid_cells = grid_cells.reshape([-1, 4])
cls_score = cls_score.transpose([0, 2, 3, 1]).reshape(
[-1, self.cls_out_channels])
bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape(
[-1, 4 * (self.reg_max + 1)])
bbox_targets = bbox_targets.reshape([-1, 4])
labels = labels.reshape([-1])
label_weights = label_weights.reshape([-1])
bg_class_ind = self.num_classes
pos_inds = paddle.nonzero(
paddle.logical_and((labels >= 0), (labels < bg_class_ind)),
as_tuple=False).squeeze(1)
score = np.zeros(labels.shape)
if len(pos_inds) > 0:
pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0)
pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0)
pos_grid_cells = paddle.gather(grid_cells, pos_inds, axis=0)
pos_grid_cell_centers = self._grid_cells_to_center(
pos_grid_cells) / stride
weight_targets = F.sigmoid(cls_score.detach())
weight_targets = paddle.gather(
weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0)
pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred)
pos_decode_bbox_pred = distance2bbox(pos_grid_cell_centers,
pos_bbox_pred_corners)
pos_decode_bbox_targets = pos_bbox_targets / stride
bbox_iou = bbox_overlaps(
pos_decode_bbox_pred.detach().numpy(),
pos_decode_bbox_targets.detach().numpy(),
is_aligned=True)
score[pos_inds.numpy()] = bbox_iou
pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1])
target_corners = bbox2distance(pos_grid_cell_centers,
pos_decode_bbox_targets,
self.reg_max).reshape([-1])
# regression loss
loss_bbox = paddle.sum(
self.loss_bbox(pos_decode_bbox_pred,
pos_decode_bbox_targets) * weight_targets)
# dfl loss
loss_dfl = self.loss_dfl(
pred_corners,
target_corners,
weight=weight_targets.expand([-1, 4]).reshape([-1]),
avg_factor=4.0)
else:
loss_bbox = bbox_pred.sum() * 0
loss_dfl = bbox_pred.sum() * 0
weight_targets = paddle.to_tensor([0], dtype='float32')
# qfl loss
score = paddle.to_tensor(score)
loss_qfl = self.loss_qfl(
cls_score, (labels, score),
weight=label_weights,
avg_factor=num_total_pos)
loss_bbox_list.append(loss_bbox)
loss_dfl_list.append(loss_dfl)
loss_qfl_list.append(loss_qfl)
avg_factor.append(weight_targets.sum())
avg_factor = sum(avg_factor)
try:
paddle.distributed.all_reduce(avg_factor)
avg_factor = paddle.clip(
avg_factor / paddle.distributed.get_world_size(), min=1)
except:
avg_factor = max(avg_factor.item(), 1)
if avg_factor <= 0:
loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)
loss_bbox = paddle.to_tensor(
0, dtype='float32', stop_gradient=False)
loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)
else:
losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list))
losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list))
loss_qfl = sum(loss_qfl_list)
loss_bbox = sum(losses_bbox)
loss_dfl = sum(losses_dfl)
loss_states = dict(
loss_qfl=loss_qfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl)
return loss_states
def get_single_level_center_point(self, featmap_size, stride,
cell_offset=0):
"""
Generate pixel centers of a single stage feature map.
Args:
featmap_size: height and width of the feature map
stride: downsample stride of the feature map
cell_offset: relative offset of the center point within each cell
Returns:
y and x of the center points
"""
h, w = featmap_size
x_range = (paddle.arange(w, dtype='float32') + cell_offset) * stride
y_range = (paddle.arange(h, dtype='float32') + cell_offset) * stride
y, x = paddle.meshgrid(y_range, x_range)
y = y.flatten()
x = x.flatten()
return y, x
def post_process(self, gfl_head_outs, im_shape, scale_factor):
cls_scores, bboxes_reg = gfl_head_outs
bboxes = paddle.concat(bboxes_reg, axis=1)
# rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale]
im_scale = scale_factor.flip([1]).tile([1, 2]).unsqueeze(1)
bboxes /= im_scale
mlvl_scores = paddle.concat(cls_scores, axis=1)
mlvl_scores = mlvl_scores.transpose([0, 2, 1])
bbox_pred, bbox_num, _ = self.nms(bboxes, mlvl_scores)
return bbox_pred, bbox_num
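# A worked example for get_single_level_center_point (values assumed): with
# featmap_size=[2, 2], stride=8 and cell_offset=0 it returns
#   y = [0., 0., 8., 8.], x = [0., 8., 0., 8.]
# i.e. the top-left corners of the grid cells; cell_offset=0.5 would shift
# these to the cell centres.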
@register
class LDGFLHead(GFLHead):
"""
GFLHead for LD distill
Args:
conv_feat (object): Instance of 'FCOSFeat'
num_classes (int): Number of classes
fpn_stride (list): The stride of each FPN Layer
prior_prob (float): Used to set the bias init for the class prediction layer
loss_class (object): Instance of QualityFocalLoss.
loss_dfl (object): Instance of DistributionFocalLoss.
loss_bbox (object): Instance of bbox loss.
reg_max (int): Max value of the integral set :math:`{0, ..., reg_max}`
in QFL setting. Default: 16.
"""
__inject__ = [
'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox',
'loss_ld', 'loss_ld_vlr', 'loss_kd', 'nms'
]
__shared__ = ['num_classes']
def __init__(self,
conv_feat='FCOSFeat',
dgqp_module=None,
num_classes=80,
fpn_stride=[8, 16, 32, 64, 128],
prior_prob=0.01,
loss_class='QualityFocalLoss',
loss_dfl='DistributionFocalLoss',
loss_bbox='GIoULoss',
loss_ld='KnowledgeDistillationKLDivLoss',
loss_ld_vlr='KnowledgeDistillationKLDivLoss',
loss_kd='KnowledgeDistillationKLDivLoss',
reg_max=16,
feat_in_chan=256,
nms=None,
nms_pre=1000,
cell_offset=0):
super(LDGFLHead, self).__init__(
conv_feat=conv_feat,
dgqp_module=dgqp_module,
num_classes=num_classes,
fpn_stride=fpn_stride,
prior_prob=prior_prob,
loss_class=loss_class,
loss_dfl=loss_dfl,
loss_bbox=loss_bbox,
reg_max=reg_max,
feat_in_chan=feat_in_chan,
nms=nms,
nms_pre=nms_pre,
cell_offset=cell_offset)
self.loss_ld = loss_ld
self.loss_kd = loss_kd
self.loss_ld_vlr = loss_ld_vlr
def forward(self, fpn_feats):
assert len(fpn_feats) == len(
self.fpn_stride
), "The size of fpn_feats is not equal to size of fpn_stride"
cls_logits_list = []
bboxes_reg_list = []
for stride, scale_reg, fpn_feat in zip(self.fpn_stride,
self.scales_regs, fpn_feats):
conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat)
cls_score = self.gfl_head_cls(conv_cls_feat)
bbox_pred = scale_reg(self.gfl_head_reg(conv_reg_feat))
if self.dgqp_module:
quality_score = self.dgqp_module(bbox_pred)
cls_score = F.sigmoid(cls_score) * quality_score
if not self.training:
cls_score = F.sigmoid(cls_score.transpose([0, 2, 3, 1]))
bbox_pred = bbox_pred.transpose([0, 2, 3, 1])
b, cell_h, cell_w, _ = cls_score.shape
y, x = self.get_single_level_center_point(
[cell_h, cell_w], stride, cell_offset=self.cell_offset)
center_points = paddle.stack([x, y], axis=-1)
cls_score = cls_score.reshape([b, -1, self.cls_out_channels])
bbox_pred = self.distribution_project(bbox_pred) * stride
bbox_pred = bbox_pred.reshape([b, cell_h * cell_w, 4])
# NOTE: when keep_ratio=False and the input image shape is a
# multiple of 32, distance2bbox is called without max_shapes to
# speed up prediction. If clipping to the image is needed, pass
# inputs['im_shape'] as max_shapes.
bbox_pred = batch_distance2bbox(
center_points, bbox_pred, max_shapes=None)
cls_logits_list.append(cls_score)
bboxes_reg_list.append(bbox_pred)
return (cls_logits_list, bboxes_reg_list)
def get_loss(self, gfl_head_outs, gt_meta, soft_label_list,
soft_targets_list):
cls_logits, bboxes_reg = gfl_head_outs
num_level_anchors = [
featmap.shape[-2] * featmap.shape[-1] for featmap in cls_logits
]
grid_cells_list = self._images_to_levels(gt_meta['grid_cells'],
num_level_anchors)
labels_list = self._images_to_levels(gt_meta['labels'],
num_level_anchors)
label_weights_list = self._images_to_levels(gt_meta['label_weights'],
num_level_anchors)
bbox_targets_list = self._images_to_levels(gt_meta['bbox_targets'],
num_level_anchors)
# vlr regions
vlr_regions_list = self._images_to_levels(gt_meta['vlr_regions'],
num_level_anchors)
num_total_pos = sum(gt_meta['pos_num'])
try:
paddle.distributed.all_reduce(num_total_pos)
num_total_pos = paddle.clip(
num_total_pos / paddle.distributed.get_world_size(), min=1.)
except:
num_total_pos = max(num_total_pos, 1)
loss_bbox_list, loss_dfl_list, loss_qfl_list, loss_ld_list, avg_factor = [], [], [], [], []
loss_ld_vlr_list, loss_kd_list = [], []
for cls_score, bbox_pred, grid_cells, labels, label_weights, bbox_targets, stride, soft_targets,\
soft_label, vlr_region in zip(
cls_logits, bboxes_reg, grid_cells_list, labels_list,
label_weights_list, bbox_targets_list, self.fpn_stride, soft_targets_list,
soft_label_list, vlr_regions_list):
grid_cells = grid_cells.reshape([-1, 4])
cls_score = cls_score.transpose([0, 2, 3, 1]).reshape(
[-1, self.cls_out_channels])
bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape(
[-1, 4 * (self.reg_max + 1)])
soft_targets = soft_targets.transpose([0, 2, 3, 1]).reshape(
[-1, 4 * (self.reg_max + 1)])
soft_label = soft_label.transpose([0, 2, 3, 1]).reshape(
[-1, self.cls_out_channels])
# feature imitation (kept for reference, currently disabled)
# teacher_x = teacher_x.transpose([0, 2, 3, 1]).reshape([-1, 256])
# x = x.transpose([0, 2, 3, 1]).reshape([-1, 256])
bbox_targets = bbox_targets.reshape([-1, 4])
labels = labels.reshape([-1])
label_weights = label_weights.reshape([-1])
vlr_region = vlr_region.reshape([-1])
bg_class_ind = self.num_classes
pos_inds = paddle.nonzero(
paddle.logical_and((labels >= 0), (labels < bg_class_ind)),
as_tuple=False).squeeze(1)
score = np.zeros(labels.shape)
remain_inds = (vlr_region > 0).nonzero()
if len(pos_inds) > 0:
pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0)
pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0)
pos_grid_cells = paddle.gather(grid_cells, pos_inds, axis=0)
pos_grid_cell_centers = self._grid_cells_to_center(
pos_grid_cells) / stride
weight_targets = F.sigmoid(cls_score.detach())
weight_targets = paddle.gather(
weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0)
pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred)
pos_decode_bbox_pred = distance2bbox(pos_grid_cell_centers,
pos_bbox_pred_corners)
pos_decode_bbox_targets = pos_bbox_targets / stride
bbox_iou = bbox_overlaps(
pos_decode_bbox_pred.detach().numpy(),
pos_decode_bbox_targets.detach().numpy(),
is_aligned=True)
score[pos_inds.numpy()] = bbox_iou
pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1])
pos_soft_targets = paddle.gather(soft_targets, pos_inds, axis=0)
soft_corners = pos_soft_targets.reshape([-1, self.reg_max + 1])
target_corners = bbox2distance(pos_grid_cell_centers,
pos_decode_bbox_targets,
self.reg_max).reshape([-1])
# regression loss
loss_bbox = paddle.sum(
self.loss_bbox(pos_decode_bbox_pred,
pos_decode_bbox_targets) * weight_targets)
# dfl loss
loss_dfl = self.loss_dfl(
pred_corners,
target_corners,
weight=weight_targets.expand([-1, 4]).reshape([-1]),
avg_factor=4.0)
# ld loss
loss_ld = self.loss_ld(
pred_corners,
soft_corners,
weight=weight_targets.expand([-1, 4]).reshape([-1]),
avg_factor=4.0)
loss_kd = self.loss_kd(
paddle.gather(
cls_score, pos_inds, axis=0),
paddle.gather(
soft_label, pos_inds, axis=0),
weight=paddle.gather(
label_weights, pos_inds, axis=0),
avg_factor=pos_inds.shape[0])
else:
loss_bbox = bbox_pred.sum() * 0
loss_dfl = bbox_pred.sum() * 0
loss_ld = bbox_pred.sum() * 0
loss_kd = bbox_pred.sum() * 0
weight_targets = paddle.to_tensor([0], dtype='float32')
if len(remain_inds) > 0:
neg_pred_corners = bbox_pred[remain_inds].reshape(
[-1, self.reg_max + 1])
neg_soft_corners = soft_targets[remain_inds].reshape(
[-1, self.reg_max + 1])
remain_targets = vlr_region[remain_inds]
loss_ld_vlr = self.loss_ld_vlr(
neg_pred_corners,
neg_soft_corners,
weight=remain_targets.expand([-1, 4]).reshape([-1]),
avg_factor=16.0)
else:
loss_ld_vlr = bbox_pred.sum() * 0
# qfl loss
score = paddle.to_tensor(score)
loss_qfl = self.loss_qfl(
cls_score, (labels, score),
weight=label_weights,
avg_factor=num_total_pos)
loss_bbox_list.append(loss_bbox)
loss_dfl_list.append(loss_dfl)
loss_qfl_list.append(loss_qfl)
loss_ld_list.append(loss_ld)
loss_ld_vlr_list.append(loss_ld_vlr)
loss_kd_list.append(loss_kd)
avg_factor.append(weight_targets.sum())
avg_factor = sum(avg_factor) # + 1e-6
try:
paddle.distributed.all_reduce(avg_factor)
avg_factor = paddle.clip(
avg_factor / paddle.distributed.get_world_size(), min=1)
except:
avg_factor = max(avg_factor.item(), 1)
if avg_factor <= 0:
loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)
loss_bbox = paddle.to_tensor(
0, dtype='float32', stop_gradient=False)
loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)
loss_ld = paddle.to_tensor(0, dtype='float32', stop_gradient=False)
loss_ld_vlr = paddle.to_tensor(
0, dtype='float32', stop_gradient=False)
loss_kd = paddle.to_tensor(0, dtype='float32', stop_gradient=False)
else:
losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list))
losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list))
loss_qfl = sum(loss_qfl_list)
loss_bbox = sum(losses_bbox)
loss_dfl = sum(losses_dfl)
loss_ld = sum(loss_ld_list)
loss_ld_vlr = sum(loss_ld_vlr_list)
loss_kd = sum(loss_kd_list)
loss_states = dict(
loss_qfl=loss_qfl,
loss_bbox=loss_bbox,
loss_dfl=loss_dfl,
loss_ld=loss_ld,
loss_ld_vlr=loss_ld_vlr,
loss_kd=loss_kd)
return loss_states
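# An illustrative smoke test for the Integral projection above (assumed
# shapes; uniform logits must decode to the distribution mean reg_max / 2):
if __name__ == '__main__':
proj = Integral(reg_max=16)
proj.eval()
out = proj(paddle.zeros([1, 4 * 17]))  # uniform distribution over 17 bins
assert paddle.allclose(out, paddle.full([4], 8.0)).item()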
================================================
FILE: ppdet/modeling/heads/keypoint_hrhrnet_head.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
from ppdet.core.workspace import register
from .. import layers as L
from ..backbones.hrnet import BasicBlock
@register
class HrHRNetHead(nn.Layer):
__inject__ = ['loss']
def __init__(self, num_joints, loss='HrHRNetLoss', swahr=False, width=32):
"""
Head for HigherHRNet network
Args:
num_joints (int): number of keypoints
loss (object): instance of 'HrHRNetLoss'
swahr (bool): whether to use SWAHR (scale-adaptive heatmap regression)
width (int): hrnet channel width
"""
super(HrHRNetHead, self).__init__()
self.loss = loss
self.num_joints = num_joints
num_featout1 = num_joints * 2
num_featout2 = num_joints
self.swahr = swahr
self.conv1 = L.Conv2d(width, num_featout1, 1, 1, 0, bias=True)
self.conv2 = L.Conv2d(width, num_featout2, 1, 1, 0, bias=True)
self.deconv = nn.Sequential(
L.ConvTranspose2d(
num_featout1 + width, width, 4, 2, 1, 0, bias=False),
L.BatchNorm2d(width),
L.ReLU())
self.blocks = nn.Sequential(*(BasicBlock(
num_channels=width,
num_filters=width,
has_se=False,
freeze_norm=False,
name='HrHRNetHead_{}'.format(i)) for i in range(4)))
self.interpolate = L.Upsample(2, mode='bilinear')
self.concat = L.Concat(dim=1)
if swahr:
self.scalelayer0 = nn.Sequential(
L.Conv2d(
width, num_joints, 1, 1, 0, bias=True),
L.BatchNorm2d(num_joints),
L.ReLU(),
L.Conv2d(
num_joints,
num_joints,
9,
1,
4,
groups=num_joints,
bias=True))
self.scalelayer1 = nn.Sequential(
L.Conv2d(
width, num_joints, 1, 1, 0, bias=True),
L.BatchNorm2d(num_joints),
L.ReLU(),
L.Conv2d(
num_joints,
num_joints,
9,
1,
4,
groups=num_joints,
bias=True))
def forward(self, feats, targets=None):
x1 = feats[0]
xo1 = self.conv1(x1)
x2 = self.blocks(self.deconv(self.concat((x1, xo1))))
xo2 = self.conv2(x2)
num_joints = self.num_joints
if self.training:
heatmap1, tagmap = paddle.split(xo1, 2, axis=1)
if self.swahr:
so1 = self.scalelayer0(x1)
so2 = self.scalelayer1(x2)
hrhrnet_outputs = ([heatmap1, so1], [xo2, so2], tagmap)
return self.loss(hrhrnet_outputs, targets)
else:
hrhrnet_outputs = (heatmap1, xo2, tagmap)
return self.loss(hrhrnet_outputs, targets)
# averaged heatmap, upsampled tagmap
upsampled = self.interpolate(xo1)
avg = (upsampled[:, :num_joints] + xo2[:, :num_joints]) / 2
return avg, upsampled[:, num_joints:]
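# Inference-time output sketch (shapes assumed, not from the original file):
# with width=32 and num_joints=17, feats[0] of shape [N, 32, H, W] yields
#   avg:    [N, 17, 2H, 2W]  (mean of the upsampled coarse heatmap and xo2)
#   tagmap: [N, 17, 2H, 2W]  (associative-embedding tags from conv1)
# which the HigherHRNet post-processing groups into per-person keypoints.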
================================================
FILE: ppdet/modeling/heads/mask_head.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import KaimingNormal
from ppdet.core.workspace import register, create
from ppdet.modeling.layers import ConvNormLayer
from .roi_extractor import RoIAlign
from ..cls_utils import _get_class_default_kwargs
@register
class MaskFeat(nn.Layer):
"""
Feature extraction in Mask head
Args:
in_channel (int): Input channels
out_channel (int): Output channels
num_convs (int): The number of conv layers, default 4
norm_type (string | None): Norm type, bn, gn, sync_bn are available,
default None
"""
def __init__(self,
in_channel=256,
out_channel=256,
num_convs=4,
norm_type=None):
super(MaskFeat, self).__init__()
self.num_convs = num_convs
self.in_channel = in_channel
self.out_channel = out_channel
self.norm_type = norm_type
fan_conv = out_channel * 3 * 3
fan_deconv = out_channel * 2 * 2
mask_conv = nn.Sequential()
if norm_type == 'gn':
for i in range(self.num_convs):
conv_name = 'mask_inter_feat_{}'.format(i + 1)
mask_conv.add_sublayer(
conv_name,
ConvNormLayer(
ch_in=in_channel if i == 0 else out_channel,
ch_out=out_channel,
filter_size=3,
stride=1,
norm_type=self.norm_type,
initializer=KaimingNormal(fan_in=fan_conv),
skip_quant=True))
mask_conv.add_sublayer(conv_name + 'act', nn.ReLU())
else:
for i in range(self.num_convs):
conv_name = 'mask_inter_feat_{}'.format(i + 1)
conv = nn.Conv2D(
in_channels=in_channel if i == 0 else out_channel,
out_channels=out_channel,
kernel_size=3,
padding=1,
weight_attr=paddle.ParamAttr(
initializer=KaimingNormal(fan_in=fan_conv)))
conv.skip_quant = True
mask_conv.add_sublayer(conv_name, conv)
mask_conv.add_sublayer(conv_name + 'act', nn.ReLU())
mask_conv.add_sublayer(
'conv5_mask',
nn.Conv2DTranspose(
in_channels=self.out_channel if num_convs > 0 else self.in_channel,
out_channels=self.out_channel,
kernel_size=2,
stride=2,
weight_attr=paddle.ParamAttr(
initializer=KaimingNormal(fan_in=fan_deconv))))
mask_conv.add_sublayer('conv5_mask' + 'act', nn.ReLU())
self.upsample = mask_conv
@classmethod
def from_config(cls, cfg, input_shape):
if isinstance(input_shape, (list, tuple)):
input_shape = input_shape[0]
return {'in_channel': input_shape.channels, }
def out_channels(self):
return self.out_channel
def forward(self, feats):
return self.upsample(feats)
@register
class MaskHead(nn.Layer):
__shared__ = ['num_classes', 'export_onnx']
__inject__ = ['mask_assigner']
"""
RCNN mask head
Args:
head (nn.Layer): Extract feature in mask head
roi_extractor (object): The module of RoI Extractor
mask_assigner (object): The module of Mask Assigner,
label and sample the mask
num_classes (int): The number of classes
share_bbox_feat (bool): Whether to share the feature from bbox head,
default false
"""
def __init__(self,
head,
roi_extractor=_get_class_default_kwargs(RoIAlign),
mask_assigner='MaskAssigner',
num_classes=80,
share_bbox_feat=False,
export_onnx=False):
super(MaskHead, self).__init__()
self.num_classes = num_classes
self.export_onnx = export_onnx
self.roi_extractor = roi_extractor
if isinstance(roi_extractor, dict):
self.roi_extractor = RoIAlign(**roi_extractor)
self.head = head
self.in_channels = head.out_channels()
self.mask_assigner = mask_assigner
self.share_bbox_feat = share_bbox_feat
self.bbox_head = None
self.mask_fcn_logits = nn.Conv2D(
in_channels=self.in_channels,
out_channels=self.num_classes,
kernel_size=1,
weight_attr=paddle.ParamAttr(initializer=KaimingNormal(
fan_in=self.num_classes)))
self.mask_fcn_logits.skip_quant = True
@classmethod
def from_config(cls, cfg, input_shape):
roi_pooler = cfg['roi_extractor']
assert isinstance(roi_pooler, dict)
kwargs = RoIAlign.from_config(cfg, input_shape)
roi_pooler.update(kwargs)
kwargs = {'input_shape': input_shape}
head = create(cfg['head'], **kwargs)
return {
'roi_extractor': roi_pooler,
'head': head,
}
def get_loss(self, mask_logits, mask_label, mask_target, mask_weight):
mask_label = F.one_hot(mask_label, self.num_classes).unsqueeze([2, 3])
mask_label = paddle.expand_as(mask_label, mask_logits)
mask_label.stop_gradient = True
mask_pred = paddle.gather_nd(mask_logits, paddle.nonzero(mask_label))
shape = mask_logits.shape
mask_pred = paddle.reshape(mask_pred, [shape[0], shape[2], shape[3]])
mask_target = mask_target.cast('float32')
mask_weight = mask_weight.unsqueeze([1, 2])
loss_mask = F.binary_cross_entropy_with_logits(
mask_pred, mask_target, weight=mask_weight, reduction="mean")
return loss_mask
def forward_train(self, body_feats, rois, rois_num, inputs, targets,
bbox_feat):
"""
body_feats (list[Tensor]): Multi-level backbone features
rois (list[Tensor]): Proposals for each batch with shape [N, 4]
rois_num (Tensor): The number of proposals for each batch
inputs (dict): Ground truth info
targets (tuple): Sampled labels and gt indices from the bbox head
bbox_feat (Tensor): RoI features from the bbox head; used when share_bbox_feat is True
"""
tgt_labels, _, tgt_gt_inds = targets
rois, rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights = self.mask_assigner(
rois, tgt_labels, tgt_gt_inds, inputs)
if self.share_bbox_feat:
rois_feat = paddle.gather(bbox_feat, mask_index)
else:
rois_feat = self.roi_extractor(body_feats, rois, rois_num)
mask_feat = self.head(rois_feat)
mask_logits = self.mask_fcn_logits(mask_feat)
loss_mask = self.get_loss(mask_logits, tgt_classes, tgt_masks,
tgt_weights)
return {'loss_mask': loss_mask}
def forward_test(self,
body_feats,
rois,
rois_num,
scale_factor,
feat_func=None):
"""
body_feats (list[Tensor]): Multi-level backbone features
rois (Tensor): Prediction from bbox head with shape [N, 6]
rois_num (Tensor): The number of prediction for each batch
scale_factor (Tensor): The scale factor from origin size to input size
"""
if not self.export_onnx and rois.shape[0] == 0:
mask_out = paddle.full([1, 1, 1], -1)
else:
bbox = [rois[:, 2:]]
labels = rois[:, 0].cast('int32')
rois_feat = self.roi_extractor(body_feats, bbox, rois_num)
if self.share_bbox_feat:
assert feat_func is not None
rois_feat = feat_func(rois_feat)
mask_feat = self.head(rois_feat)
mask_logit = self.mask_fcn_logits(mask_feat)
if self.num_classes == 1:
mask_out = F.sigmoid(mask_logit)[:, 0, :, :]
else:
num_masks = mask_logit.shape[0]
index = paddle.arange(num_masks).cast('int32')
mask_out = mask_logit[index, labels]
mask_out_shape = mask_out.shape
mask_out = paddle.reshape(mask_out,
index.shape + [mask_out_shape[-2]] + [mask_out_shape[-1]])
mask_out = F.sigmoid(mask_out)
return mask_out
def forward(self,
body_feats,
rois,
rois_num,
inputs,
targets=None,
bbox_feat=None,
feat_func=None):
if self.training:
return self.forward_train(body_feats, rois, rois_num, inputs,
targets, bbox_feat)
else:
im_scale = inputs['scale_factor']
return self.forward_test(body_feats, rois, rois_num, im_scale,
feat_func)
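# An illustrative sketch of the class-specific logit selection used in
# get_loss above (the sample shapes and labels are assumptions):
if __name__ == '__main__':
logits = paddle.rand([2, 3, 4, 4])  # 2 RoIs, 3 classes
labels = paddle.to_tensor([2, 0], dtype='int64')
onehot = paddle.expand_as(F.one_hot(labels, 3).unsqueeze([2, 3]), logits)
picked = paddle.gather_nd(logits, paddle.nonzero(onehot)).reshape([2, 4, 4])
assert paddle.allclose(picked[0], logits[0, 2]).item()  # one mask per RoI,
assert paddle.allclose(picked[1], logits[1, 0]).item()  # at its own class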
================================================
FILE: ppdet/modeling/heads/petr_head.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on https://github.com/hikvision-research/opera/blob/main/opera/models/dense_heads/petr_head.py
"""
import copy
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
import paddle.distributed as dist
from ..transformers.petr_transformer import inverse_sigmoid, masked_fill
from ..initializer import constant_, normal_
__all__ = ["PETRHead"]
from functools import partial
def bias_init_with_prob(prior_prob: float) -> float:
"""initialize conv/fc bias value according to a given probability value."""
bias_init = float(-np.log((1 - prior_prob) / prior_prob))
return bias_init
def multi_apply(func, *args, **kwargs):
"""Apply function to a list of arguments.
Note:
This function applies ``func`` to multiple inputs and maps the
multiple outputs of ``func`` into separate lists. Each list
contains the same kind of output for every input.
Args:
func (Function): A function that will be applied to a list of
arguments
Returns:
tuple(list): A tuple of lists, each list containing one kind of
result returned by the function
"""
pfunc = partial(func, **kwargs) if kwargs else func
map_results = map(pfunc, *args)
res = tuple(map(list, zip(*map_results)))
return res
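# Illustrative usage (hypothetical inputs, not part of the original module):
# multi_apply fans a per-image function out over argument lists and regroups
# each kind of output into its own list.
def _demo_multi_apply():
    def add_and_mul(a, b):
        return a + b, a * b
    sums, products = multi_apply(add_and_mul, [1, 2, 3], [4, 5, 6])
    return sums, products  # [5, 7, 9], [4, 10, 18]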
def reduce_mean(tensor):
""""Obtain the mean of tensor on different GPUs."""
if not (dist.get_world_size() and dist.is_initialized()):
return tensor
tensor = tensor.clone()
dist.all_reduce(
tensor.divide(
paddle.to_tensor(
dist.get_world_size(), dtype='float32')),
op=dist.ReduceOp.SUM)
return tensor
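# Usage sketch (assumes the default process group is initialized when running
# multi-GPU; on a single process the tensor is returned unchanged):
def _demo_reduce_mean():
    local_num_pos = paddle.to_tensor([42.0])
    return reduce_mean(local_num_pos)  # cross-rank mean of local values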
def gaussian_radius(det_size, min_overlap=0.7):
"""calculate gaussian radius according to object size.
"""
height, width = det_size
a1 = 1
b1 = (height + width)
c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
sq1 = paddle.sqrt(b1**2 - 4 * a1 * c1)
r1 = (b1 + sq1) / 2
a2 = 4
b2 = 2 * (height + width)
c2 = (1 - min_overlap) * width * height
sq2 = paddle.sqrt(b2**2 - 4 * a2 * c2)
r2 = (b2 + sq2) / 2
a3 = 4 * min_overlap
b3 = -2 * min_overlap * (height + width)
c3 = (min_overlap - 1) * width * height
sq3 = paddle.sqrt(b3**2 - 4 * a3 * c3)
r3 = (b3 + sq3) / 2
return min(r1, r2, r3)
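# Illustrative call (hypothetical box size): the returned radius is the
# tightest of the three quadratic overlap cases; callers below clip it
# further to [0, 3].
def _demo_gaussian_radius():
    h = paddle.to_tensor(32.0)
    w = paddle.to_tensor(48.0)
    return gaussian_radius((h, w), min_overlap=0.7)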
def gaussian2D(shape, sigma=1):
m, n = [(ss - 1.) / 2. for ss in shape]
y = paddle.arange(-m, m + 1, dtype="float32")[:, None]
x = paddle.arange(-n, n + 1, dtype="float32")[None, :]
# y, x = np.ogrid[-m:m + 1, -n:n + 1]
h = paddle.exp(-(x * x + y * y) / (2 * sigma * sigma))
h[h < np.finfo(np.float32).eps * h.max()] = 0
return h
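# Illustrative call: builds a (2r + 1) x (2r + 1) kernel that peaks at 1.0 in
# the center, with near-zero tails cut to exactly zero.
def _demo_gaussian2d(radius=3):
    diameter = 2 * radius + 1
    return gaussian2D((diameter, diameter), sigma=diameter / 6)  # [7, 7]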
def draw_umich_gaussian(heatmap, center, radius, k=1):
diameter = 2 * radius + 1
gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6)
gaussian = paddle.to_tensor(gaussian, dtype=heatmap.dtype)
x, y = int(center[0]), int(center[1])
radius = int(radius)
height, width = heatmap.shape[0:2]
left, right = min(x, radius), min(width - x, radius + 1)
top, bottom = min(y, radius), min(height - y, radius + 1)
masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:
radius + right]
# assert masked_gaussian.equal(1).float().sum() == 1
if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0:
heatmap[y - top:y + bottom, x - left:x + right] = paddle.maximum(
masked_heatmap, masked_gaussian * k)
return heatmap
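# Usage sketch (hypothetical heatmap size): splat a single keypoint as a
# clipped Gaussian; overlapping splats keep the element-wise maximum.
def _demo_draw_gaussian():
    heatmap = paddle.zeros([64, 64], dtype="float32")
    heatmap = draw_umich_gaussian(heatmap, center=(20, 30), radius=2)
    return heatmap  # heatmap[30, 20] == 1.0 at the keypoint center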
@register
class PETRHead(nn.Layer):
"""Head of `End-to-End Multi-Person Pose Estimation with Transformers`.
Args:
num_classes (int): Number of categories excluding the background.
in_channels (int): Number of channels in the input feature map.
num_query (int): Number of query in Transformer.
num_kpt_fcs (int, optional): Number of fully-connected layers used in
`FFN`, which is then used for the keypoint regression head.
Default 2.
transformer (obj:`mmcv.ConfigDict`|dict): ConfigDict is used for
building the Encoder and Decoder. Default: None.
sync_cls_avg_factor (bool): Whether to sync the avg_factor of
all ranks. Default to False.
positional_encoding (obj:`mmcv.ConfigDict`|dict):
Config for position encoding.
loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the
classification loss. Default `CrossEntropyLoss`.
loss_kpt (obj:`mmcv.ConfigDict`|dict): Config of the
regression loss. Default `L1Loss`.
loss_oks (obj:`mmcv.ConfigDict`|dict): Config of the
regression oks loss. Default `OKSLoss`.
loss_hm (obj:`mmcv.ConfigDict`|dict): Config of the
regression heatmap loss. Default `NegLoss`.
        as_two_stage (bool): Whether to generate the proposal from
the outputs of encoder.
with_kpt_refine (bool): Whether to refine the reference points
in the decoder. Defaults to True.
test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of
transformer head.
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None.
"""
__inject__ = [
"transformer", "positional_encoding", "assigner", "sampler", "loss_cls",
"loss_kpt", "loss_oks", "loss_hm", "loss_kpt_rpn", "loss_kpt_refine",
"loss_oks_refine"
]
def __init__(self,
num_classes,
in_channels,
num_query=100,
num_kpt_fcs=2,
num_keypoints=17,
transformer=None,
sync_cls_avg_factor=True,
positional_encoding='SinePositionalEncoding',
loss_cls='FocalLoss',
loss_kpt='L1Loss',
loss_oks='OKSLoss',
loss_hm='CenterFocalLoss',
with_kpt_refine=True,
assigner='PoseHungarianAssigner',
sampler='PseudoSampler',
loss_kpt_rpn='L1Loss',
loss_kpt_refine='L1Loss',
loss_oks_refine='opera.OKSLoss',
test_cfg=dict(max_per_img=100),
init_cfg=None,
**kwargs):
# NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
# since it brings inconvenience when the initialization of
# `AnchorFreeHead` is called.
super().__init__()
self.bg_cls_weight = 0
self.sync_cls_avg_factor = sync_cls_avg_factor
self.assigner = assigner
self.sampler = sampler
self.num_query = num_query
self.num_classes = num_classes
self.in_channels = in_channels
self.num_kpt_fcs = num_kpt_fcs
self.test_cfg = test_cfg
self.fp16_enabled = False
self.as_two_stage = transformer.as_two_stage
self.with_kpt_refine = with_kpt_refine
self.num_keypoints = num_keypoints
self.loss_cls = loss_cls
self.loss_kpt = loss_kpt
self.loss_kpt_rpn = loss_kpt_rpn
self.loss_kpt_refine = loss_kpt_refine
self.loss_oks = loss_oks
self.loss_oks_refine = loss_oks_refine
self.loss_hm = loss_hm
if self.loss_cls.use_sigmoid:
self.cls_out_channels = num_classes
else:
self.cls_out_channels = num_classes + 1
self.positional_encoding = positional_encoding
self.transformer = transformer
self.embed_dims = self.transformer.embed_dims
# assert 'num_feats' in positional_encoding
num_feats = positional_encoding.num_pos_feats
assert num_feats * 2 == self.embed_dims, 'embed_dims should' \
f' be exactly 2 times of num_feats. Found {self.embed_dims}' \
f' and {num_feats}.'
self._init_layers()
self.init_weights()
def _init_layers(self):
"""Initialize classification branch and keypoint branch of head."""
fc_cls = nn.Linear(self.embed_dims, self.cls_out_channels)
kpt_branch = []
kpt_branch.append(nn.Linear(self.embed_dims, 512))
kpt_branch.append(nn.ReLU())
for _ in range(self.num_kpt_fcs):
kpt_branch.append(nn.Linear(512, 512))
kpt_branch.append(nn.ReLU())
kpt_branch.append(nn.Linear(512, 2 * self.num_keypoints))
kpt_branch = nn.Sequential(*kpt_branch)
def _get_clones(module, N):
return nn.LayerList([copy.deepcopy(module) for i in range(N)])
# last kpt_branch is used to generate proposal from
# encode feature map when as_two_stage is True.
num_pred = (self.transformer.decoder.num_layers + 1) if \
self.as_two_stage else self.transformer.decoder.num_layers
if self.with_kpt_refine:
self.cls_branches = _get_clones(fc_cls, num_pred)
self.kpt_branches = _get_clones(kpt_branch, num_pred)
else:
self.cls_branches = nn.LayerList([fc_cls for _ in range(num_pred)])
self.kpt_branches = nn.LayerList(
[kpt_branch for _ in range(num_pred)])
self.query_embedding = nn.Embedding(self.num_query, self.embed_dims * 2)
refine_kpt_branch = []
for _ in range(self.num_kpt_fcs):
refine_kpt_branch.append(
nn.Linear(self.embed_dims, self.embed_dims))
refine_kpt_branch.append(nn.ReLU())
refine_kpt_branch.append(nn.Linear(self.embed_dims, 2))
refine_kpt_branch = nn.Sequential(*refine_kpt_branch)
if self.with_kpt_refine:
num_pred = self.transformer.refine_decoder.num_layers
self.refine_kpt_branches = _get_clones(refine_kpt_branch, num_pred)
self.fc_hm = nn.Linear(self.embed_dims, self.num_keypoints)
def init_weights(self):
"""Initialize weights of the PETR head."""
self.transformer.init_weights()
if self.loss_cls.use_sigmoid:
bias_init = bias_init_with_prob(0.01)
for m in self.cls_branches:
constant_(m.bias, bias_init)
for m in self.kpt_branches:
constant_(m[-1].bias, 0)
# initialization of keypoint refinement branch
if self.with_kpt_refine:
for m in self.refine_kpt_branches:
constant_(m[-1].bias, 0)
# initialize bias for heatmap prediction
bias_init = bias_init_with_prob(0.1)
normal_(self.fc_hm.weight, std=0.01)
constant_(self.fc_hm.bias, bias_init)
def forward(self, mlvl_feats, img_metas):
"""Forward function.
Args:
mlvl_feats (tuple[Tensor]): Features from the upstream
network, each is a 4D-tensor with shape
(N, C, H, W).
img_metas (list[dict]): List of image information.
Returns:
outputs_classes (Tensor): Outputs from the classification head,
shape [nb_dec, bs, num_query, cls_out_channels]. Note
cls_out_channels should include background.
outputs_kpts (Tensor): Sigmoid outputs from the regression
                head with normalized coordinate format (x_{i}, y_{i}).
Shape [nb_dec, bs, num_query, K*2].
            enc_outputs_class (Tensor): The score of each point on the
                encode feature map, has shape (N, h*w, num_class). Only
                returned when as_two_stage is True, otherwise `None`.
            enc_outputs_kpt (Tensor): The proposal generated from the
                encode feature map, has shape (N, h*w, K*2). Only
                returned when as_two_stage is True, otherwise `None`.
"""
batch_size = mlvl_feats[0].shape[0]
input_img_h, input_img_w = img_metas[0]['batch_input_shape']
img_masks = paddle.zeros(
(batch_size, input_img_h, input_img_w), dtype=mlvl_feats[0].dtype)
for img_id in range(batch_size):
img_h, img_w, _ = img_metas[img_id]['img_shape']
img_masks[img_id, :img_h, :img_w] = 1
mlvl_masks = []
mlvl_positional_encodings = []
for feat in mlvl_feats:
mlvl_masks.append(
F.interpolate(
img_masks[None], size=feat.shape[-2:]).squeeze(0))
mlvl_positional_encodings.append(
self.positional_encoding(mlvl_masks[-1]).transpose(
[0, 3, 1, 2]))
query_embeds = self.query_embedding.weight
hs, init_reference, inter_references, \
enc_outputs_class, enc_outputs_kpt, hm_proto, memory = \
self.transformer(
mlvl_feats,
mlvl_masks,
query_embeds,
mlvl_positional_encodings,
kpt_branches=self.kpt_branches \
if self.with_kpt_refine else None, # noqa:E501
cls_branches=self.cls_branches \
if self.as_two_stage else None # noqa:E501
)
outputs_classes = []
outputs_kpts = []
for lvl in range(hs.shape[0]):
if lvl == 0:
reference = init_reference
else:
reference = inter_references[lvl - 1]
reference = inverse_sigmoid(reference)
outputs_class = self.cls_branches[lvl](hs[lvl])
tmp_kpt = self.kpt_branches[lvl](hs[lvl])
assert reference.shape[-1] == self.num_keypoints * 2
tmp_kpt += reference
outputs_kpt = F.sigmoid(tmp_kpt)
outputs_classes.append(outputs_class)
outputs_kpts.append(outputs_kpt)
outputs_classes = paddle.stack(outputs_classes)
outputs_kpts = paddle.stack(outputs_kpts)
if hm_proto is not None:
# get heatmap prediction (training phase)
hm_memory, hm_mask = hm_proto
hm_pred = self.fc_hm(hm_memory)
hm_proto = (hm_pred.transpose((0, 3, 1, 2)), hm_mask)
if self.as_two_stage:
return outputs_classes, outputs_kpts, \
enc_outputs_class, F.sigmoid(enc_outputs_kpt), \
hm_proto, memory, mlvl_masks
else:
raise RuntimeError('only "as_two_stage=True" is supported.')
def forward_refine(self, memory, mlvl_masks, refine_targets, losses,
img_metas):
"""Forward function.
Args:
mlvl_masks (tuple[Tensor]): The key_padding_mask from
different level used for encoder and decoder,
each is a 3D-tensor with shape (bs, H, W).
losses (dict[str, Tensor]): A dictionary of loss components.
img_metas (list[dict]): List of image information.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
kpt_preds, kpt_targets, area_targets, kpt_weights = refine_targets
pos_inds = kpt_weights.sum(-1) > 0
if not pos_inds.any():
pos_kpt_preds = paddle.zeros_like(kpt_preds[:1])
pos_img_inds = paddle.zeros([1], dtype="int64")
else:
pos_kpt_preds = kpt_preds[pos_inds]
pos_img_inds = (pos_inds.nonzero() /
self.num_query).squeeze(1).astype("int64")
hs, init_reference, inter_references = self.transformer.forward_refine(
mlvl_masks,
memory,
pos_kpt_preds.detach(),
pos_img_inds,
kpt_branches=self.refine_kpt_branches
if self.with_kpt_refine else None, # noqa:E501
)
outputs_kpts = []
for lvl in range(hs.shape[0]):
if lvl == 0:
reference = init_reference
else:
reference = inter_references[lvl - 1]
reference = inverse_sigmoid(reference)
tmp_kpt = self.refine_kpt_branches[lvl](hs[lvl])
assert reference.shape[-1] == 2
tmp_kpt += reference
outputs_kpt = F.sigmoid(tmp_kpt)
outputs_kpts.append(outputs_kpt)
outputs_kpts = paddle.stack(outputs_kpts)
if not self.training:
return outputs_kpts
num_valid_kpt = paddle.clip(
reduce_mean(kpt_weights.sum()), min=1).item()
num_total_pos = paddle.to_tensor(
[outputs_kpts.shape[1]], dtype=kpt_weights.dtype)
num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item()
if not pos_inds.any():
for i, kpt_refine_preds in enumerate(outputs_kpts):
loss_kpt = loss_oks = kpt_refine_preds.sum() * 0
losses[f'd{i}.loss_kpt_refine'] = loss_kpt
losses[f'd{i}.loss_oks_refine'] = loss_oks
continue
return losses
batch_size = mlvl_masks[0].shape[0]
factors = []
for img_id in range(batch_size):
img_h, img_w, _ = img_metas[img_id]['img_shape']
factor = paddle.to_tensor(
[img_w, img_h, img_w, img_h],
dtype="float32").squeeze(-1).unsqueeze(0).tile(
(self.num_query, 1))
factors.append(factor)
factors = paddle.concat(factors, 0)
factors = factors[pos_inds][:, :2].tile((1, kpt_preds.shape[-1] // 2))
pos_kpt_weights = kpt_weights[pos_inds]
pos_kpt_targets = kpt_targets[pos_inds]
pos_kpt_targets_scaled = pos_kpt_targets * factors
pos_areas = area_targets[pos_inds]
pos_valid = kpt_weights[pos_inds][:, 0::2]
for i, kpt_refine_preds in enumerate(outputs_kpts):
if not pos_inds.any():
print("refine kpt and oks skip")
loss_kpt = loss_oks = kpt_refine_preds.sum() * 0
losses[f'd{i}.loss_kpt_refine'] = loss_kpt
losses[f'd{i}.loss_oks_refine'] = loss_oks
continue
# kpt L1 Loss
pos_refine_preds = kpt_refine_preds.reshape(
(kpt_refine_preds.shape[0], -1))
loss_kpt = self.loss_kpt_refine(
pos_refine_preds,
pos_kpt_targets,
pos_kpt_weights,
avg_factor=num_valid_kpt)
losses[f'd{i}.loss_kpt_refine'] = loss_kpt
# kpt oks loss
pos_refine_preds_scaled = pos_refine_preds * factors
assert (pos_areas > 0).all()
loss_oks = self.loss_oks_refine(
pos_refine_preds_scaled,
pos_kpt_targets_scaled,
pos_valid,
pos_areas,
avg_factor=num_total_pos)
losses[f'd{i}.loss_oks_refine'] = loss_oks
return losses
    # Overridden because img_metas are needed as inputs for bbox_head.
def forward_train(self,
x,
img_metas,
gt_bboxes,
gt_labels=None,
gt_keypoints=None,
gt_areas=None,
gt_bboxes_ignore=None,
proposal_cfg=None,
**kwargs):
"""Forward function for training mode.
Args:
x (list[Tensor]): Features from backbone.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes (list[Tensor]): Ground truth bboxes of the image,
shape (num_gts, 4).
gt_labels (list[Tensor]): Ground truth labels of each box,
shape (num_gts,).
gt_keypoints (list[Tensor]): Ground truth keypoints of the image,
shape (num_gts, K*3).
gt_areas (list[Tensor]): Ground truth mask areas of each box,
shape (num_gts,).
gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be
ignored, shape (num_ignored_gts, 4).
proposal_cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert proposal_cfg is None, '"proposal_cfg" must be None'
outs = self(x, img_metas)
memory, mlvl_masks = outs[-2:]
outs = outs[:-2]
if gt_labels is None:
loss_inputs = outs + (gt_bboxes, gt_keypoints, gt_areas, img_metas)
else:
loss_inputs = outs + (gt_bboxes, gt_labels, gt_keypoints, gt_areas,
img_metas)
losses_and_targets = self.loss(
*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
# losses = losses_and_targets
losses, refine_targets = losses_and_targets
# get pose refinement loss
losses = self.forward_refine(memory, mlvl_masks, refine_targets, losses,
img_metas)
return losses
def loss(self,
all_cls_scores,
all_kpt_preds,
enc_cls_scores,
enc_kpt_preds,
enc_hm_proto,
gt_bboxes_list,
gt_labels_list,
gt_keypoints_list,
gt_areas_list,
img_metas,
gt_bboxes_ignore=None):
"""Loss function.
Args:
all_cls_scores (Tensor): Classification score of all
decoder layers, has shape
[nb_dec, bs, num_query, cls_out_channels].
all_kpt_preds (Tensor): Sigmoid regression
outputs of all decode layers. Each is a 4D-tensor with
normalized coordinate format (x_{i}, y_{i}) and shape
[nb_dec, bs, num_query, K*2].
enc_cls_scores (Tensor): Classification scores of
points on encode feature map, has shape
(N, h*w, num_classes). Only be passed when as_two_stage is
True, otherwise is None.
enc_kpt_preds (Tensor): Regression results of each points
on the encode feature map, has shape (N, h*w, K*2). Only be
passed when as_two_stage is True, otherwise is None.
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
gt_keypoints_list (list[Tensor]): Ground truth keypoints for each
image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v,
..., p^{K}_x, p^{K}_y, p^{K}_v] format.
gt_areas_list (list[Tensor]): Ground truth mask areas for each
image with shape (num_gts, ).
img_metas (list[dict]): List of image meta information.
gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
which can be ignored for each image. Default None.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'gt_bboxes_ignore being set to None.'
num_dec_layers = len(all_cls_scores)
all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
all_gt_keypoints_list = [
gt_keypoints_list for _ in range(num_dec_layers)
]
all_gt_areas_list = [gt_areas_list for _ in range(num_dec_layers)]
img_metas_list = [img_metas for _ in range(num_dec_layers)]
losses_cls, losses_kpt, losses_oks, kpt_preds_list, kpt_targets_list, \
area_targets_list, kpt_weights_list = multi_apply(
self.loss_single, all_cls_scores, all_kpt_preds,
all_gt_labels_list, all_gt_keypoints_list,
all_gt_areas_list, img_metas_list)
loss_dict = dict()
# loss of proposal generated from encode feature map.
if enc_cls_scores is not None:
binary_labels_list = [
paddle.zeros_like(gt_labels_list[i])
for i in range(len(img_metas))
]
enc_loss_cls, enc_losses_kpt = \
self.loss_single_rpn(
enc_cls_scores, enc_kpt_preds, binary_labels_list,
gt_keypoints_list, gt_areas_list, img_metas)
loss_dict['enc_loss_cls'] = enc_loss_cls
loss_dict['enc_loss_kpt'] = enc_losses_kpt
# loss from the last decoder layer
loss_dict['loss_cls'] = losses_cls[-1]
loss_dict['loss_kpt'] = losses_kpt[-1]
loss_dict['loss_oks'] = losses_oks[-1]
# loss from other decoder layers
num_dec_layer = 0
for loss_cls_i, loss_kpt_i, loss_oks_i in zip(
losses_cls[:-1], losses_kpt[:-1], losses_oks[:-1]):
loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
loss_dict[f'd{num_dec_layer}.loss_kpt'] = loss_kpt_i
loss_dict[f'd{num_dec_layer}.loss_oks'] = loss_oks_i
num_dec_layer += 1
# losses of heatmap generated from P3 feature map
hm_pred, hm_mask = enc_hm_proto
loss_hm = self.loss_heatmap(hm_pred, hm_mask, gt_keypoints_list,
gt_labels_list, gt_bboxes_list)
loss_dict['loss_hm'] = loss_hm
return loss_dict, (kpt_preds_list[-1], kpt_targets_list[-1],
area_targets_list[-1], kpt_weights_list[-1])
def loss_heatmap(self, hm_pred, hm_mask, gt_keypoints, gt_labels,
gt_bboxes):
assert hm_pred.shape[-2:] == hm_mask.shape[-2:]
num_img, _, h, w = hm_pred.shape
# placeholder of heatmap target (Gaussian distribution)
hm_target = paddle.zeros(hm_pred.shape, hm_pred.dtype)
for i, (gt_label, gt_bbox, gt_keypoint
) in enumerate(zip(gt_labels, gt_bboxes, gt_keypoints)):
if gt_label.shape[0] == 0:
continue
gt_keypoint = gt_keypoint.reshape((gt_keypoint.shape[0], -1,
3)).clone()
gt_keypoint[..., :2] /= 8
assert gt_keypoint[..., 0].max() <= w + 0.5 # new coordinate system
assert gt_keypoint[..., 1].max() <= h + 0.5 # new coordinate system
gt_bbox /= 8
gt_w = gt_bbox[:, 2] - gt_bbox[:, 0]
gt_h = gt_bbox[:, 3] - gt_bbox[:, 1]
for j in range(gt_label.shape[0]):
# get heatmap radius
kp_radius = paddle.clip(
paddle.floor(
gaussian_radius(
(gt_h[j], gt_w[j]), min_overlap=0.9)),
min=0,
max=3)
for k in range(self.num_keypoints):
if gt_keypoint[j, k, 2] > 0:
gt_kp = gt_keypoint[j, k, :2]
gt_kp_int = paddle.floor(gt_kp)
hm_target[i, k] = draw_umich_gaussian(
hm_target[i, k], gt_kp_int, kp_radius)
# compute heatmap loss
hm_pred = paddle.clip(
F.sigmoid(hm_pred), min=1e-4, max=1 - 1e-4) # refer to CenterNet
loss_hm = self.loss_hm(
hm_pred,
hm_target.detach(),
mask=~hm_mask.astype("bool").unsqueeze(1))
return loss_hm
def loss_single(self, cls_scores, kpt_preds, gt_labels_list,
gt_keypoints_list, gt_areas_list, img_metas):
"""Loss function for outputs from a single decoder layer of a single
feature level.
Args:
cls_scores (Tensor): Box score logits from a single decoder layer
for all images. Shape [bs, num_query, cls_out_channels].
kpt_preds (Tensor): Sigmoid outputs from a single decoder layer
for all images, with normalized coordinate (x_{i}, y_{i}) and
shape [bs, num_query, K*2].
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
gt_keypoints_list (list[Tensor]): Ground truth keypoints for each
image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v,
..., p^{K}_x, p^{K}_y, p^{K}_v] format.
gt_areas_list (list[Tensor]): Ground truth mask areas for each
image with shape (num_gts, ).
img_metas (list[dict]): List of image meta information.
Returns:
dict[str, Tensor]: A dictionary of loss components for outputs from
a single decoder layer.
"""
num_imgs = cls_scores.shape[0]
cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
kpt_preds_list = [kpt_preds[i] for i in range(num_imgs)]
cls_reg_targets = self.get_targets(cls_scores_list, kpt_preds_list,
gt_labels_list, gt_keypoints_list,
gt_areas_list, img_metas)
(labels_list, label_weights_list, kpt_targets_list, kpt_weights_list,
area_targets_list, num_total_pos, num_total_neg) = cls_reg_targets
labels = paddle.concat(labels_list, 0)
label_weights = paddle.concat(label_weights_list, 0)
kpt_targets = paddle.concat(kpt_targets_list, 0)
kpt_weights = paddle.concat(kpt_weights_list, 0)
area_targets = paddle.concat(area_targets_list, 0)
# classification loss
cls_scores = cls_scores.reshape((-1, self.cls_out_channels))
# construct weighted avg_factor to match with the official DETR repo
cls_avg_factor = num_total_pos * 1.0 + \
num_total_neg * self.bg_cls_weight
if self.sync_cls_avg_factor:
cls_avg_factor = reduce_mean(
paddle.to_tensor(
[cls_avg_factor], dtype=cls_scores.dtype))
cls_avg_factor = max(cls_avg_factor, 1)
loss_cls = self.loss_cls(
cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
        # Compute the average number of gt keypoints across all GPUs, for
# normalization purposes
num_total_pos = paddle.to_tensor([num_total_pos], dtype=loss_cls.dtype)
num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item()
# construct factors used for rescale keypoints
factors = []
for img_meta, kpt_pred in zip(img_metas, kpt_preds):
img_h, img_w, _ = img_meta['img_shape']
factor = paddle.to_tensor(
[img_w, img_h, img_w, img_h],
dtype=kpt_pred.dtype).squeeze().unsqueeze(0).tile(
(kpt_pred.shape[0], 1))
factors.append(factor)
factors = paddle.concat(factors, 0)
# keypoint regression loss
kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1]))
num_valid_kpt = paddle.clip(
reduce_mean(kpt_weights.sum()), min=1).item()
# assert num_valid_kpt == (kpt_targets>0).sum().item()
loss_kpt = self.loss_kpt(
kpt_preds,
kpt_targets.detach(),
kpt_weights.detach(),
avg_factor=num_valid_kpt)
# keypoint oks loss
pos_inds = kpt_weights.sum(-1) > 0
if not pos_inds.any():
loss_oks = kpt_preds.sum() * 0
else:
factors = factors[pos_inds][:, :2].tile((
(1, kpt_preds.shape[-1] // 2)))
pos_kpt_preds = kpt_preds[pos_inds] * factors
pos_kpt_targets = kpt_targets[pos_inds] * factors
pos_areas = area_targets[pos_inds]
pos_valid = kpt_weights[pos_inds][..., 0::2]
assert (pos_areas > 0).all()
loss_oks = self.loss_oks(
pos_kpt_preds,
pos_kpt_targets,
pos_valid,
pos_areas,
avg_factor=num_total_pos)
return loss_cls, loss_kpt, loss_oks, kpt_preds, kpt_targets, \
area_targets, kpt_weights
def get_targets(self, cls_scores_list, kpt_preds_list, gt_labels_list,
gt_keypoints_list, gt_areas_list, img_metas):
"""Compute regression and classification targets for a batch image.
Outputs from a single decoder layer of a single feature level are used.
Args:
cls_scores_list (list[Tensor]): Box score logits from a single
decoder layer for each image with shape [num_query,
cls_out_channels].
kpt_preds_list (list[Tensor]): Sigmoid outputs from a single
decoder layer for each image, with normalized coordinate
(x_{i}, y_{i}) and shape [num_query, K*2].
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
gt_keypoints_list (list[Tensor]): Ground truth keypoints for each
image with shape (num_gts, K*3).
gt_areas_list (list[Tensor]): Ground truth mask areas for each
image with shape (num_gts, ).
img_metas (list[dict]): List of image meta information.
Returns:
tuple: a tuple containing the following targets.
- labels_list (list[Tensor]): Labels for all images.
- label_weights_list (list[Tensor]): Label weights for all
images.
- kpt_targets_list (list[Tensor]): Keypoint targets for all
images.
- kpt_weights_list (list[Tensor]): Keypoint weights for all
images.
- area_targets_list (list[Tensor]): area targets for all
images.
- num_total_pos (int): Number of positive samples in all
images.
- num_total_neg (int): Number of negative samples in all
images.
"""
(labels_list, label_weights_list, kpt_targets_list, kpt_weights_list,
area_targets_list, pos_inds_list, neg_inds_list) = multi_apply(
self._get_target_single, cls_scores_list, kpt_preds_list,
gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas)
num_total_pos = sum((inds.numel() for inds in pos_inds_list))
num_total_neg = sum((inds.numel() for inds in neg_inds_list))
return (labels_list, label_weights_list, kpt_targets_list,
kpt_weights_list, area_targets_list, num_total_pos,
num_total_neg)
def _get_target_single(self, cls_score, kpt_pred, gt_labels, gt_keypoints,
gt_areas, img_meta):
"""Compute regression and classification targets for one image.
Outputs from a single decoder layer of a single feature level are used.
Args:
cls_score (Tensor): Box score logits from a single decoder layer
for one image. Shape [num_query, cls_out_channels].
kpt_pred (Tensor): Sigmoid outputs from a single decoder layer
for one image, with normalized coordinate (x_{i}, y_{i}) and
shape [num_query, K*2].
gt_labels (Tensor): Ground truth class indices for one image
with shape (num_gts, ).
gt_keypoints (Tensor): Ground truth keypoints for one image with
shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, ..., \
p^{K}_x, p^{K}_y, p^{K}_v] format.
gt_areas (Tensor): Ground truth mask areas for one image
with shape (num_gts, ).
img_meta (dict): Meta information for one image.
Returns:
tuple[Tensor]: a tuple containing the following for one image.
- labels (Tensor): Labels of each image.
- label_weights (Tensor): Label weights of each image.
- kpt_targets (Tensor): Keypoint targets of each image.
- kpt_weights (Tensor): Keypoint weights of each image.
- area_targets (Tensor): Area targets of each image.
- pos_inds (Tensor): Sampled positive indices for each image.
- neg_inds (Tensor): Sampled negative indices for each image.
"""
num_bboxes = kpt_pred.shape[0]
# assigner and sampler
assign_result = self.assigner.assign(cls_score, kpt_pred, gt_labels,
gt_keypoints, gt_areas, img_meta)
sampling_result = self.sampler.sample(assign_result, kpt_pred,
gt_keypoints)
pos_inds = sampling_result.pos_inds
neg_inds = sampling_result.neg_inds
# label targets
labels = paddle.full((num_bboxes, ), self.num_classes, dtype="int64")
label_weights = paddle.ones((num_bboxes, ), dtype=gt_labels.dtype)
kpt_targets = paddle.zeros_like(kpt_pred)
kpt_weights = paddle.zeros_like(kpt_pred)
area_targets = paddle.zeros((kpt_pred.shape[0], ), dtype=kpt_pred.dtype)
if pos_inds.size == 0:
return (labels, label_weights, kpt_targets, kpt_weights,
area_targets, pos_inds, neg_inds)
labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds][
..., 0].astype("int64")
img_h, img_w, _ = img_meta['img_shape']
# keypoint targets
pos_gt_kpts = gt_keypoints[sampling_result.pos_assigned_gt_inds]
pos_gt_kpts = pos_gt_kpts.reshape(
(len(sampling_result.pos_assigned_gt_inds), -1, 3))
valid_idx = pos_gt_kpts[:, :, 2] > 0
pos_kpt_weights = kpt_weights[pos_inds].reshape(
(pos_gt_kpts.shape[0], kpt_weights.shape[-1] // 2, 2))
# pos_kpt_weights[valid_idx][...] = 1.0
pos_kpt_weights = masked_fill(pos_kpt_weights,
valid_idx.unsqueeze(-1), 1.0)
kpt_weights[pos_inds] = pos_kpt_weights.reshape(
(pos_kpt_weights.shape[0], kpt_pred.shape[-1]))
factor = paddle.to_tensor(
[img_w, img_h], dtype=kpt_pred.dtype).squeeze().unsqueeze(0)
pos_gt_kpts_normalized = pos_gt_kpts[..., :2]
pos_gt_kpts_normalized[..., 0] = pos_gt_kpts_normalized[..., 0] / \
factor[:, 0:1]
pos_gt_kpts_normalized[..., 1] = pos_gt_kpts_normalized[..., 1] / \
factor[:, 1:2]
kpt_targets[pos_inds] = pos_gt_kpts_normalized.reshape(
(pos_gt_kpts.shape[0], kpt_pred.shape[-1]))
pos_gt_areas = gt_areas[sampling_result.pos_assigned_gt_inds][..., 0]
area_targets[pos_inds] = pos_gt_areas
return (labels, label_weights, kpt_targets, kpt_weights, area_targets,
pos_inds, neg_inds)
def loss_single_rpn(self, cls_scores, kpt_preds, gt_labels_list,
gt_keypoints_list, gt_areas_list, img_metas):
"""Loss function for outputs from a single decoder layer of a single
feature level.
Args:
cls_scores (Tensor): Box score logits from a single decoder layer
for all images. Shape [bs, num_query, cls_out_channels].
kpt_preds (Tensor): Sigmoid outputs from a single decoder layer
for all images, with normalized coordinate (x_{i}, y_{i}) and
shape [bs, num_query, K*2].
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
gt_keypoints_list (list[Tensor]): Ground truth keypoints for each
image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v,
..., p^{K}_x, p^{K}_y, p^{K}_v] format.
gt_areas_list (list[Tensor]): Ground truth mask areas for each
image with shape (num_gts, ).
img_metas (list[dict]): List of image meta information.
Returns:
dict[str, Tensor]: A dictionary of loss components for outputs from
a single decoder layer.
"""
num_imgs = cls_scores.shape[0]
cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
kpt_preds_list = [kpt_preds[i] for i in range(num_imgs)]
cls_reg_targets = self.get_targets(cls_scores_list, kpt_preds_list,
gt_labels_list, gt_keypoints_list,
gt_areas_list, img_metas)
(labels_list, label_weights_list, kpt_targets_list, kpt_weights_list,
area_targets_list, num_total_pos, num_total_neg) = cls_reg_targets
labels = paddle.concat(labels_list, 0)
label_weights = paddle.concat(label_weights_list, 0)
kpt_targets = paddle.concat(kpt_targets_list, 0)
kpt_weights = paddle.concat(kpt_weights_list, 0)
# classification loss
cls_scores = cls_scores.reshape((-1, self.cls_out_channels))
# construct weighted avg_factor to match with the official DETR repo
cls_avg_factor = num_total_pos * 1.0 + \
num_total_neg * self.bg_cls_weight
if self.sync_cls_avg_factor:
cls_avg_factor = reduce_mean(
paddle.to_tensor(
[cls_avg_factor], dtype=cls_scores.dtype))
        cls_avg_factor = max(cls_avg_factor, 1)
loss_cls = self.loss_cls(
cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
        # Compute the average number of gt keypoints across all GPUs, for
# normalization purposes
# num_total_pos = loss_cls.to_tensor([num_total_pos])
# num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item()
# keypoint regression loss
kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1]))
num_valid_kpt = paddle.clip(
reduce_mean(kpt_weights.sum()), min=1).item()
# assert num_valid_kpt == (kpt_targets>0).sum().item()
loss_kpt = self.loss_kpt_rpn(
kpt_preds, kpt_targets, kpt_weights, avg_factor=num_valid_kpt)
return loss_cls, loss_kpt
def get_bboxes(self,
all_cls_scores,
all_kpt_preds,
enc_cls_scores,
enc_kpt_preds,
hm_proto,
memory,
mlvl_masks,
img_metas,
rescale=False):
"""Transform network outputs for a batch into bbox predictions.
Args:
all_cls_scores (Tensor): Classification score of all
decoder layers, has shape
[nb_dec, bs, num_query, cls_out_channels].
all_kpt_preds (Tensor): Sigmoid regression
outputs of all decode layers. Each is a 4D-tensor with
normalized coordinate format (x_{i}, y_{i}) and shape
[nb_dec, bs, num_query, K*2].
enc_cls_scores (Tensor): Classification scores of points on
encode feature map, has shape (N, h*w, num_classes).
Only be passed when as_two_stage is True, otherwise is None.
enc_kpt_preds (Tensor): Regression results of each points
on the encode feature map, has shape (N, h*w, K*2). Only be
passed when as_two_stage is True, otherwise is None.
img_metas (list[dict]): Meta information of each image.
rescale (bool, optional): If True, return boxes in original
                image space. Default: False.
Returns:
            list[tuple[Tensor, Tensor, Tensor]]: Each item in result_list
                is a 3-tuple.
The first item is an (n, 5) tensor, where the first 4 columns
are bounding box positions (tl_x, tl_y, br_x, br_y) and the
5-th column is a score between 0 and 1. The second item is a
(n,) tensor where each item is the predicted class label of
the corresponding box. The third item is an (n, K, 3) tensor
with [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, p^{K}_y,
p^{K}_v] format.
"""
cls_scores = all_cls_scores[-1]
kpt_preds = all_kpt_preds[-1]
result_list = []
for img_id in range(len(img_metas)):
cls_score = cls_scores[img_id]
kpt_pred = kpt_preds[img_id]
img_shape = img_metas[img_id]['img_shape']
scale_factor = img_metas[img_id]['scale_factor']
# TODO: only support single image test
# memory_i = memory[:, img_id, :]
# mlvl_mask = mlvl_masks[img_id]
proposals = self._get_bboxes_single(cls_score, kpt_pred, img_shape,
scale_factor, memory,
mlvl_masks, rescale)
result_list.append(proposals)
return result_list
def _get_bboxes_single(self,
cls_score,
kpt_pred,
img_shape,
scale_factor,
memory,
mlvl_masks,
rescale=False):
"""Transform outputs from the last decoder layer into bbox predictions
for each image.
Args:
cls_score (Tensor): Box score logits from the last decoder layer
for each image. Shape [num_query, cls_out_channels].
kpt_pred (Tensor): Sigmoid outputs from the last decoder layer
for each image, with coordinate format (x_{i}, y_{i}) and
shape [num_query, K*2].
img_shape (tuple[int]): Shape of input image, (height, width, 3).
            scale_factor (ndarray, optional): Scale factor of the image,
                arranged as (w_scale, h_scale, w_scale, h_scale).
rescale (bool, optional): If True, return boxes in original image
space. Default False.
Returns:
tuple[Tensor]: Results of detected bboxes and labels.
- det_bboxes: Predicted bboxes with shape [num_query, 5],
where the first 4 columns are bounding box positions
(tl_x, tl_y, br_x, br_y) and the 5-th column are scores
between 0 and 1.
- det_labels: Predicted labels of the corresponding box with
shape [num_query].
- det_kpts: Predicted keypoints with shape [num_query, K, 3].
"""
assert len(cls_score) == len(kpt_pred)
max_per_img = self.test_cfg.get('max_per_img', self.num_query)
# exclude background
if self.loss_cls.use_sigmoid:
cls_score = F.sigmoid(cls_score)
scores, indexs = cls_score.reshape([-1]).topk(max_per_img)
det_labels = indexs % self.num_classes
bbox_index = indexs // self.num_classes
kpt_pred = kpt_pred[bbox_index]
else:
scores, det_labels = F.softmax(cls_score, axis=-1)[..., :-1].max(-1)
scores, bbox_index = scores.topk(max_per_img)
kpt_pred = kpt_pred[bbox_index]
det_labels = det_labels[bbox_index]
# ----- results after pose decoder -----
# det_kpts = kpt_pred.reshape((kpt_pred.shape[0], -1, 2))
# ----- results after joint decoder (default) -----
# import time
# start = time.time()
refine_targets = (kpt_pred, None, None, paddle.ones_like(kpt_pred))
refine_outputs = self.forward_refine(memory, mlvl_masks, refine_targets,
None, None)
# end = time.time()
# print(f'refine time: {end - start:.6f}')
det_kpts = refine_outputs[-1]
det_kpts[..., 0] = det_kpts[..., 0] * img_shape[1]
det_kpts[..., 1] = det_kpts[..., 1] * img_shape[0]
det_kpts[..., 0].clip_(min=0, max=img_shape[1])
det_kpts[..., 1].clip_(min=0, max=img_shape[0])
if rescale:
det_kpts /= paddle.to_tensor(
scale_factor[:2],
dtype=det_kpts.dtype).unsqueeze(0).unsqueeze(0)
# use circumscribed rectangle box of keypoints as det bboxes
x1 = det_kpts[..., 0].min(axis=1, keepdim=True)
y1 = det_kpts[..., 1].min(axis=1, keepdim=True)
x2 = det_kpts[..., 0].max(axis=1, keepdim=True)
y2 = det_kpts[..., 1].max(axis=1, keepdim=True)
det_bboxes = paddle.concat([x1, y1, x2, y2], axis=1)
det_bboxes = paddle.concat((det_bboxes, scores.unsqueeze(1)), -1)
det_kpts = paddle.concat(
(det_kpts, paddle.ones(
det_kpts[..., :1].shape, dtype=det_kpts.dtype)),
axis=2)
return det_bboxes, det_labels, det_kpts
def simple_test(self, feats, img_metas, rescale=False):
"""Test det bboxes without test-time augmentation.
Args:
feats (tuple[paddle.Tensor]): Multi-level features from the
upstream network, each is a 4D-tensor.
img_metas (list[dict]): List of image information.
rescale (bool, optional): Whether to rescale the results.
Defaults to False.
Returns:
list[tuple[Tensor, Tensor, Tensor]]: Each item in result_list is
                a 3-tuple. The first item is ``bboxes`` with shape (n, 5),
                where the 5 columns are (tl_x, tl_y, br_x, br_y, score).
                The second item is ``labels`` with shape (n,). The third
                item is ``kpts`` with shape (n, K, 3), in [p^{1}_x, p^{1}_y,
                p^{1}_v, ..., p^{K}_x, p^{K}_y, p^{K}_v] format.
"""
# forward of this head requires img_metas
outs = self.forward(feats, img_metas)
results_list = self.get_bboxes(*outs, img_metas, rescale=rescale)
return results_list
def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes):
return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes)
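# A minimal sketch (shapes assumed, not part of the original class) of the
# sigmoid top-k decoding used in `_get_bboxes_single`: a flat top-k over the
# [num_query * num_classes] scores encodes both the query and its class.
def _demo_topk_decode(num_query=4, num_classes=3, k=2):
    scores = paddle.rand([num_query, num_classes])
    topk_scores, flat_inds = scores.reshape([-1]).topk(k)
    det_labels = flat_inds % num_classes   # class id per kept prediction
    query_inds = flat_inds // num_classes  # originating query index
    return topk_scores, det_labels, query_inds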
================================================
FILE: ppdet/modeling/heads/pico_head.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Normal, Constant
from ppdet.modeling.ops import get_static_shape
from ..initializer import normal_
from ..assigners.utils import generate_anchors_for_grid_cell
from ..bbox_utils import bbox_center, batch_distance2bbox, bbox2distance
from ppdet.core.workspace import register
from ppdet.modeling.layers import ConvNormLayer
from .simota_head import OTAVFLHead
from .gfl_head import Integral, GFLHead
from ppdet.modeling.necks.csp_pan import DPModule
eps = 1e-9
__all__ = ['PicoHead', 'PicoHeadV2', 'PicoFeat']
def npu_avg_pool2d(feat, w, h):
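    # Global average pooling fallback for NPU devices: take the mean over the
    # flattened spatial dims, then reshape back. Only valid for a target
    # output size of w == h == 1, which is how it is used in this file.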
batch_size, channels, _, _ = feat.shape
feat_flat = paddle.reshape(feat, [batch_size, channels, -1])
feat_mean = paddle.mean(feat_flat, axis=2)
feat_mean = paddle.reshape(
feat_mean, [batch_size, channels, w, h])
return feat_mean
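# Equivalence sketch (illustrative, not part of the original module): for the
# 1 x 1 target size used above, the flatten-and-mean fallback matches
# F.adaptive_avg_pool2d.
def _demo_npu_avg_pool2d():
    feat = paddle.rand([2, 8, 5, 7])
    pooled = npu_avg_pool2d(feat, 1, 1)
    reference = F.adaptive_avg_pool2d(feat, (1, 1))
    return paddle.allclose(pooled, reference)  # True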
class PicoSE(nn.Layer):
def __init__(self, feat_channels):
super(PicoSE, self).__init__()
self.fc = nn.Conv2D(feat_channels, feat_channels, 1)
self.conv = ConvNormLayer(feat_channels, feat_channels, 1, 1)
self._init_weights()
def _init_weights(self):
normal_(self.fc.weight, std=0.001)
def forward(self, feat, avg_feat):
weight = F.sigmoid(self.fc(avg_feat))
out = self.conv(feat * weight)
return out
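# Usage sketch (hypothetical channel count): PicoSE gates the feature map
# channel-wise with a sigmoid weight computed from its global average.
def _demo_pico_se():
    se = PicoSE(feat_channels=96)
    feat = paddle.rand([1, 96, 20, 20])
    avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))
    return se(feat, avg_feat)  # same shape as feat: [1, 96, 20, 20]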
@register
class PicoFeat(nn.Layer):
"""
PicoFeat of PicoDet
Args:
feat_in (int): The channel number of input Tensor.
feat_out (int): The channel number of output Tensor.
        num_convs (int): The number of convolution layers in the head.
norm_type (str): Normalization type, 'bn'/'sync_bn'/'gn'.
share_cls_reg (bool): Whether to share the cls and reg output.
        act (str): The activation function applied in each layer.
use_se (bool): Whether to use se module.
"""
def __init__(self,
feat_in=256,
feat_out=96,
num_fpn_stride=3,
num_convs=2,
norm_type='bn',
share_cls_reg=False,
act='hard_swish',
use_se=False):
super(PicoFeat, self).__init__()
self.num_convs = num_convs
self.norm_type = norm_type
self.share_cls_reg = share_cls_reg
self.act = act
self.use_se = use_se
self.cls_convs = []
self.reg_convs = []
if paddle.device.get_device().startswith("npu"):
self.device = "npu"
else:
self.device = None
if use_se:
assert share_cls_reg == True, \
'In the case of using se, share_cls_reg must be set to True'
self.se = nn.LayerList()
for stage_idx in range(num_fpn_stride):
cls_subnet_convs = []
reg_subnet_convs = []
for i in range(self.num_convs):
in_c = feat_in if i == 0 else feat_out
cls_conv_dw = self.add_sublayer(
'cls_conv_dw{}.{}'.format(stage_idx, i),
ConvNormLayer(
ch_in=in_c,
ch_out=feat_out,
filter_size=5,
stride=1,
groups=feat_out,
norm_type=norm_type,
bias_on=False,
lr_scale=2.))
cls_subnet_convs.append(cls_conv_dw)
cls_conv_pw = self.add_sublayer(
'cls_conv_pw{}.{}'.format(stage_idx, i),
ConvNormLayer(
ch_in=in_c,
ch_out=feat_out,
filter_size=1,
stride=1,
norm_type=norm_type,
bias_on=False,
lr_scale=2.))
cls_subnet_convs.append(cls_conv_pw)
if not self.share_cls_reg:
reg_conv_dw = self.add_sublayer(
'reg_conv_dw{}.{}'.format(stage_idx, i),
ConvNormLayer(
ch_in=in_c,
ch_out=feat_out,
filter_size=5,
stride=1,
groups=feat_out,
norm_type=norm_type,
bias_on=False,
lr_scale=2.))
reg_subnet_convs.append(reg_conv_dw)
reg_conv_pw = self.add_sublayer(
'reg_conv_pw{}.{}'.format(stage_idx, i),
ConvNormLayer(
ch_in=in_c,
ch_out=feat_out,
filter_size=1,
stride=1,
norm_type=norm_type,
bias_on=False,
lr_scale=2.))
reg_subnet_convs.append(reg_conv_pw)
self.cls_convs.append(cls_subnet_convs)
self.reg_convs.append(reg_subnet_convs)
if use_se:
self.se.append(PicoSE(feat_out))
def act_func(self, x):
if self.act == "leaky_relu":
x = F.leaky_relu(x)
elif self.act == "hard_swish":
x = F.hardswish(x)
elif self.act == "relu6":
x = F.relu6(x)
return x
def forward(self, fpn_feat, stage_idx):
assert stage_idx < len(self.cls_convs)
cls_feat = fpn_feat
reg_feat = fpn_feat
for i in range(len(self.cls_convs[stage_idx])):
cls_feat = self.act_func(self.cls_convs[stage_idx][i](cls_feat))
reg_feat = cls_feat
if not self.share_cls_reg:
reg_feat = self.act_func(self.reg_convs[stage_idx][i](reg_feat))
if self.use_se:
if self.device == "npu":
avg_feat = npu_avg_pool2d(cls_feat, 1, 1)
else:
avg_feat = F.adaptive_avg_pool2d(cls_feat, (1, 1))
se_feat = self.act_func(self.se[stage_idx](cls_feat, avg_feat))
return cls_feat, se_feat
return cls_feat, reg_feat
@register
class PicoHead(OTAVFLHead):
"""
PicoHead
Args:
conv_feat (object): Instance of 'PicoFeat'
num_classes (int): Number of classes
fpn_stride (list): The stride of each FPN Layer
prior_prob (float): Used to set the bias init for the class prediction layer
loss_class (object): Instance of VariFocalLoss.
loss_dfl (object): Instance of DistributionFocalLoss.
loss_bbox (object): Instance of bbox loss.
assigner (object): Instance of label assigner.
        reg_max: Max value of the integral set :math:`{0, ..., reg_max}`
            in QFL setting. Default: 16.
"""
__inject__ = [
'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox',
'assigner', 'nms'
]
__shared__ = ['num_classes', 'eval_size']
def __init__(self,
conv_feat='PicoFeat',
dgqp_module=None,
num_classes=80,
fpn_stride=[8, 16, 32],
prior_prob=0.01,
loss_class='VariFocalLoss',
loss_dfl='DistributionFocalLoss',
loss_bbox='GIoULoss',
assigner='SimOTAAssigner',
reg_max=16,
feat_in_chan=96,
nms=None,
nms_pre=1000,
cell_offset=0,
eval_size=None):
super(PicoHead, self).__init__(
conv_feat=conv_feat,
dgqp_module=dgqp_module,
num_classes=num_classes,
fpn_stride=fpn_stride,
prior_prob=prior_prob,
loss_class=loss_class,
loss_dfl=loss_dfl,
loss_bbox=loss_bbox,
assigner=assigner,
reg_max=reg_max,
feat_in_chan=feat_in_chan,
nms=nms,
nms_pre=nms_pre,
cell_offset=cell_offset)
self.conv_feat = conv_feat
self.num_classes = num_classes
self.fpn_stride = fpn_stride
self.prior_prob = prior_prob
self.loss_vfl = loss_class
self.loss_dfl = loss_dfl
self.loss_bbox = loss_bbox
self.assigner = assigner
self.reg_max = reg_max
self.feat_in_chan = feat_in_chan
self.nms = nms
self.nms_pre = nms_pre
self.cell_offset = cell_offset
self.eval_size = eval_size
self.device = paddle.device.get_device()
self.use_sigmoid = self.loss_vfl.use_sigmoid
if self.use_sigmoid:
self.cls_out_channels = self.num_classes
else:
self.cls_out_channels = self.num_classes + 1
bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob)
# Clear the super class initialization
self.gfl_head_cls = None
self.gfl_head_reg = None
self.scales_regs = None
self.head_cls_list = []
self.head_reg_list = []
for i in range(len(fpn_stride)):
head_cls = self.add_sublayer(
"head_cls" + str(i),
nn.Conv2D(
in_channels=self.feat_in_chan,
out_channels=self.cls_out_channels + 4 * (self.reg_max + 1)
if self.conv_feat.share_cls_reg else self.cls_out_channels,
kernel_size=1,
stride=1,
padding=0,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(
initializer=Constant(value=bias_init_value))))
self.head_cls_list.append(head_cls)
if not self.conv_feat.share_cls_reg:
head_reg = self.add_sublayer(
"head_reg" + str(i),
nn.Conv2D(
in_channels=self.feat_in_chan,
out_channels=4 * (self.reg_max + 1),
kernel_size=1,
stride=1,
padding=0,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=0))))
self.head_reg_list.append(head_reg)
# initialize the anchor points
if self.eval_size:
self.anchor_points, self.stride_tensor = self._generate_anchors()
def forward(self, fpn_feats, export_post_process=True):
assert len(fpn_feats) == len(
self.fpn_stride
), "The size of fpn_feats is not equal to size of fpn_stride"
if self.training:
return self.forward_train(fpn_feats)
else:
return self.forward_eval(
fpn_feats, export_post_process=export_post_process)
def forward_train(self, fpn_feats):
cls_logits_list, bboxes_reg_list = [], []
for i, fpn_feat in enumerate(fpn_feats):
conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat, i)
if self.conv_feat.share_cls_reg:
cls_logits = self.head_cls_list[i](conv_cls_feat)
cls_score, bbox_pred = paddle.split(
cls_logits,
[self.cls_out_channels, 4 * (self.reg_max + 1)],
axis=1)
else:
cls_score = self.head_cls_list[i](conv_cls_feat)
bbox_pred = self.head_reg_list[i](conv_reg_feat)
if self.dgqp_module:
quality_score = self.dgqp_module(bbox_pred)
cls_score = F.sigmoid(cls_score) * quality_score
cls_logits_list.append(cls_score)
bboxes_reg_list.append(bbox_pred)
return (cls_logits_list, bboxes_reg_list)
def forward_eval(self, fpn_feats, export_post_process=True):
if self.eval_size:
anchor_points, stride_tensor = self.anchor_points, self.stride_tensor
else:
anchor_points, stride_tensor = self._generate_anchors(fpn_feats)
cls_logits_list, bboxes_reg_list = [], []
for i, fpn_feat in enumerate(fpn_feats):
conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat, i)
if self.conv_feat.share_cls_reg:
cls_logits = self.head_cls_list[i](conv_cls_feat)
cls_score, bbox_pred = paddle.split(
cls_logits,
[self.cls_out_channels, 4 * (self.reg_max + 1)],
axis=1)
else:
cls_score = self.head_cls_list[i](conv_cls_feat)
bbox_pred = self.head_reg_list[i](conv_reg_feat)
if self.dgqp_module:
quality_score = self.dgqp_module(bbox_pred)
cls_score = F.sigmoid(cls_score) * quality_score
if not export_post_process:
# Now only supports batch size = 1 in deploy
# TODO(ygh): support batch size > 1
cls_score_out = F.sigmoid(cls_score).reshape(
[1, self.cls_out_channels, -1]).transpose([0, 2, 1])
bbox_pred = bbox_pred.reshape([1, (self.reg_max + 1) * 4,
-1]).transpose([0, 2, 1])
else:
_, _, h, w = fpn_feat.shape
l = h * w
cls_score_out = F.sigmoid(
cls_score.reshape([-1, self.cls_out_channels, l]))
bbox_pred = bbox_pred.transpose([0, 2, 3, 1])
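                # distribution_project (an Integral module from gfl_head)
                # turns the (reg_max + 1)-bin distribution predicted for each
                # box edge into its expected distance value.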
bbox_pred = self.distribution_project(bbox_pred)
bbox_pred = bbox_pred.reshape([-1, l, 4])
cls_logits_list.append(cls_score_out)
bboxes_reg_list.append(bbox_pred)
if export_post_process:
cls_logits_list = paddle.concat(cls_logits_list, axis=-1)
bboxes_reg_list = paddle.concat(bboxes_reg_list, axis=1)
bboxes_reg_list = batch_distance2bbox(anchor_points,
bboxes_reg_list)
bboxes_reg_list *= stride_tensor
return (cls_logits_list, bboxes_reg_list)
def _generate_anchors(self, feats=None):
        # only used at eval time
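        # Anchor points are the (x, y) grid-cell coordinates of every FPN
        # level, shifted by cell_offset; stride_tensor records the per-point
        # stride used later to rescale decoded distances to input resolution.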
anchor_points = []
stride_tensor = []
for i, stride in enumerate(self.fpn_stride):
if feats is not None:
_, _, h, w = feats[i].shape
else:
h = math.ceil(self.eval_size[0] / stride)
w = math.ceil(self.eval_size[1] / stride)
shift_x = paddle.arange(end=w) + self.cell_offset
shift_y = paddle.arange(end=h) + self.cell_offset
shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)
anchor_point = paddle.cast(
paddle.stack(
[shift_x, shift_y], axis=-1), dtype='float32')
anchor_points.append(anchor_point.reshape([-1, 2]))
stride_tensor.append(
paddle.full(
[h * w, 1], stride, dtype='float32'))
anchor_points = paddle.concat(anchor_points)
stride_tensor = paddle.concat(stride_tensor)
return anchor_points, stride_tensor
def post_process(self,
head_outs,
scale_factor,
export_nms=True,
nms_cpu=False):
pred_scores, pred_bboxes = head_outs
if not export_nms:
return pred_bboxes, pred_scores
else:
# rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale]
scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1)
scale_factor = paddle.concat(
[scale_x, scale_y, scale_x, scale_y],
axis=-1).reshape([-1, 1, 4])
# scale bbox to origin image size.
pred_bboxes /= scale_factor
if nms_cpu:
paddle.set_device("cpu")
bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)
paddle.set_device(self.device)
else:
bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)
return bbox_pred, bbox_num
@register
class PicoHeadV2(GFLHead):
"""
PicoHeadV2
Args:
conv_feat (object): Instance of 'PicoFeat'
num_classes (int): Number of classes
fpn_stride (list): The stride of each FPN Layer
prior_prob (float): Used to set the bias init for the class prediction layer
loss_class (object): Instance of VariFocalLoss.
loss_dfl (object): Instance of DistributionFocalLoss.
loss_bbox (object): Instance of bbox loss.
assigner (object): Instance of label assigner.
        reg_max: Max value of the integral set :math:`{0, ..., reg_max}`
            in QFL setting. Default: 16.
"""
__inject__ = [
'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox',
'static_assigner', 'assigner', 'nms'
]
__shared__ = ['num_classes', 'eval_size']
def __init__(self,
conv_feat='PicoFeatV2',
dgqp_module=None,
num_classes=80,
fpn_stride=[8, 16, 32],
prior_prob=0.01,
use_align_head=True,
loss_class='VariFocalLoss',
loss_dfl='DistributionFocalLoss',
loss_bbox='GIoULoss',
static_assigner_epoch=60,
static_assigner='ATSSAssigner',
assigner='TaskAlignedAssigner',
reg_max=16,
feat_in_chan=96,
nms=None,
nms_pre=1000,
cell_offset=0,
act='hard_swish',
grid_cell_scale=5.0,
eval_size=None):
super(PicoHeadV2, self).__init__(
conv_feat=conv_feat,
dgqp_module=dgqp_module,
num_classes=num_classes,
fpn_stride=fpn_stride,
prior_prob=prior_prob,
loss_class=loss_class,
loss_dfl=loss_dfl,
loss_bbox=loss_bbox,
reg_max=reg_max,
feat_in_chan=feat_in_chan,
nms=nms,
nms_pre=nms_pre,
cell_offset=cell_offset, )
self.conv_feat = conv_feat
self.num_classes = num_classes
self.fpn_stride = fpn_stride
self.prior_prob = prior_prob
self.loss_vfl = loss_class
self.loss_dfl = loss_dfl
self.loss_bbox = loss_bbox
self.static_assigner_epoch = static_assigner_epoch
self.static_assigner = static_assigner
self.assigner = assigner
self.reg_max = reg_max
self.feat_in_chan = feat_in_chan
self.nms = nms
self.nms_pre = nms_pre
self.cell_offset = cell_offset
self.act = act
self.grid_cell_scale = grid_cell_scale
self.use_align_head = use_align_head
self.cls_out_channels = self.num_classes
self.eval_size = eval_size
bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob)
# Clear the super class initialization
self.gfl_head_cls = None
self.gfl_head_reg = None
self.scales_regs = None
self.head_cls_list = nn.LayerList()
self.head_reg_list = nn.LayerList()
self.cls_align = nn.LayerList()
for i in range(len(fpn_stride)):
head_cls = self.add_sublayer(
"head_cls" + str(i),
nn.Conv2D(
in_channels=self.feat_in_chan,
out_channels=self.cls_out_channels,
kernel_size=1,
stride=1,
padding=0,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(
initializer=Constant(value=bias_init_value))))
self.head_cls_list.append(head_cls)
head_reg = self.add_sublayer(
"head_reg" + str(i),
nn.Conv2D(
in_channels=self.feat_in_chan,
out_channels=4 * (self.reg_max + 1),
kernel_size=1,
stride=1,
padding=0,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=0))))
self.head_reg_list.append(head_reg)
if self.use_align_head:
self.cls_align.append(
DPModule(
self.feat_in_chan,
1,
5,
act=self.act,
use_act_in_out=False))
# initialize the anchor points
if self.eval_size:
self.anchor_points, self.stride_tensor = self._generate_anchors()
def forward(self, fpn_feats, export_post_process=True):
assert len(fpn_feats) == len(
self.fpn_stride
), "The size of fpn_feats is not equal to size of fpn_stride"
if self.training:
return self.forward_train(fpn_feats)
else:
return self.forward_eval(
fpn_feats, export_post_process=export_post_process)
def forward_train(self, fpn_feats):
cls_score_list, reg_list, box_list = [], [], []
for i, (fpn_feat, stride) in enumerate(zip(fpn_feats, self.fpn_stride)):
b, _, h, w = get_static_shape(fpn_feat)
# task decomposition
conv_cls_feat, se_feat = self.conv_feat(fpn_feat, i)
cls_logit = self.head_cls_list[i](se_feat)
reg_pred = self.head_reg_list[i](se_feat)
# cls prediction and alignment
if self.use_align_head:
cls_prob = F.sigmoid(self.cls_align[i](conv_cls_feat))
cls_score = (F.sigmoid(cls_logit) * cls_prob + eps).sqrt()
else:
cls_score = F.sigmoid(cls_logit)
cls_score_out = cls_score.transpose([0, 2, 3, 1])
bbox_pred = reg_pred.transpose([0, 2, 3, 1])
b, cell_h, cell_w, _ = cls_score_out.shape
y, x = self.get_single_level_center_point(
[cell_h, cell_w], stride, cell_offset=self.cell_offset)
center_points = paddle.stack([x, y], axis=-1)
cls_score_out = cls_score_out.reshape(
[b, -1, self.cls_out_channels])
bbox_pred = self.distribution_project(bbox_pred) * stride
bbox_pred = bbox_pred.reshape([b, cell_h * cell_w, 4])
bbox_pred = batch_distance2bbox(
center_points, bbox_pred, max_shapes=None)
cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1]))
reg_list.append(reg_pred.flatten(2).transpose([0, 2, 1]))
box_list.append(bbox_pred / stride)
cls_score_list = paddle.concat(cls_score_list, axis=1)
box_list = paddle.concat(box_list, axis=1)
reg_list = paddle.concat(reg_list, axis=1)
return cls_score_list, reg_list, box_list, fpn_feats
def forward_eval(self, fpn_feats, export_post_process=True):
if self.eval_size:
anchor_points, stride_tensor = self.anchor_points, self.stride_tensor
else:
anchor_points, stride_tensor = self._generate_anchors(fpn_feats)
cls_score_list, box_list = [], []
for i, (fpn_feat, stride) in enumerate(zip(fpn_feats, self.fpn_stride)):
_, _, h, w = fpn_feat.shape
# task decomposition
conv_cls_feat, se_feat = self.conv_feat(fpn_feat, i)
cls_logit = self.head_cls_list[i](se_feat)
reg_pred = self.head_reg_list[i](se_feat)
# cls prediction and alignment
if self.use_align_head:
cls_prob = F.sigmoid(self.cls_align[i](conv_cls_feat))
cls_score = (F.sigmoid(cls_logit) * cls_prob + eps).sqrt()
else:
cls_score = F.sigmoid(cls_logit)
if not export_post_process:
# Now only supports batch size = 1 in deploy
cls_score_list.append(
cls_score.reshape([1, self.cls_out_channels, -1]).transpose(
[0, 2, 1]))
box_list.append(
reg_pred.reshape([1, (self.reg_max + 1) * 4, -1]).transpose(
[0, 2, 1]))
else:
l = h * w
cls_score_out = cls_score.reshape(
[-1, self.cls_out_channels, l])
bbox_pred = reg_pred.transpose([0, 2, 3, 1])
bbox_pred = self.distribution_project(bbox_pred)
bbox_pred = bbox_pred.reshape([-1, l, 4])
cls_score_list.append(cls_score_out)
box_list.append(bbox_pred)
if export_post_process:
cls_score_list = paddle.concat(cls_score_list, axis=-1)
box_list = paddle.concat(box_list, axis=1)
box_list = batch_distance2bbox(anchor_points, box_list)
box_list *= stride_tensor
return cls_score_list, box_list
def get_loss(self, head_outs, gt_meta):
pred_scores, pred_regs, pred_bboxes, fpn_feats = head_outs
gt_labels = gt_meta['gt_class']
gt_bboxes = gt_meta['gt_bbox']
gt_scores = gt_meta['gt_score'] if 'gt_score' in gt_meta else None
num_imgs = gt_meta['im_id'].shape[0]
pad_gt_mask = gt_meta['pad_gt_mask']
anchors, _, num_anchors_list, stride_tensor_list = generate_anchors_for_grid_cell(
fpn_feats, self.fpn_stride, self.grid_cell_scale, self.cell_offset)
centers = bbox_center(anchors)
# label assignment
if gt_meta['epoch_id'] < self.static_assigner_epoch:
assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner(
anchors,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index=self.num_classes,
gt_scores=gt_scores,
pred_bboxes=pred_bboxes.detach() * stride_tensor_list)
else:
assigned_labels, assigned_bboxes, assigned_scores = self.assigner(
pred_scores.detach(),
pred_bboxes.detach() * stride_tensor_list,
centers,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index=self.num_classes,
gt_scores=gt_scores)
assigned_bboxes /= stride_tensor_list
centers_shape = centers.shape
flatten_centers = centers.expand(
[num_imgs, centers_shape[0], centers_shape[1]]).reshape([-1, 2])
flatten_strides = stride_tensor_list.expand(
[num_imgs, centers_shape[0], 1]).reshape([-1, 1])
flatten_cls_preds = pred_scores.reshape([-1, self.num_classes])
flatten_regs = pred_regs.reshape([-1, 4 * (self.reg_max + 1)])
flatten_bboxes = pred_bboxes.reshape([-1, 4])
flatten_bbox_targets = assigned_bboxes.reshape([-1, 4])
flatten_labels = assigned_labels.reshape([-1])
flatten_assigned_scores = assigned_scores.reshape(
[-1, self.num_classes])
pos_inds = paddle.nonzero(
paddle.logical_and((flatten_labels >= 0),
(flatten_labels < self.num_classes)),
as_tuple=False).squeeze(1)
num_total_pos = len(pos_inds)
if num_total_pos > 0:
pos_bbox_targets = paddle.gather(
flatten_bbox_targets, pos_inds, axis=0)
pos_decode_bbox_pred = paddle.gather(
flatten_bboxes, pos_inds, axis=0)
pos_reg = paddle.gather(flatten_regs, pos_inds, axis=0)
pos_strides = paddle.gather(flatten_strides, pos_inds, axis=0)
pos_centers = paddle.gather(
flatten_centers, pos_inds, axis=0) / pos_strides
weight_targets = flatten_assigned_scores.detach()
weight_targets = paddle.gather(
weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0)
pred_corners = pos_reg.reshape([-1, self.reg_max + 1])
target_corners = bbox2distance(pos_centers, pos_bbox_targets,
self.reg_max).reshape([-1])
# regression loss
loss_bbox = paddle.sum(
self.loss_bbox(pos_decode_bbox_pred,
pos_bbox_targets) * weight_targets)
# dfl loss
loss_dfl = self.loss_dfl(
pred_corners,
target_corners,
weight=weight_targets.expand([-1, 4]).reshape([-1]),
avg_factor=4.0)
else:
loss_bbox = paddle.zeros([])
loss_dfl = paddle.zeros([])
avg_factor = flatten_assigned_scores.sum()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.all_reduce(avg_factor)
avg_factor = paddle.clip(
avg_factor / paddle.distributed.get_world_size(), min=1)
loss_vfl = self.loss_vfl(
flatten_cls_preds, flatten_assigned_scores, avg_factor=avg_factor)
loss_bbox = loss_bbox / avg_factor
loss_dfl = loss_dfl / avg_factor
loss_states = dict(
loss_vfl=loss_vfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl)
return loss_states
def _generate_anchors(self, feats=None):
        # only used at eval time
anchor_points = []
stride_tensor = []
for i, stride in enumerate(self.fpn_stride):
if feats is not None:
_, _, h, w = feats[i].shape
else:
h = math.ceil(self.eval_size[0] / stride)
w = math.ceil(self.eval_size[1] / stride)
shift_x = paddle.arange(end=w) + self.cell_offset
shift_y = paddle.arange(end=h) + self.cell_offset
shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)
anchor_point = paddle.cast(
paddle.stack(
[shift_x, shift_y], axis=-1), dtype='float32')
anchor_points.append(anchor_point.reshape([-1, 2]))
stride_tensor.append(
paddle.full(
[h * w, 1], stride, dtype='float32'))
anchor_points = paddle.concat(anchor_points)
stride_tensor = paddle.concat(stride_tensor)
return anchor_points, stride_tensor
def post_process(self,
head_outs,
scale_factor,
export_nms=True,
nms_cpu=False):
pred_scores, pred_bboxes = head_outs
if not export_nms:
return pred_bboxes, pred_scores
else:
# rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale]
scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1)
scale_factor = paddle.concat(
[scale_x, scale_y, scale_x, scale_y],
axis=-1).reshape([-1, 1, 4])
# scale bbox to origin image size.
pred_bboxes /= scale_factor
bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)
return bbox_pred, bbox_num
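# Illustrative sketch (assumes `paddle` is imported at the top of this file):
# the ltrb decode used by this head turns anchor-centered distances into xyxy
# boxes, which is what `batch_distance2bbox` is used for above.
def _demo_distance_decode():
    points = paddle.to_tensor([[4., 4.]])        # one anchor point (x, y)
    ltrb = paddle.to_tensor([[1., 2., 3., 4.]])  # left, top, right, bottom
    lt, rb = paddle.split(ltrb, 2, axis=-1)
    x1y1 = points - lt  # top-left corner
    x2y2 = points + rb  # bottom-right corner
    return paddle.concat([x1y1, x2y2], axis=-1)  # [[3., 2., 7., 8.]]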
================================================
FILE: ppdet/modeling/heads/ppyoloe_contrast_head.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ..initializer import bias_init_with_prob, constant_
from ..assigners.utils import generate_anchors_for_grid_cell
from ppdet.modeling.heads.ppyoloe_head import PPYOLOEHead
__all__ = ['PPYOLOEContrastHead']
@register
class PPYOLOEContrastHead(PPYOLOEHead):
__shared__ = [
'num_classes', 'eval_size', 'trt', 'exclude_nms',
'exclude_post_process', 'use_shared_conv', 'for_distill'
]
__inject__ = ['static_assigner', 'assigner', 'nms', 'contrast_loss']
def __init__(self,
in_channels=[1024, 512, 256],
num_classes=80,
act='swish',
fpn_strides=(32, 16, 8),
grid_cell_scale=5.0,
grid_cell_offset=0.5,
reg_max=16,
reg_range=None,
static_assigner_epoch=4,
use_varifocal_loss=True,
static_assigner='ATSSAssigner',
assigner='TaskAlignedAssigner',
contrast_loss='SupContrast',
nms='MultiClassNMS',
eval_size=None,
loss_weight={
'class': 1.0,
'iou': 2.5,
'dfl': 0.5,
},
trt=False,
attn_conv='convbn',
exclude_nms=False,
exclude_post_process=False,
use_shared_conv=True,
for_distill=False):
super().__init__(in_channels, num_classes, act, fpn_strides,
grid_cell_scale, grid_cell_offset, reg_max, reg_range,
static_assigner_epoch, use_varifocal_loss,
static_assigner, assigner, nms, eval_size, loss_weight,
trt, attn_conv, exclude_nms, exclude_post_process,
use_shared_conv, for_distill)
        assert len(in_channels) > 0, "len(in_channels) should be > 0"
self.contrast_loss = contrast_loss
self.contrast_encoder = nn.LayerList()
for in_c in self.in_channels:
self.contrast_encoder.append(nn.Conv2D(in_c, 128, 3, padding=1))
self._init_contrast_encoder()
def _init_contrast_encoder(self):
bias_en = bias_init_with_prob(0.01)
for en_ in self.contrast_encoder:
constant_(en_.weight)
constant_(en_.bias, bias_en)
def forward_train(self, feats, targets, aux_pred=None):
anchors, anchor_points, num_anchors_list, stride_tensor = \
generate_anchors_for_grid_cell(
feats, self.fpn_strides, self.grid_cell_scale,
self.grid_cell_offset)
cls_score_list, reg_distri_list = [], []
contrast_encoder_list = []
for i, feat in enumerate(feats):
avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))
cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +
feat)
reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
contrast_logit = self.contrast_encoder[i](self.stem_cls[i](
feat, avg_feat) + feat)
contrast_encoder_list.append(
contrast_logit.flatten(2).transpose([0, 2, 1]))
# cls and reg
cls_score = F.sigmoid(cls_logit)
cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1]))
reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1]))
cls_score_list = paddle.concat(cls_score_list, axis=1)
reg_distri_list = paddle.concat(reg_distri_list, axis=1)
contrast_encoder_list = paddle.concat(contrast_encoder_list, axis=1)
return self.get_loss([
cls_score_list, reg_distri_list, contrast_encoder_list, anchors,
anchor_points, num_anchors_list, stride_tensor
], targets)
def get_loss(self, head_outs, gt_meta):
pred_scores, pred_distri, pred_contrast_encoder, anchors,\
anchor_points, num_anchors_list, stride_tensor = head_outs
anchor_points_s = anchor_points / stride_tensor
pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri)
gt_labels = gt_meta['gt_class']
gt_bboxes = gt_meta['gt_bbox']
pad_gt_mask = gt_meta['pad_gt_mask']
# label assignment
if gt_meta['epoch_id'] < self.static_assigner_epoch:
assigned_labels, assigned_bboxes, assigned_scores = \
self.static_assigner(
anchors,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index=self.num_classes,
pred_bboxes=pred_bboxes.detach() * stride_tensor)
alpha_l = 0.25
else:
if self.sm_use:
assigned_labels, assigned_bboxes, assigned_scores = \
self.assigner(
pred_scores.detach(),
pred_bboxes.detach() * stride_tensor,
anchor_points,
stride_tensor,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index=self.num_classes)
else:
assigned_labels, assigned_bboxes, assigned_scores = \
self.assigner(
pred_scores.detach(),
pred_bboxes.detach() * stride_tensor,
anchor_points,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index=self.num_classes)
alpha_l = -1
# rescale bbox
assigned_bboxes /= stride_tensor
# cls loss
if self.use_varifocal_loss:
one_hot_label = F.one_hot(assigned_labels,
self.num_classes + 1)[..., :-1]
loss_cls = self._varifocal_loss(pred_scores, assigned_scores,
one_hot_label)
else:
loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l)
assigned_scores_sum = assigned_scores.sum()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.all_reduce(assigned_scores_sum)
assigned_scores_sum /= paddle.distributed.get_world_size()
assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.)
loss_cls /= assigned_scores_sum
loss_l1, loss_iou, loss_dfl = \
self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s,
assigned_labels, assigned_bboxes, assigned_scores,
assigned_scores_sum)
# contrast loss
loss_contrast = self.contrast_loss(pred_contrast_encoder.reshape([-1, pred_contrast_encoder.shape[-1]]), \
assigned_labels.reshape([-1]), assigned_scores.max(-1).reshape([-1]))
loss = self.loss_weight['class'] * loss_cls + \
self.loss_weight['iou'] * loss_iou + \
self.loss_weight['dfl'] * loss_dfl + \
self.loss_weight['contrast'] * loss_contrast
out_dict = {
'loss': loss,
'loss_cls': loss_cls,
'loss_iou': loss_iou,
'loss_dfl': loss_dfl,
'loss_l1': loss_l1,
'loss_contrast': loss_contrast
}
return out_dict
================================================
FILE: ppdet/modeling/heads/ppyoloe_head.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from paddle import ParamAttr
from paddle.nn.initializer import KaimingNormal
from paddle.nn.initializer import Normal, Constant
from ..bbox_utils import batch_distance2bbox
from ..losses import GIoULoss
from ..initializer import bias_init_with_prob, constant_, normal_
from ..assigners.utils import generate_anchors_for_grid_cell
from ppdet.modeling.backbones.cspresnet import ConvBNLayer, RepVggBlock
from ppdet.modeling.ops import get_static_shape, get_act_fn
from ppdet.modeling.layers import MultiClassNMS
__all__ = ['PPYOLOEHead', 'SimpleConvHead']
class ESEAttn(nn.Layer):
def __init__(self, feat_channels, act='swish', attn_conv='convbn'):
super(ESEAttn, self).__init__()
self.fc = nn.Conv2D(feat_channels, feat_channels, 1)
if attn_conv == 'convbn':
self.conv = ConvBNLayer(feat_channels, feat_channels, 1, act=act)
elif attn_conv == 'repvgg':
self.conv = RepVggBlock(feat_channels, feat_channels, act=act)
else:
self.conv = None
self._init_weights()
def _init_weights(self):
normal_(self.fc.weight, std=0.001)
def forward(self, feat, avg_feat):
weight = F.sigmoid(self.fc(avg_feat))
if self.conv:
return self.conv(feat * weight)
else:
return feat * weight
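# Illustrative sketch of the ESEAttn contract (names local to this sketch):
# the 1x1 `fc` over the pooled feature produces a per-channel sigmoid gate
# that reweights the full-resolution feature map.
def _demo_ese_attn():
    attn = ESEAttn(feat_channels=64)
    feat = paddle.randn([1, 64, 8, 8])
    avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))  # [1, 64, 1, 1] gate input
    return attn(feat, avg_feat)  # same shape as feat: [1, 64, 8, 8]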
@register
class PPYOLOEHead(nn.Layer):
__shared__ = [
'num_classes', 'eval_size', 'trt', 'exclude_nms',
'exclude_post_process', 'use_shared_conv', 'for_distill'
]
__inject__ = ['static_assigner', 'assigner', 'nms']
def __init__(self,
in_channels=[1024, 512, 256],
num_classes=80,
act='swish',
fpn_strides=(32, 16, 8),
grid_cell_scale=5.0,
grid_cell_offset=0.5,
reg_max=16,
reg_range=None,
static_assigner_epoch=4,
use_varifocal_loss=True,
static_assigner='ATSSAssigner',
assigner='TaskAlignedAssigner',
nms='MultiClassNMS',
eval_size=None,
loss_weight={
'class': 1.0,
'iou': 2.5,
'dfl': 0.5,
},
trt=False,
attn_conv='convbn',
exclude_nms=False,
exclude_post_process=False,
use_shared_conv=True,
for_distill=False):
super(PPYOLOEHead, self).__init__()
        assert len(in_channels) > 0, "len(in_channels) should be > 0"
self.in_channels = in_channels
self.num_classes = num_classes
self.fpn_strides = fpn_strides
self.grid_cell_scale = grid_cell_scale
self.grid_cell_offset = grid_cell_offset
if reg_range:
self.sm_use = True
self.reg_range = reg_range
else:
self.sm_use = False
self.reg_range = (0, reg_max + 1)
self.reg_channels = self.reg_range[1] - self.reg_range[0]
self.iou_loss = GIoULoss()
self.loss_weight = loss_weight
self.use_varifocal_loss = use_varifocal_loss
self.eval_size = eval_size
self.static_assigner_epoch = static_assigner_epoch
self.static_assigner = static_assigner
self.assigner = assigner
self.nms = nms
if isinstance(self.nms, MultiClassNMS) and trt:
self.nms.trt = trt
self.exclude_nms = exclude_nms
self.exclude_post_process = exclude_post_process
self.use_shared_conv = use_shared_conv
self.for_distill = for_distill
self.is_teacher = False
# stem
self.stem_cls = nn.LayerList()
self.stem_reg = nn.LayerList()
act = get_act_fn(
act, trt=trt) if act is None or isinstance(act,
(str, dict)) else act
for in_c in self.in_channels:
self.stem_cls.append(ESEAttn(in_c, act=act, attn_conv=attn_conv))
self.stem_reg.append(ESEAttn(in_c, act=act, attn_conv=attn_conv))
# pred head
self.pred_cls = nn.LayerList()
self.pred_reg = nn.LayerList()
for in_c in self.in_channels:
self.pred_cls.append(
nn.Conv2D(
in_c, self.num_classes, 3, padding=1))
self.pred_reg.append(
nn.Conv2D(
in_c, 4 * self.reg_channels, 3, padding=1))
# projection conv
self.proj_conv = nn.Conv2D(self.reg_channels, 1, 1, bias_attr=False)
self.proj_conv.skip_quant = True
self._init_weights()
if self.for_distill:
self.distill_pairs = {}
@classmethod
def from_config(cls, cfg, input_shape):
return {'in_channels': [i.channels for i in input_shape], }
def _init_weights(self):
bias_cls = bias_init_with_prob(0.01)
for cls_, reg_ in zip(self.pred_cls, self.pred_reg):
constant_(cls_.weight)
constant_(cls_.bias, bias_cls)
constant_(reg_.weight)
constant_(reg_.bias, 1.0)
proj = paddle.linspace(self.reg_range[0], self.reg_range[1] - 1,
self.reg_channels).reshape(
[1, self.reg_channels, 1, 1])
self.proj_conv.weight.set_value(proj)
self.proj_conv.weight.stop_gradient = True
if self.eval_size:
anchor_points, stride_tensor = self._generate_anchors()
self.anchor_points = anchor_points
self.stride_tensor = stride_tensor
def m_avg_pool2d(self, feat, w, h):
batch_size, channels, _, _ = feat.shape
feat_flat = paddle.reshape(feat, [batch_size, channels, -1])
feat_mean = paddle.mean(feat_flat, axis=2)
feat_mean = paddle.reshape(
feat_mean, [batch_size, channels, w, h])
return feat_mean
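    # Note (sketch): m_avg_pool2d is an NPU-friendly global average pool; with
    # w = h = 1 it matches F.adaptive_avg_pool2d(feat, (1, 1)).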
def forward_train(self, feats, targets, aux_pred=None):
anchors, anchor_points, num_anchors_list, stride_tensor = \
generate_anchors_for_grid_cell(
feats, self.fpn_strides, self.grid_cell_scale,
self.grid_cell_offset)
cls_score_list, reg_distri_list = [], []
for i, feat in enumerate(feats):
            if paddle.get_device()[:3] == 'npu':
avg_feat = self.m_avg_pool2d(feat, 1, 1)
else:
avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))
cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +
feat)
reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
# cls and reg
cls_score = F.sigmoid(cls_logit)
cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1]))
reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1]))
cls_score_list = paddle.concat(cls_score_list, axis=1)
reg_distri_list = paddle.concat(reg_distri_list, axis=1)
if targets.get('is_teacher', False):
pred_deltas, pred_dfls = self._bbox_decode_fake(reg_distri_list)
return cls_score_list, pred_deltas * stride_tensor, pred_dfls
if targets.get('get_data', False):
pred_deltas, pred_dfls = self._bbox_decode_fake(reg_distri_list)
return cls_score_list, pred_deltas * stride_tensor, pred_dfls
return self.get_loss([
cls_score_list, reg_distri_list, anchors, anchor_points,
num_anchors_list, stride_tensor
], targets, aux_pred)
def _generate_anchors(self, feats=None, dtype='float32'):
        # only used at eval time
anchor_points = []
stride_tensor = []
for i, stride in enumerate(self.fpn_strides):
if feats is not None:
_, _, h, w = feats[i].shape
else:
h = int(self.eval_size[0] / stride)
w = int(self.eval_size[1] / stride)
shift_x = paddle.arange(end=w) + self.grid_cell_offset
shift_y = paddle.arange(end=h) + self.grid_cell_offset
shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)
anchor_point = paddle.cast(
paddle.stack(
[shift_x, shift_y], axis=-1), dtype=dtype)
anchor_points.append(anchor_point.reshape([-1, 2]))
stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype))
anchor_points = paddle.concat(anchor_points)
stride_tensor = paddle.concat(stride_tensor)
return anchor_points, stride_tensor
def forward_eval(self, feats):
if self.eval_size:
anchor_points, stride_tensor = self.anchor_points, self.stride_tensor
else:
anchor_points, stride_tensor = self._generate_anchors(feats)
cls_score_list, reg_dist_list = [], []
for i, feat in enumerate(feats):
_, _, h, w = feat.shape
l = h * w
            if paddle.device.get_device()[:3] == 'npu':
avg_feat = self.m_avg_pool2d(feat, 1, 1)
else:
avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))
cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +
feat)
reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
reg_dist = reg_dist.reshape(
[-1, 4, self.reg_channels, l]).transpose([0, 2, 3, 1])
if self.use_shared_conv:
reg_dist = self.proj_conv(F.softmax(
reg_dist, axis=1)).squeeze(1)
else:
reg_dist = F.softmax(reg_dist, axis=1)
# cls and reg
cls_score = F.sigmoid(cls_logit)
cls_score_list.append(cls_score.reshape([-1, self.num_classes, l]))
reg_dist_list.append(reg_dist)
cls_score_list = paddle.concat(cls_score_list, axis=-1)
if self.use_shared_conv:
reg_dist_list = paddle.concat(reg_dist_list, axis=1)
else:
reg_dist_list = paddle.concat(reg_dist_list, axis=2)
reg_dist_list = self.proj_conv(reg_dist_list).squeeze(1)
return cls_score_list, reg_dist_list, anchor_points, stride_tensor
def forward(self, feats, targets=None, aux_pred=None):
assert len(feats) == len(self.fpn_strides), \
"The size of feats is not equal to size of fpn_strides"
if self.training:
return self.forward_train(feats, targets, aux_pred)
else:
if targets is not None:
# only for semi-det
self.is_teacher = targets.get('is_teacher', False)
if self.is_teacher:
return self.forward_train(feats, targets, aux_pred=None)
else:
return self.forward_eval(feats)
return self.forward_eval(feats)
@staticmethod
def _focal_loss(score, label, alpha=0.25, gamma=2.0):
weight = (score - label).pow(gamma)
if alpha > 0:
alpha_t = alpha * label + (1 - alpha) * (1 - label)
weight *= alpha_t
loss = F.binary_cross_entropy(
score, label, weight=weight, reduction='sum')
return loss
@staticmethod
def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0):
weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label
loss = F.binary_cross_entropy(
pred_score, gt_score, weight=weight, reduction='sum')
return loss
def _bbox_decode(self, anchor_points, pred_dist):
_, l, _ = get_static_shape(pred_dist)
pred_dist = F.softmax(pred_dist.reshape([-1, l, 4, self.reg_channels]))
pred_dist = self.proj_conv(pred_dist.transpose([0, 3, 1, 2])).squeeze(1)
return batch_distance2bbox(anchor_points, pred_dist)
def _bbox_decode_fake(self, pred_dist):
_, l, _ = get_static_shape(pred_dist)
pred_dist_dfl = F.softmax(
pred_dist.reshape([-1, l, 4, self.reg_channels]))
pred_dist = self.proj_conv(pred_dist_dfl.transpose([0, 3, 1, 2
])).squeeze(1)
return pred_dist, pred_dist_dfl
def _bbox2distance(self, points, bbox):
x1y1, x2y2 = paddle.split(bbox, 2, -1)
lt = points - x1y1
rb = x2y2 - points
return paddle.concat([lt, rb], -1).clip(self.reg_range[0],
self.reg_range[1] - 1 - 0.01)
def _df_loss(self, pred_dist, target, lower_bound=0):
target_left = paddle.cast(target.floor(), 'int64')
target_right = target_left + 1
weight_left = target_right.astype('float32') - target
weight_right = 1 - weight_left
loss_left = F.cross_entropy(
pred_dist, target_left - lower_bound,
reduction='none') * weight_left
loss_right = F.cross_entropy(
pred_dist, target_right - lower_bound,
reduction='none') * weight_right
return (loss_left + loss_right).mean(-1, keepdim=True)
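    # Note (sketch): _df_loss spreads a continuous target over its two
    # neighboring integer bins, e.g. target 2.3 -> cross-entropy against bin 2
    # with weight 0.7 plus cross-entropy against bin 3 with weight 0.3.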
def _bbox_loss(self, pred_dist, pred_bboxes, anchor_points, assigned_labels,
assigned_bboxes, assigned_scores, assigned_scores_sum):
# select positive samples mask
mask_positive = (assigned_labels != self.num_classes)
if self.for_distill:
# only used for LD main_kd distill
self.distill_pairs['mask_positive_select'] = mask_positive
num_pos = mask_positive.sum()
# pos/neg loss
if num_pos > 0:
# l1 + iou
bbox_mask = mask_positive.astype('int32').unsqueeze(-1).tile(
[1, 1, 4]).astype('bool')
pred_bboxes_pos = paddle.masked_select(pred_bboxes,
bbox_mask).reshape([-1, 4])
assigned_bboxes_pos = paddle.masked_select(
assigned_bboxes, bbox_mask).reshape([-1, 4])
bbox_weight = paddle.masked_select(
assigned_scores.sum(-1), mask_positive).unsqueeze(-1)
loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos)
loss_iou = self.iou_loss(pred_bboxes_pos,
assigned_bboxes_pos) * bbox_weight
loss_iou = loss_iou.sum() / assigned_scores_sum
dist_mask = mask_positive.unsqueeze(-1).astype('int32').tile(
[1, 1, self.reg_channels * 4]).astype('bool')
pred_dist_pos = paddle.masked_select(
pred_dist, dist_mask).reshape([-1, 4, self.reg_channels])
assigned_ltrb = self._bbox2distance(anchor_points, assigned_bboxes)
assigned_ltrb_pos = paddle.masked_select(
assigned_ltrb, bbox_mask).reshape([-1, 4])
loss_dfl = self._df_loss(pred_dist_pos, assigned_ltrb_pos,
self.reg_range[0]) * bbox_weight
loss_dfl = loss_dfl.sum() / assigned_scores_sum
if self.for_distill:
self.distill_pairs['pred_bboxes_pos'] = pred_bboxes_pos
self.distill_pairs['pred_dist_pos'] = pred_dist_pos
self.distill_pairs['bbox_weight'] = bbox_weight
else:
loss_l1 = paddle.zeros([])
loss_iou = paddle.zeros([])
loss_dfl = pred_dist.sum() * 0.
return loss_l1, loss_iou, loss_dfl
def get_loss(self, head_outs, gt_meta, aux_pred=None):
pred_scores, pred_distri, anchors,\
anchor_points, num_anchors_list, stride_tensor = head_outs
anchor_points_s = anchor_points / stride_tensor
pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri)
if aux_pred is not None:
pred_scores_aux = aux_pred[0]
pred_bboxes_aux = self._bbox_decode(anchor_points_s, aux_pred[1])
if 'origin_gt_class' in gt_meta:
gt_labels = gt_meta['origin_gt_class']
gt_bboxes = gt_meta['origin_gt_bbox']
pad_gt_mask = gt_meta['pad_origin_gt_mask']
else:
gt_labels = gt_meta['gt_class']
gt_bboxes = gt_meta['gt_bbox']
pad_gt_mask = gt_meta['pad_gt_mask']
# label assignment
if gt_meta['epoch_id'] < self.static_assigner_epoch:
assigned_labels, assigned_bboxes, assigned_scores = \
self.static_assigner(
anchors,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index=self.num_classes,
pred_bboxes=pred_bboxes.detach() * stride_tensor)
alpha_l = 0.25
else:
if self.sm_use:
                # only used in the small-object detection (PPYOLOE-SOD) model
assigned_labels, assigned_bboxes, assigned_scores = \
self.assigner(
pred_scores.detach(),
pred_bboxes.detach() * stride_tensor,
anchor_points,
stride_tensor,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index=self.num_classes)
else:
if aux_pred is None:
if not hasattr(self, "assigned_labels"):
assigned_labels, assigned_bboxes, assigned_scores = \
self.assigner(
pred_scores.detach(),
pred_bboxes.detach() * stride_tensor,
anchor_points,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index=self.num_classes)
if self.for_distill:
self.assigned_labels = assigned_labels
self.assigned_bboxes = assigned_bboxes
self.assigned_scores = assigned_scores
else:
# only used in distill
assigned_labels = self.assigned_labels
assigned_bboxes = self.assigned_bboxes
assigned_scores = self.assigned_scores
else:
assigned_labels, assigned_bboxes, assigned_scores = \
self.assigner(
pred_scores_aux.detach(),
pred_bboxes_aux.detach() * stride_tensor,
anchor_points,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index=self.num_classes)
alpha_l = -1
# rescale bbox
assigned_bboxes /= stride_tensor
assign_out_dict = self.get_loss_from_assign(
pred_scores, pred_distri, pred_bboxes, anchor_points_s,
assigned_labels, assigned_bboxes, assigned_scores, alpha_l)
if aux_pred is not None:
assign_out_dict_aux = self.get_loss_from_assign(
aux_pred[0], aux_pred[1], pred_bboxes_aux, anchor_points_s,
assigned_labels, assigned_bboxes, assigned_scores, alpha_l)
loss = {}
for key in assign_out_dict.keys():
loss[key] = assign_out_dict[key] + assign_out_dict_aux[key]
else:
loss = assign_out_dict
return loss
def get_loss_from_assign(self, pred_scores, pred_distri, pred_bboxes,
anchor_points_s, assigned_labels, assigned_bboxes,
assigned_scores, alpha_l):
# cls loss
if self.use_varifocal_loss:
one_hot_label = F.one_hot(assigned_labels,
self.num_classes + 1)[..., :-1]
loss_cls = self._varifocal_loss(pred_scores, assigned_scores,
one_hot_label)
else:
loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l)
assigned_scores_sum = assigned_scores.sum()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.all_reduce(assigned_scores_sum)
assigned_scores_sum /= paddle.distributed.get_world_size()
assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.)
loss_cls /= assigned_scores_sum
if self.for_distill:
self.distill_pairs['pred_cls_scores'] = pred_scores
self.distill_pairs['pos_num'] = assigned_scores_sum
self.distill_pairs['assigned_scores'] = assigned_scores
one_hot_label = F.one_hot(assigned_labels,
self.num_classes + 1)[..., :-1]
self.distill_pairs['target_labels'] = one_hot_label
loss_l1, loss_iou, loss_dfl = \
self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s,
assigned_labels, assigned_bboxes, assigned_scores,
assigned_scores_sum)
loss = self.loss_weight['class'] * loss_cls + \
self.loss_weight['iou'] * loss_iou + \
self.loss_weight['dfl'] * loss_dfl
out_dict = {
'loss': loss,
'loss_cls': loss_cls,
'loss_iou': loss_iou,
'loss_dfl': loss_dfl,
'loss_l1': loss_l1,
}
return out_dict
def post_process(self, head_outs, scale_factor):
pred_scores, pred_dist, anchor_points, stride_tensor = head_outs
pred_bboxes = batch_distance2bbox(anchor_points, pred_dist)
pred_bboxes *= stride_tensor
if self.exclude_post_process:
return paddle.concat(
[pred_bboxes, pred_scores.transpose([0, 2, 1])],
axis=-1), None, None
else:
            # scale bbox back to the original image size
scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1)
scale_factor = paddle.concat(
[scale_x, scale_y, scale_x, scale_y],
axis=-1).reshape([-1, 1, 4])
pred_bboxes /= scale_factor
if self.exclude_nms:
                # `exclude_nms=True` is only used for benchmarking
return pred_bboxes, pred_scores, None
else:
bbox_pred, bbox_num, nms_keep_idx = self.nms(pred_bboxes,
pred_scores)
return bbox_pred, bbox_num, nms_keep_idx
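# Illustrative sketch: `proj_conv` above implements the DFL expectation
# distance = sum_i softmax(logits)_i * bin_i over the reg_channels bins
# (bins are `paddle.linspace(reg_range[0], reg_range[1] - 1, reg_channels)`).
# Toy values below are local to this sketch.
def _demo_dfl_expectation():
    reg_channels = 17  # reg_max=16 -> bins 0..16
    logits = paddle.randn([reg_channels])
    bins = paddle.arange(reg_channels, dtype='float32')
    return (F.softmax(logits, axis=0) * bins).sum()  # scalar in [0, 16]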
def get_activation(name="LeakyReLU"):
if name == "silu":
module = nn.Silu()
elif name == "relu":
module = nn.ReLU()
elif name in ["LeakyReLU", 'leakyrelu', 'lrelu']:
module = nn.LeakyReLU(0.1)
elif name is None:
module = nn.Identity()
else:
raise AttributeError("Unsupported act type: {}".format(name))
return module
class ConvNormLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
norm_type='gn',
activation="LeakyReLU"):
super(ConvNormLayer, self).__init__()
assert norm_type in ['bn', 'sync_bn', 'syncbn', 'gn', None]
self.conv = nn.Conv2D(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias_attr=False,
weight_attr=ParamAttr(initializer=KaimingNormal()))
if norm_type in ['bn', 'sync_bn', 'syncbn']:
self.norm = nn.BatchNorm2D(out_channels)
elif norm_type == 'gn':
self.norm = nn.GroupNorm(num_groups=32, num_channels=out_channels)
else:
self.norm = None
self.act = get_activation(activation)
def forward(self, x):
y = self.conv(x)
if self.norm is not None:
y = self.norm(y)
y = self.act(y)
return y
class ScaleReg(nn.Layer):
"""
Parameter for scaling the regression outputs.
"""
def __init__(self, scale=1.0):
super(ScaleReg, self).__init__()
scale = paddle.to_tensor(scale)
self.scale = self.create_parameter(
shape=[1],
dtype='float32',
default_initializer=nn.initializer.Assign(scale))
def forward(self, x):
return x * self.scale
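# Illustrative sketch: ScaleReg is a single learnable scalar multiplier.
def _demo_scale_reg():
    s = ScaleReg(scale=2.0)
    return s(paddle.ones([2]))  # [2., 2.]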
@register
class SimpleConvHead(nn.Layer):
__shared__ = ['num_classes']
def __init__(self,
num_classes=80,
feat_in=288,
feat_out=288,
num_convs=1,
fpn_strides=[32, 16, 8, 4],
norm_type='gn',
act='LeakyReLU',
prior_prob=0.01,
reg_max=16):
super(SimpleConvHead, self).__init__()
self.num_classes = num_classes
self.feat_in = feat_in
self.feat_out = feat_out
self.num_convs = num_convs
self.fpn_strides = fpn_strides
self.reg_max = reg_max
self.cls_convs = nn.LayerList()
self.reg_convs = nn.LayerList()
for i in range(self.num_convs):
in_c = feat_in if i == 0 else feat_out
self.cls_convs.append(
ConvNormLayer(
in_c,
feat_out,
3,
stride=1,
padding=1,
norm_type=norm_type,
activation=act))
self.reg_convs.append(
ConvNormLayer(
in_c,
feat_out,
3,
stride=1,
padding=1,
norm_type=norm_type,
activation=act))
bias_cls = bias_init_with_prob(prior_prob)
self.gfl_cls = nn.Conv2D(
feat_out,
self.num_classes,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0.0, std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=bias_cls)))
self.gfl_reg = nn.Conv2D(
feat_out,
4 * (self.reg_max + 1),
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0.0, std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=0)))
self.scales = nn.LayerList()
for i in range(len(self.fpn_strides)):
self.scales.append(ScaleReg(1.0))
def forward(self, feats):
cls_scores = []
bbox_preds = []
for x, scale in zip(feats, self.scales):
cls_feat = x
reg_feat = x
for cls_conv in self.cls_convs:
cls_feat = cls_conv(cls_feat)
for reg_conv in self.reg_convs:
reg_feat = reg_conv(reg_feat)
cls_score = self.gfl_cls(cls_feat)
cls_score = F.sigmoid(cls_score)
cls_score = cls_score.flatten(2).transpose([0, 2, 1])
cls_scores.append(cls_score)
bbox_pred = scale(self.gfl_reg(reg_feat))
bbox_pred = bbox_pred.flatten(2).transpose([0, 2, 1])
bbox_preds.append(bbox_pred)
cls_scores = paddle.concat(cls_scores, axis=1)
bbox_preds = paddle.concat(bbox_preds, axis=1)
return cls_scores, bbox_preds
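# Illustrative sketch of the SimpleConvHead shape contract; the toy sizes
# below are local to this sketch.
def _demo_simple_conv_head():
    head = SimpleConvHead(num_classes=3, feat_in=32, feat_out=32,
                          num_convs=1, fpn_strides=[8])
    feats = [paddle.randn([1, 32, 16, 16])]
    cls_scores, bbox_preds = head(feats)
    # cls_scores: [1, 256, 3]; bbox_preds: [1, 256, 4 * (reg_max + 1)] = [1, 256, 68]
    return cls_scores.shape, bbox_preds.shape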
================================================
FILE: ppdet/modeling/heads/ppyoloe_ins_head.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ppdet.modeling.backbones.csp_darknet import BaseConv
from ppdet.modeling.layers import MultiClassNMS
from ppdet.modeling.ops import get_static_shape, get_act_fn
from .ppyoloe_head import ESEAttn
from ..assigners.utils import generate_anchors_for_grid_cell
from ..bbox_utils import batch_distance2bbox
from ..initializer import bias_init_with_prob, constant_
from ..losses import GIoULoss
__all__ = ['PPYOLOEInsHead']
def custom_binary_cross_entropy_with_logits(x, y):
max_val = paddle.maximum(-x, paddle.to_tensor(0.0))
loss = (1 - y) * x + max_val + paddle.log(
paddle.exp(-max_val) + paddle.exp(-x - max_val))
return loss
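# Illustrative check (sketch): the helper above is the standard numerically
# stable BCE-with-logits identity, loss = (1 - y) * x + log(1 + exp(-x)),
# written with a max(-x, 0) shift; it should agree with the framework op.
def _demo_custom_bce():
    x = paddle.to_tensor([-2.0, 0.0, 3.0])
    y = paddle.to_tensor([0.0, 1.0, 1.0])
    a = custom_binary_cross_entropy_with_logits(x, y)
    b = F.binary_cross_entropy_with_logits(x, y, reduction='none')
    return paddle.allclose(a, b)  # expected: True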
class MaskProto(nn.Layer):
# YOLOv8 mask Proto module for instance segmentation models
def __init__(self, ch_in, num_protos=256, num_masks=32, act='silu'):
super().__init__()
self.conv1 = BaseConv(ch_in, num_protos, 3, 1, act=act)
self.upsample = nn.Conv2DTranspose(num_protos,
num_protos,
2,
2,
0,
bias_attr=True)
self.conv2 = BaseConv(num_protos, num_protos, 3, 1, act=act)
self.conv3 = BaseConv(num_protos, num_masks, 1, 1, act=act)
def forward(self, x):
return self.conv3(self.conv2(self.upsample(self.conv1(x))))
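# Illustrative sketch: MaskProto doubles the spatial size (the 2x transposed
# conv) and maps ch_in -> num_masks channels; toy sizes local to this sketch.
def _demo_mask_proto():
    proto = MaskProto(ch_in=64, num_protos=32, num_masks=8)
    x = paddle.randn([1, 64, 20, 20])
    return proto(x).shape  # [1, 8, 40, 40]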
def xyxy2xywh(x):
"""
Convert bounding box coordinates from (x1, y1, x2, y2) format to (x, y, width, height) format where (x1, y1) is the
top-left corner and (x2, y2) is the bottom-right corner.
"""
assert x.shape[
-1] == 4, f'input shape last dimension expected 4 but input shape is {x.shape}'
y = paddle.empty_like(x) if isinstance(
x, paddle.Tensor) else np.empty_like(x) # faster than clone/copy
y[..., 0] = (x[..., 0] + x[..., 2]) / 2 # x center
y[..., 1] = (x[..., 1] + x[..., 3]) / 2 # y center
y[..., 2] = x[..., 2] - x[..., 0] # width
y[..., 3] = x[..., 3] - x[..., 1] # height
return y
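# Illustrative sketch: corner-to-center conversion on a single box.
def _demo_xyxy2xywh():
    box = paddle.to_tensor([[10., 20., 30., 60.]])  # x1, y1, x2, y2
    return xyxy2xywh(box)  # [[20., 40., 20., 40.]] -> cx, cy, w, h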
def crop_mask(masks, boxes):
"""
It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box
Args:
        masks (paddle.Tensor): [n, h, w] tensor of masks
boxes (paddle.Tensor): [n, 4] tensor of bbox coordinates in relative point form
Returns:
(paddle.Tensor): The masks are being cropped to the bounding box.
"""
_, h, w = masks.shape
x1, y1, x2, y2 = paddle.chunk(boxes[:, :, None], 4, axis=1)
r = paddle.arange(w, dtype=x1.dtype)[None, None, :]
c = paddle.arange(h, dtype=y1.dtype)[None, :, None]
if "npu" in paddle.device.get_all_custom_device_type():
        # bool tensor broadcast multiply is extremely slow on npu, so we cast it to float32.
m_dtype = masks.dtype
return masks * ((r >= x1).cast(m_dtype) * (r < x2).cast(m_dtype) *
(c >= y1).cast(m_dtype) * (c < y2).cast(m_dtype))
else:
return masks * ((r >= x1) * (r < x2) * (c >= y1) *
(c < y2)).astype(masks.dtype)
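# Illustrative sketch: crop_mask zeroes everything outside each box; here a
# single all-ones 4x4 mask is cropped to its 2x2 interior.
def _demo_crop_mask():
    masks = paddle.ones([1, 4, 4])
    boxes = paddle.to_tensor([[1., 1., 3., 3.]])  # x1, y1, x2, y2
    return crop_mask(masks, boxes).sum()  # expected: 4.0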
def process_mask_upsample(protos, masks_in, bboxes, shape):
"""
It takes the output of the mask head, and applies the mask to the bounding boxes. This produces masks of higher
quality but is slower.
Args:
protos (paddle.Tensor): [mask_dim, mask_h, mask_w]
masks_in (paddle.Tensor): [n, mask_dim], n is number of masks after nms
bboxes (paddle.Tensor): [n, 4], n is number of masks after nms
shape (tuple): the size of the input image (h,w)
Returns:
(paddle.Tensor): The upsampled masks.
"""
c, mh, mw = protos.shape # CHW
masks = F.sigmoid(masks_in @ protos.reshape([c, -1])).reshape([-1, mh, mw])
masks = F.interpolate(masks[None],
shape,
mode='bilinear',
align_corners=False)[0] # CHW
masks = crop_mask(masks, bboxes) # CHW
return masks
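# Illustrative sketch of the shape flow through process_mask_upsample; the
# toy sizes below are local to this sketch.
def _demo_process_mask_upsample():
    protos = paddle.randn([32, 40, 40])                  # [mask_dim, mask_h, mask_w]
    masks_in = paddle.randn([3, 32])                     # 3 instances kept after NMS
    bboxes = paddle.to_tensor([[0., 0., 80., 80.]] * 3)  # boxes in output coords
    out = process_mask_upsample(protos, masks_in, bboxes, shape=(160, 160))
    return out.shape  # [3, 160, 160]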
@register
class PPYOLOEInsHead(nn.Layer):
__shared__ = [
'num_classes', 'eval_size', 'trt', 'exclude_nms',
'exclude_post_process', 'use_shared_conv', 'for_distill', 'width_mult'
]
__inject__ = ['static_assigner', 'assigner', 'nms']
def __init__(self,
in_channels=[1024, 512, 256],
num_classes=80,
act='swish',
fpn_strides=(32, 16, 8),
grid_cell_scale=5.0,
grid_cell_offset=0.5,
reg_max=16,
reg_range=None,
static_assigner_epoch=4,
use_varifocal_loss=True,
static_assigner='ATSSAssigner',
assigner='TaskAlignedAssigner',
nms='MultiClassNMS',
eval_size=None,
loss_weight={
'class': 1.0,
'iou': 2.5,
'dfl': 0.5,
},
trt=False,
attn_conv='convbn',
exclude_nms=False,
exclude_post_process=False,
use_shared_conv=True,
mask_thr_binary=0.5,
num_masks=32,
num_protos=256,
width_mult=1.0,
for_distill=False):
super(PPYOLOEInsHead, self).__init__()
        assert len(in_channels) > 0, "len(in_channels) should be > 0"
self.mask_thr_binary = mask_thr_binary
self.num_masks = num_masks
self.num_protos = int(num_protos * width_mult)
self.in_channels = in_channels
self.num_classes = num_classes
self.fpn_strides = fpn_strides
self.grid_cell_scale = grid_cell_scale
self.grid_cell_offset = grid_cell_offset
if reg_range:
self.sm_use = True
self.reg_range = reg_range
else:
self.sm_use = False
self.reg_range = (0, reg_max + 1)
self.reg_channels = self.reg_range[1] - self.reg_range[0]
self.iou_loss = GIoULoss()
self.loss_weight = loss_weight
self.use_varifocal_loss = use_varifocal_loss
self.eval_size = eval_size
self.static_assigner_epoch = static_assigner_epoch
self.static_assigner = static_assigner
self.assigner = assigner
self.nms = nms
if isinstance(self.nms, MultiClassNMS) and trt:
self.nms.trt = trt
self.exclude_nms = exclude_nms
self.exclude_post_process = exclude_post_process
self.use_shared_conv = use_shared_conv
self.for_distill = for_distill
self.is_teacher = False
# stem
self.stem_cls = nn.LayerList()
self.stem_reg = nn.LayerList()
self.stem_ins = nn.LayerList()
act = get_act_fn(
act, trt=trt) if act is None or isinstance(act,
(str, dict)) else act
for in_c in self.in_channels:
self.stem_cls.append(ESEAttn(in_c, act=act, attn_conv=attn_conv))
self.stem_reg.append(ESEAttn(in_c, act=act, attn_conv=attn_conv))
self.stem_ins.append(ESEAttn(in_c, act=act, attn_conv=attn_conv))
# pred head
self.pred_cls = nn.LayerList()
self.pred_reg = nn.LayerList()
self.pred_ins = nn.LayerList()
for in_c in self.in_channels:
self.pred_cls.append(
nn.Conv2D(in_c, self.num_classes, 3, padding=1))
self.pred_reg.append(
nn.Conv2D(in_c, 4 * self.reg_channels, 3, padding=1))
self.pred_ins.append(nn.Conv2D(in_c, self.num_masks, 3, padding=1))
# projection conv
self.proj_conv = nn.Conv2D(self.reg_channels, 1, 1, bias_attr=False)
self.proj_conv.skip_quant = True
self._init_weights()
self.proto = MaskProto(in_channels[-1],
self.num_protos,
self.num_masks,
act=act)
if self.for_distill:
self.distill_pairs = {}
@classmethod
def from_config(cls, cfg, input_shape):
return {
'in_channels': [i.channels for i in input_shape],
}
def _init_weights(self):
bias_cls = bias_init_with_prob(0.01)
for cls_, reg_ in zip(self.pred_cls, self.pred_reg):
constant_(cls_.weight)
constant_(cls_.bias, bias_cls)
constant_(reg_.weight)
constant_(reg_.bias, 1.0)
proj = paddle.linspace(self.reg_range[0], self.reg_range[1] - 1,
self.reg_channels).reshape(
[1, self.reg_channels, 1, 1])
self.proj_conv.weight.set_value(proj)
self.proj_conv.weight.stop_gradient = True
if self.eval_size:
anchor_points, stride_tensor = self._generate_anchors()
self.anchor_points = anchor_points
self.stride_tensor = stride_tensor
def forward_train(self, feats, targets):
anchors, anchor_points, num_anchors_list, stride_tensor = \
generate_anchors_for_grid_cell(
feats, self.fpn_strides, self.grid_cell_scale,
self.grid_cell_offset)
cls_score_list, reg_distri_list = [], []
mask_feat = self.proto(feats[-1])
mask_coeff_list = []
for i, feat in enumerate(feats):
_, _, h, w = feat.shape
l = h * w
if "npu" in paddle.device.get_all_custom_device_type(
            ):  # the avg-pool backward is extremely slow in the NPU kernel, so use mean instead
avg_feat = feat.mean(axis=[2, 3], keepdim=True)
else:
avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))
cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +
feat)
reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
msk_coeff = self.pred_ins[i](self.stem_ins[i](feat, avg_feat) +
feat)
# cls and reg
cls_score = F.sigmoid(cls_logit)
cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1]))
            mask_coeff_list.append(msk_coeff.flatten(2).transpose([0, 2, 1]))
reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1]))
cls_score_list = paddle.concat(cls_score_list, axis=1)
mask_coeff_list = paddle.concat(mask_coeff_list, axis=1)
reg_distri_list = paddle.concat(reg_distri_list, axis=1)
return self.get_loss([
cls_score_list, reg_distri_list, mask_coeff_list, mask_feat,
anchors, anchor_points, num_anchors_list, stride_tensor
], targets)
def _generate_anchors(self, feats=None, dtype='float32'):
        # only used at eval time
anchor_points = []
stride_tensor = []
for i, stride in enumerate(self.fpn_strides):
if feats is not None:
_, _, h, w = feats[i].shape
else:
h = int(self.eval_size[0] / stride)
w = int(self.eval_size[1] / stride)
shift_x = paddle.arange(end=w) + self.grid_cell_offset
shift_y = paddle.arange(end=h) + self.grid_cell_offset
shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)
anchor_point = paddle.cast(paddle.stack([shift_x, shift_y],
axis=-1),
dtype=dtype)
anchor_points.append(anchor_point.reshape([-1, 2]))
stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype))
anchor_points = paddle.concat(anchor_points)
stride_tensor = paddle.concat(stride_tensor)
return anchor_points, stride_tensor
def forward_eval(self, feats):
mask_proto = self.proto(feats[-1])
if self.eval_size:
anchor_points, stride_tensor = self.anchor_points, self.stride_tensor
else:
anchor_points, stride_tensor = self._generate_anchors(feats)
cls_score_list, reg_dist_list, pred_mask_list = [], [], []
feats_shapes = []
for i, feat in enumerate(feats):
_, _, h, w = feat.shape
l = h * w
feats_shapes.append(l)
if "npu" in paddle.device.get_all_custom_device_type():
                # the avg-pool backward is extremely slow in the NPU kernel, so use mean instead
avg_feat = feat.mean(axis=[2, 3], keepdim=True)
else:
avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))
cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +
feat)
reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
mask_coeff = self.pred_ins[i](self.stem_ins[i](feat, avg_feat) +
feat)
pred_mask_list.append(mask_coeff.reshape([-1, self.num_masks, l]))
reg_dist = reg_dist.reshape([-1, 4, self.reg_channels,
l]).transpose([0, 2, 3, 1])
if self.use_shared_conv:
reg_dist = self.proj_conv(F.softmax(reg_dist,
axis=1)).squeeze(1)
else:
reg_dist = F.softmax(reg_dist, axis=1)
# cls and reg
cls_score = F.sigmoid(cls_logit)
cls_score_list.append(cls_score.reshape([-1, self.num_classes, l]))
reg_dist_list.append(reg_dist)
cls_score_list = paddle.concat(cls_score_list, axis=-1)
pred_mask_list = paddle.concat(pred_mask_list, axis=-1)
if self.use_shared_conv:
reg_dist_list = paddle.concat(reg_dist_list, axis=1)
else:
reg_dist_list = paddle.concat(reg_dist_list, axis=2)
reg_dist_list = self.proj_conv(reg_dist_list).squeeze(1)
return cls_score_list, reg_dist_list, pred_mask_list, mask_proto, anchor_points, stride_tensor
def forward(self, feats, targets=None):
assert len(feats) == len(self.fpn_strides), \
"The size of feats is not equal to size of fpn_strides"
if self.training:
return self.forward_train(feats, targets)
else:
return self.forward_eval(feats)
@staticmethod
def _focal_loss(score, label, alpha=0.25, gamma=2.0):
weight = (score - label).pow(gamma)
if alpha > 0:
alpha_t = alpha * label + (1 - alpha) * (1 - label)
weight *= alpha_t
loss = F.binary_cross_entropy(score,
label,
weight=weight,
reduction='sum')
return loss
@staticmethod
def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0):
weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label
loss = F.binary_cross_entropy(pred_score,
gt_score,
weight=weight,
reduction='sum')
return loss
def _bbox_decode(self, anchor_points, pred_dist):
_, l, _ = get_static_shape(pred_dist)
pred_dist = F.softmax(pred_dist.reshape([-1, l, 4, self.reg_channels]))
pred_dist = self.proj_conv(pred_dist.transpose([0, 3, 1,
2])).squeeze(1)
return batch_distance2bbox(anchor_points, pred_dist)
def _bbox_decode_fake(self, pred_dist):
_, l, _ = get_static_shape(pred_dist)
pred_dist_dfl = F.softmax(
pred_dist.reshape([-1, l, 4, self.reg_channels]))
pred_dist = self.proj_conv(pred_dist_dfl.transpose([0, 3, 1,
2])).squeeze(1)
return pred_dist, pred_dist_dfl
def _bbox2distance(self, points, bbox):
x1y1, x2y2 = paddle.split(bbox, 2, -1)
lt = points - x1y1
rb = x2y2 - points
if "npu" in paddle.device.get_all_custom_device_type(
        ):  # the NPU clip kernel can produce NaN gradients, so use maximum & minimum instead.
out = paddle.concat([lt, rb], -1)
out = paddle.maximum(
out, paddle.to_tensor(self.reg_range[0], dtype=out.dtype))
out = paddle.minimum(
out,
paddle.to_tensor(self.reg_range[1] - 1 - 0.01,
dtype=out.dtype))
return out
else:
return paddle.concat([lt, rb],
-1).clip(self.reg_range[0],
self.reg_range[1] - 1 - 0.01)
def _df_loss(self, pred_dist, target, lower_bound=0):
target_left = paddle.cast(target.floor(), 'int64')
target_right = target_left + 1
weight_left = target_right.astype('float32') - target
weight_right = 1 - weight_left
loss_left = F.cross_entropy(pred_dist,
target_left - lower_bound,
reduction='none') * weight_left
loss_right = F.cross_entropy(pred_dist,
target_right - lower_bound,
reduction='none') * weight_right
return (loss_left + loss_right).mean(-1, keepdim=True)
def get_loss(self, head_outs, gt_meta):
assert 'gt_bbox' in gt_meta and 'gt_class' in gt_meta
assert 'gt_segm' in gt_meta
pred_scores, pred_distri, pred_mask_coeffs, mask_proto, anchors, \
anchor_points, num_anchors_list, stride_tensor = head_outs
bs = pred_scores.shape[0]
        # image size (h, w); hardcoded here, though it could be derived as
        # paddle.to_tensor(pred_scores[0].shape[2:]) * self.fpn_strides[0]
        imgsz = paddle.to_tensor([640, 640])
mask_h, mask_w = mask_proto.shape[-2:]
anchor_points_s = anchor_points / stride_tensor
pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri)
gt_labels = paddle.stack(gt_meta['gt_class'])
gt_bboxes = paddle.stack(gt_meta['gt_bbox'])
pad_gt_mask = paddle.stack(gt_meta['pad_gt_mask'])
gt_segms = paddle.stack(gt_meta['gt_segm']).cast('float32')
if tuple(gt_segms.shape[-2:]) != (mask_h, mask_w): # downsample
gt_segms = F.interpolate(gt_segms, (mask_h, mask_w),
mode='nearest').reshape(
[bs, -1, mask_h * mask_w])
# label assignment
assigned_labels, assigned_bboxes, assigned_scores, assigned_gt_index = \
self.assigner(
pred_scores.detach(),
pred_bboxes.detach() * stride_tensor,
anchor_points,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index=self.num_classes,
gt_segms=gt_segms)
# rescale bbox
assigned_bboxes /= stride_tensor
# assign segms for masks
assigned_masks = paddle.gather(gt_segms.reshape([-1, mask_h * mask_w]),
assigned_gt_index.flatten(),
axis=0)
assigned_masks = assigned_masks.reshape(
[bs, assigned_gt_index.shape[1], mask_h * mask_w])
assign_out_dict = self.get_loss_from_assign(
pred_scores, pred_distri, pred_bboxes, anchor_points_s,
assigned_labels, assigned_bboxes, assigned_scores, assigned_masks,
pred_mask_coeffs, mask_proto, stride_tensor, imgsz)
loss = assign_out_dict
return loss
def get_loss_from_assign(self, pred_scores, pred_distri, pred_bboxes,
anchor_points_s, assigned_labels, assigned_bboxes,
assigned_scores, assigned_masks, pred_mask_coeffs,
mask_proto, stride_tensor, imgsz):
# cls loss
if self.use_varifocal_loss:
one_hot_label = F.one_hot(assigned_labels,
self.num_classes + 1)[..., :-1]
loss_cls = self._varifocal_loss(pred_scores, assigned_scores,
one_hot_label)
else:
            loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha=-1)
assigned_scores_sum = assigned_scores.sum()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.all_reduce(assigned_scores_sum)
assigned_scores_sum /= paddle.distributed.get_world_size()
if "npu" in paddle.device.get_all_custom_device_type():
            # the NPU clip kernel can produce NaN gradients, so use maximum & minimum instead.
assigned_scores_sum = paddle.maximum(
assigned_scores_sum,
paddle.to_tensor(1., dtype=assigned_scores_sum.dtype))
else:
assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.)
loss_cls /= assigned_scores_sum
# select positive samples mask
mask_positive = (assigned_labels != self.num_classes)
num_pos = mask_positive.sum()
# pos/neg loss
if num_pos > 0:
# l1 + iou
bbox_mask = mask_positive.astype('int32').unsqueeze(-1).tile(
[1, 1, 4]).astype('bool')
pred_bboxes_pos = paddle.masked_select(pred_bboxes,
bbox_mask).reshape([-1, 4])
assigned_bboxes_pos = paddle.masked_select(
assigned_bboxes, bbox_mask).reshape([-1, 4])
bbox_weight = paddle.masked_select(assigned_scores.sum(-1),
mask_positive).unsqueeze(-1)
loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos)
loss_iou = self.iou_loss(pred_bboxes_pos,
assigned_bboxes_pos) * bbox_weight
loss_iou = loss_iou.sum() / assigned_scores_sum
# dfl loss
dist_mask = mask_positive.unsqueeze(-1).astype('int32').tile(
[1, 1, self.reg_channels * 4]).astype('bool')
            # `pred_dist` / `anchor_points` in the helper functions above
            pred_dist_pos = paddle.masked_select(
                pred_distri, dist_mask).reshape([-1, 4, self.reg_channels])
            assigned_ltrb = self._bbox2distance(anchor_points_s,
                                                assigned_bboxes)
assigned_ltrb_pos = paddle.masked_select(
assigned_ltrb, bbox_mask).reshape([-1, 4])
loss_dfl = self._df_loss(pred_dist_pos, assigned_ltrb_pos,
self.reg_range[0]) * bbox_weight
loss_dfl = loss_dfl.sum() / assigned_scores_sum
# mask loss
loss_mask = self.calculate_segmentation_loss(
mask_positive, assigned_masks, assigned_bboxes * stride_tensor,
mask_proto, pred_mask_coeffs, imgsz)
# [bs, 8400] [bs, 8400, 160 * 160] [bs, 8400, 4] [bs, 32, 160, 160] [bs, 8400, 32]
loss_mask /= assigned_scores_sum
else:
loss_l1 = paddle.zeros([1])
loss_iou = paddle.zeros([1])
loss_mask = paddle.zeros([1])
loss_dfl = paddle.zeros([1])
loss = self.loss_weight['class'] * loss_cls + \
self.loss_weight['iou'] * loss_iou + \
self.loss_weight['dfl'] * loss_dfl + \
self.loss_weight['iou'] * loss_mask
out_dict = {
'loss': loss,
'loss_cls': loss_cls,
'loss_iou': loss_iou,
'loss_dfl': loss_dfl,
'loss_mask': loss_mask,
'loss_l1': loss_l1,
}
return out_dict
def calculate_segmentation_loss(self,
fg_mask,
masks,
target_bboxes,
proto,
pred_masks,
imgsz,
overlap=True):
"""
Calculate the loss for instance segmentation.
Args:
fg_mask (paddle.Tensor): A binary tensor of shape (BS, N_anchors) indicating which anchors are positive.
            masks (paddle.Tensor): Assigned ground truth masks of shape (BS, N_anchors, H*W).
            target_bboxes (paddle.Tensor): Ground truth bounding boxes for each anchor of shape (BS, N_anchors, 4).
            proto (paddle.Tensor): Prototype masks of shape (BS, 32, H, W).
            pred_masks (paddle.Tensor): Predicted mask coefficients for each anchor of shape (BS, N_anchors, 32).
            imgsz (paddle.Tensor): Size of the input image as a tensor of shape (2), i.e., (H, W).
            overlap (bool): Whether the masks in `masks` tensor overlap.
Returns:
(paddle.Tensor): The calculated loss for instance segmentation.
Notes:
The batch loss can be computed for improved speed at higher memory usage.
For example, pred_mask can be computed as follows:
pred_mask = paddle.einsum('in,nhw->ihw', pred, proto) # (i, 32) @ (32, 160, 160) -> (i, 160, 160)
"""
_, _, mask_h, mask_w = proto.shape
loss = paddle.to_tensor([0.])
# Normalize to 0-1
target_bboxes_normalized = target_bboxes / imgsz[[1, 0, 1, 0]].cast(
target_bboxes.dtype)
# [8, 8400, 4]
# Areas of target bboxes
marea = xyxy2xywh(target_bboxes_normalized)[...,
2:].prod(2).unsqueeze(-1)
# Normalize to mask size
mxyxy = target_bboxes_normalized * paddle.to_tensor(
[mask_w, mask_h, mask_w, mask_h],
dtype=target_bboxes_normalized.dtype)
for i, single_i in enumerate(
zip(fg_mask, pred_masks, proto, mxyxy, marea, masks)):
fg_mask_i, pred_masks_i, proto_i, mxyxy_i, marea_i, masks_i = single_i
# [8400] [8400, 32] [32, 160, 160] [8400, 4] [8400, 1] [8400, 25600]
if fg_mask_i.any():
loss += self.single_mask_loss(masks_i[fg_mask_i],
pred_masks_i[fg_mask_i], proto_i,
mxyxy_i[fg_mask_i],
marea_i[fg_mask_i])
# [10, 25600] [10, 32] [32, 160, 160] [10, 4] [10, 1]
else:
loss += (proto * 0).sum() + (
pred_masks * 0).sum() # inf sums may lead to nan loss
return loss
@staticmethod
def single_mask_loss(gt_mask, pred, proto, xyxy, area):
"""
Compute the instance segmentation loss for a single image.
Args:
gt_mask (paddle.Tensor): Ground truth mask of shape (n, H, W), where n is the number of objects.
pred (paddle.Tensor): Predicted mask coefficients of shape (n, 32).
proto (paddle.Tensor): Prototype masks of shape (32, H, W).
xyxy (paddle.Tensor): Ground truth bounding boxes in xyxy format, normalized to [0, 1], of shape (n, 4).
area (paddle.Tensor): Area of each ground truth bounding box of shape (n,).
Returns:
(paddle.Tensor): The calculated mask loss for a single image.
Notes:
The function uses the equation pred_mask = paddle.einsum('in,nhw->ihw', pred, proto) to produce the
predicted masks from the prototype masks and predicted mask coefficients.
"""
nt = pred.shape[0]
gt_mask = gt_mask.reshape([nt, *proto.shape[1:]])
        nmasks = proto.shape[0]  # number of prototype masks (32 by default)
pred_mask = (pred @ proto.reshape([nmasks, -1])).reshape(
[-1, *proto.shape[1:]]) # (n,32) @ (32,80,80) -> (n,80,80)
if "npu" in paddle.device.get_all_custom_device_type():
            # the NPU BCE kernel can produce NaN gradients, so use a numerically stable custom implementation instead.
loss = custom_binary_cross_entropy_with_logits(pred_mask, gt_mask)
else:
loss = F.binary_cross_entropy_with_logits(pred_mask,
gt_mask,
reduction='none')
return (crop_mask(loss, xyxy).mean(axis=(1, 2)) /
area.squeeze(-1)).sum()
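    # Note (sketch): single_mask_loss crops the per-pixel BCE to each box and
    # divides the per-instance mean by the normalized box area, so small
    # instances are not swamped by large ones in the summed mask loss.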
def post_process(self,
head_outs,
im_shape,
scale_factor,
infer_shape=[640, 640],
rescale=True):
pred_scores, pred_dist, pred_mask_coeffs, mask_feat, anchor_points, stride_tensor = head_outs
pred_bboxes = batch_distance2bbox(anchor_points, pred_dist)
pred_bboxes *= stride_tensor
if self.exclude_post_process:
return paddle.concat([
pred_bboxes,
pred_scores.transpose([0, 2, 1]),
pred_mask_coeffs.transpose([0, 2, 1])
],
axis=-1), mask_feat, None
# [1, 8400, 4+80+32], [1, 32, 160, 160]
bbox_pred, bbox_num, keep_idxs = self.nms(pred_bboxes, pred_scores)
if bbox_num.sum() > 0:
pred_mask_coeffs = pred_mask_coeffs.transpose([0, 2, 1])
mask_coeffs = paddle.gather(
pred_mask_coeffs.reshape([-1, self.num_masks]), keep_idxs)
mask_logits = process_mask_upsample(mask_feat[0], mask_coeffs,
bbox_pred[:, 2:6], infer_shape)
if rescale:
ori_h, ori_w = im_shape[0] / scale_factor[0]
mask_logits = F.interpolate(
mask_logits.unsqueeze(0),
size=[
int(paddle.round(mask_logits.shape[-2] /
scale_factor[0][0])),
int(paddle.round(mask_logits.shape[-1] /
scale_factor[0][1]))
],
mode='bilinear',
align_corners=False)
if "npu" in paddle.device.get_all_custom_device_type():
                    # due to NPU numeric error, round the image size.
mask_logits = mask_logits[
..., :round(ori_h.item()), :round(ori_w.item())]
else:
mask_logits = mask_logits[..., :int(ori_h), :int(ori_w)]
masks = mask_logits.squeeze(0)
mask_pred = paddle.to_tensor(masks > self.mask_thr_binary).cast("float32")
            # scale bbox back to the original image size
scale_factor = scale_factor.flip(-1).tile([1, 2])
bbox_pred[:, 2:6] /= scale_factor
else:
ori_h, ori_w = im_shape[0] / scale_factor[0]
bbox_num = paddle.to_tensor([1]).cast("int32")
bbox_pred = paddle.zeros([bbox_num, 6])
mask_pred = paddle.zeros([bbox_num, int(ori_h), int(ori_w)])
return bbox_pred, bbox_num, mask_pred, keep_idxs
================================================
FILE: ppdet/modeling/heads/ppyoloe_r_head.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ..losses import ProbIoULoss
from ..initializer import bias_init_with_prob, constant_, normal_, vector_
from ppdet.modeling.backbones.cspresnet import ConvBNLayer
from ppdet.modeling.ops import get_static_shape, get_act_fn, anchor_generator
from ppdet.modeling.layers import MultiClassNMS
__all__ = ['PPYOLOERHead']
class ESEAttn(nn.Layer):
def __init__(self, feat_channels, act='swish'):
super(ESEAttn, self).__init__()
self.fc = nn.Conv2D(feat_channels, feat_channels, 1)
self.conv = ConvBNLayer(feat_channels, feat_channels, 1, act=act)
self._init_weights()
def _init_weights(self):
normal_(self.fc.weight, std=0.01)
def forward(self, feat, avg_feat):
weight = F.sigmoid(self.fc(avg_feat))
return self.conv(feat * weight)
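# ESEAttn is a squeeze-and-excitation style gate: a 1x1 conv on the globally
# pooled feature yields per-channel sigmoid weights that rescale the input
# before the ConvBNLayer. With shapes feat: [B, C, H, W] and
# avg_feat: [B, C, 1, 1], the weights broadcast over the spatial dimensions:
#   weight = F.sigmoid(fc(avg_feat))   # [B, C, 1, 1]
#   out = conv(feat * weight)          # [B, C, H, W]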
@register
class PPYOLOERHead(nn.Layer):
__shared__ = ['num_classes', 'trt', 'export_onnx']
__inject__ = ['static_assigner', 'assigner', 'nms']
def __init__(self,
in_channels=[1024, 512, 256],
num_classes=15,
act='swish',
fpn_strides=(32, 16, 8),
grid_cell_offset=0.5,
angle_max=90,
use_varifocal_loss=True,
static_assigner_epoch=4,
trt=False,
export_onnx=False,
static_assigner='ATSSAssigner',
assigner='TaskAlignedAssigner',
nms='MultiClassNMS',
loss_weight={'class': 1.0,
'iou': 2.5,
'dfl': 0.05}):
super(PPYOLOERHead, self).__init__()
        assert len(in_channels) > 0, "len(in_channels) should be > 0"
self.in_channels = in_channels
self.num_classes = num_classes
self.fpn_strides = fpn_strides
self.grid_cell_offset = grid_cell_offset
self.angle_max = angle_max
self.loss_weight = loss_weight
self.use_varifocal_loss = use_varifocal_loss
self.half_pi = paddle.to_tensor(
[1.5707963267948966], dtype=paddle.float32)
self.half_pi_bin = self.half_pi / angle_max
self.iou_loss = ProbIoULoss()
self.static_assigner_epoch = static_assigner_epoch
self.static_assigner = static_assigner
self.assigner = assigner
self.nms = nms
# stem
self.stem_cls = nn.LayerList()
self.stem_reg = nn.LayerList()
self.stem_angle = nn.LayerList()
trt = False if export_onnx else trt
self.export_onnx = export_onnx
act = get_act_fn(
act, trt=trt) if act is None or isinstance(act,
(str, dict)) else act
self.trt = trt
for in_c in self.in_channels:
self.stem_cls.append(ESEAttn(in_c, act=act))
self.stem_reg.append(ESEAttn(in_c, act=act))
self.stem_angle.append(ESEAttn(in_c, act=act))
# pred head
self.pred_cls = nn.LayerList()
self.pred_reg = nn.LayerList()
self.pred_angle = nn.LayerList()
for in_c in self.in_channels:
self.pred_cls.append(
nn.Conv2D(
in_c, self.num_classes, 3, padding=1))
self.pred_reg.append(nn.Conv2D(in_c, 4, 3, padding=1))
self.pred_angle.append(
nn.Conv2D(
in_c, self.angle_max + 1, 3, padding=1))
self.angle_proj_conv = nn.Conv2D(
self.angle_max + 1, 1, 1, bias_attr=False)
self._init_weights()
@classmethod
def from_config(cls, cfg, input_shape):
return {'in_channels': [i.channels for i in input_shape], }
def _init_weights(self):
bias_cls = bias_init_with_prob(0.01)
bias_angle = [10.] + [1.] * self.angle_max
for cls_, reg_, angle_ in zip(self.pred_cls, self.pred_reg,
self.pred_angle):
normal_(cls_.weight, std=0.01)
constant_(cls_.bias, bias_cls)
normal_(reg_.weight, std=0.01)
constant_(reg_.bias)
constant_(angle_.weight)
vector_(angle_.bias, bias_angle)
angle_proj = paddle.linspace(0, self.angle_max, self.angle_max + 1)
self.angle_proj = angle_proj * self.half_pi_bin
self.angle_proj_conv.weight.set_value(
self.angle_proj.reshape([1, self.angle_max + 1, 1, 1]))
self.angle_proj_conv.weight.stop_gradient = True
def _generate_anchors(self, feats):
if self.trt:
anchor_points = []
for feat, stride in zip(feats, self.fpn_strides):
_, _, h, w = feat.shape
anchor, _ = anchor_generator(
feat,
stride * 4,
1.0, [1.0, 1.0, 1.0, 1.0], [stride, stride],
offset=0.5)
x1, y1, x2, y2 = paddle.split(anchor, 4, axis=-1)
xc = (x1 + x2 + 1) / 2
yc = (y1 + y2 + 1) / 2
anchor_point = paddle.concat(
[xc, yc], axis=-1).reshape((1, h * w, 2))
anchor_points.append(anchor_point)
anchor_points = paddle.concat(anchor_points, axis=1)
return anchor_points, None, None
else:
anchor_points = []
stride_tensor = []
num_anchors_list = []
for feat, stride in zip(feats, self.fpn_strides):
_, _, h, w = feat.shape
shift_x = (paddle.arange(end=w) + 0.5) * stride
shift_y = (paddle.arange(end=h) + 0.5) * stride
shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)
anchor_point = paddle.cast(
paddle.stack(
[shift_x, shift_y], axis=-1), dtype='float32')
anchor_points.append(anchor_point.reshape([1, -1, 2]))
stride_tensor.append(
paddle.full(
[1, h * w, 1], stride, dtype='float32'))
num_anchors_list.append(h * w)
anchor_points = paddle.concat(anchor_points, axis=1)
stride_tensor = paddle.concat(stride_tensor, axis=1)
return anchor_points, stride_tensor, num_anchors_list
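    # Example (assumed numbers): with a 640x640 input, the stride-8 level has
    # an 80x80 grid whose anchor points are the cell centers
    # (4, 4), (12, 4), ..., (636, 636); stride_tensor holds 8.0 for each of
    # the 6400 points so that decoded offsets can be rescaled to pixels.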
def forward(self, feats, targets=None):
assert len(feats) == len(self.fpn_strides), \
"The size of feats is not equal to size of fpn_strides"
if self.training:
return self.forward_train(feats, targets)
else:
return self.forward_eval(feats)
def forward_train(self, feats, targets):
anchor_points, stride_tensor, num_anchors_list = self._generate_anchors(
feats)
cls_score_list, reg_dist_list, reg_angle_list = [], [], []
for i, feat in enumerate(feats):
avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))
cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +
feat)
reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
reg_angle = self.pred_angle[i](self.stem_angle[i](feat, avg_feat))
# cls and reg
cls_score = F.sigmoid(cls_logit)
cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1]))
reg_dist_list.append(reg_dist.flatten(2).transpose([0, 2, 1]))
reg_angle_list.append(reg_angle.flatten(2).transpose([0, 2, 1]))
cls_score_list = paddle.concat(cls_score_list, axis=1)
reg_dist_list = paddle.concat(reg_dist_list, axis=1)
reg_angle_list = paddle.concat(reg_angle_list, axis=1)
return self.get_loss([
cls_score_list, reg_dist_list, reg_angle_list, anchor_points,
num_anchors_list, stride_tensor
], targets)
def forward_eval(self, feats):
cls_score_list, reg_box_list = [], []
anchor_points, _, _ = self._generate_anchors(feats)
for i, (feat, stride) in enumerate(zip(feats, self.fpn_strides)):
b, _, h, w = feat.shape
l = h * w
# cls
avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))
cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) +
feat)
# reg
reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat))
reg_xy, reg_wh = paddle.split(reg_dist, 2, axis=1)
reg_xy = reg_xy * stride
reg_wh = (F.elu(reg_wh) + 1.) * stride
reg_angle = self.pred_angle[i](self.stem_angle[i](feat, avg_feat))
reg_angle = self.angle_proj_conv(F.softmax(reg_angle, axis=1))
reg_box = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1)
# cls and reg
cls_score = F.sigmoid(cls_logit)
cls_score_list.append(cls_score.reshape([b, self.num_classes, l]))
reg_box_list.append(reg_box.reshape([b, 5, l]))
cls_score_list = paddle.concat(cls_score_list, axis=-1)
reg_box_list = paddle.concat(reg_box_list, axis=-1).transpose([0, 2, 1])
reg_xy, reg_wha = paddle.split(reg_box_list, [2, 3], axis=-1)
reg_xy = reg_xy + anchor_points
reg_box_list = paddle.concat([reg_xy, reg_wha], axis=-1)
return cls_score_list, reg_box_list
def _bbox_decode(self, points, pred_dist, pred_angle, stride_tensor):
# predict vector to x, y, w, h, angle
b, l = pred_angle.shape[:2]
xy, wh = paddle.split(pred_dist, 2, axis=-1)
xy = xy * stride_tensor + points
wh = (F.elu(wh) + 1.) * stride_tensor
angle = F.softmax(pred_angle.reshape([b, l, 1, self.angle_max + 1
])).matmul(self.angle_proj)
return paddle.concat([xy, wh, angle], axis=-1)
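    # The angle is decoded DFL-style: a softmax over the angle_max + 1 bins
    # followed by a matmul with angle_proj takes the expected bin index scaled
    # by (pi/2) / angle_max. Worked example (assumed angle_max=90): a softmax
    # putting 0.5 on bin 30 and 0.5 on bin 31 decodes to 30.5 * (pi/2) / 90.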
def get_loss(self, head_outs, gt_meta):
pred_scores, pred_dist, pred_angle, \
anchor_points, num_anchors_list, stride_tensor = head_outs
        # decode the distance and angle predictions into rotated boxes [B, N, 5]
pred_bboxes = self._bbox_decode(anchor_points, pred_dist, pred_angle,
stride_tensor)
gt_labels = gt_meta['gt_class']
# [B, N, 5]
gt_bboxes = gt_meta['gt_rbox']
pad_gt_mask = gt_meta['pad_gt_mask']
# label assignment
if gt_meta['epoch_id'] < self.static_assigner_epoch:
assigned_labels, assigned_bboxes, assigned_scores = \
self.static_assigner(
anchor_points,
stride_tensor,
num_anchors_list,
gt_labels,
gt_meta['gt_bbox'],
gt_bboxes,
pad_gt_mask,
self.num_classes,
pred_bboxes.detach()
)
else:
assigned_labels, assigned_bboxes, assigned_scores = \
self.assigner(
pred_scores.detach(),
pred_bboxes.detach(),
anchor_points,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index=self.num_classes)
alpha_l = -1
# cls loss
if self.use_varifocal_loss:
one_hot_label = F.one_hot(assigned_labels,
self.num_classes + 1)[..., :-1]
loss_cls = self._varifocal_loss(pred_scores, assigned_scores,
one_hot_label)
else:
loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l)
assigned_scores_sum = assigned_scores.sum()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.all_reduce(assigned_scores_sum)
assigned_scores_sum = paddle.clip(
assigned_scores_sum / paddle.distributed.get_world_size(),
min=1.)
else:
assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.)
loss_cls /= assigned_scores_sum
loss_iou, loss_dfl = self._bbox_loss(pred_angle, pred_bboxes,
anchor_points, assigned_labels,
assigned_bboxes, assigned_scores,
assigned_scores_sum, stride_tensor)
loss = self.loss_weight['class'] * loss_cls + \
self.loss_weight['iou'] * loss_iou + \
self.loss_weight['dfl'] * loss_dfl
out_dict = {
'loss': loss,
'loss_cls': loss_cls,
'loss_iou': loss_iou,
'loss_dfl': loss_dfl
}
return out_dict
@staticmethod
def _focal_loss(score, label, alpha=0.25, gamma=2.0):
weight = (score - label).pow(gamma)
if alpha > 0:
alpha_t = alpha * label + (1 - alpha) * (1 - label)
weight *= alpha_t
loss = F.binary_cross_entropy(
score, label, weight=weight, reduction='sum')
return loss
@staticmethod
def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0):
weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label
loss = F.binary_cross_entropy(
pred_score, gt_score, weight=weight, reduction='sum')
return loss
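    # In the varifocal loss above, negatives are weighted by alpha * p^gamma
    # (down-weighting easy negatives) while positives are weighted by their
    # IoU-aware target score gt_score, so high-quality positives dominate.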
@staticmethod
def _df_loss(pred_dist, target):
target_left = paddle.cast(target, 'int64')
target_right = target_left + 1
weight_left = target_right.astype('float32') - target
weight_right = 1 - weight_left
loss_left = F.cross_entropy(
pred_dist, target_left, reduction='none') * weight_left
loss_right = F.cross_entropy(
pred_dist, target_right, reduction='none') * weight_right
return (loss_left + loss_right).mean(-1, keepdim=True)
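    # Worked example for _df_loss (assumed values): a target of 2.3 lies
    # between bins 2 and 3, so the loss is
    #   0.7 * CE(pred, 2) + 0.3 * CE(pred, 3),
    # minimized when the predicted distribution puts 0.7 on bin 2 and 0.3 on
    # bin 3, recovering 2.3 in expectation.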
def _bbox_loss(self, pred_angle, pred_bboxes, anchor_points,
assigned_labels, assigned_bboxes, assigned_scores,
assigned_scores_sum, stride_tensor):
# select positive samples mask
mask_positive = (assigned_labels != self.num_classes)
num_pos = mask_positive.sum()
# pos/neg loss
if num_pos > 0:
# iou
bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 5])
pred_bboxes_pos = paddle.masked_select(pred_bboxes,
bbox_mask).reshape([-1, 5])
assigned_bboxes_pos = paddle.masked_select(
assigned_bboxes, bbox_mask).reshape([-1, 5])
bbox_weight = paddle.masked_select(
assigned_scores.sum(-1), mask_positive).reshape([-1])
loss_iou = self.iou_loss(pred_bboxes_pos,
assigned_bboxes_pos) * bbox_weight
loss_iou = loss_iou.sum() / assigned_scores_sum
# dfl
angle_mask = mask_positive.unsqueeze(-1).tile(
[1, 1, self.angle_max + 1])
pred_angle_pos = paddle.masked_select(
pred_angle, angle_mask).reshape([-1, self.angle_max + 1])
assigned_angle_pos = (
assigned_bboxes_pos[:, 4] /
self.half_pi_bin).clip(0, self.angle_max - 0.01)
loss_dfl = self._df_loss(pred_angle_pos, assigned_angle_pos)
else:
loss_iou = pred_bboxes.sum() * 0.
loss_dfl = paddle.zeros([1])
return loss_iou, loss_dfl
def _box2corners(self, pred_bboxes):
""" convert (x, y, w, h, angle) to (x1, y1, x2, y2, x3, y3, x4, y4)
Args:
pred_bboxes (Tensor): [B, N, 5]
Returns:
polys (Tensor): [B, N, 8]
"""
x, y, w, h, angle = paddle.split(pred_bboxes, 5, axis=-1)
cos_a_half = paddle.cos(angle) * 0.5
sin_a_half = paddle.sin(angle) * 0.5
w_x = cos_a_half * w
w_y = sin_a_half * w
h_x = -sin_a_half * h
h_y = cos_a_half * h
return paddle.concat(
[
x + w_x + h_x, y + w_y + h_y, x - w_x + h_x, y - w_y + h_y,
x - w_x - h_x, y - w_y - h_y, x + w_x - h_x, y + w_y - h_y
],
axis=-1)
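    # Sanity check (assumed values): for (x, y, w, h, angle) = (0, 0, 2, 2, 0)
    # the half-extents are w_x = 1, w_y = 0, h_x = 0, h_y = 1, so the corners
    # (1, 1), (-1, 1), (-1, -1), (1, -1) are traversed consecutively around
    # the center.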
def post_process(self, head_outs, scale_factor):
pred_scores, pred_bboxes = head_outs
# [B, N, 5] -> [B, N, 8]
pred_bboxes = self._box2corners(pred_bboxes)
# scale bbox to origin
scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1)
scale_factor = paddle.concat(
[
scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x,
scale_y
],
axis=-1).reshape([-1, 1, 8])
pred_bboxes /= scale_factor
if self.export_onnx:
return pred_bboxes, pred_scores, None
bbox_pred, bbox_num, nms_keep_idx = self.nms(pred_bboxes,
pred_scores)
return bbox_pred, bbox_num, nms_keep_idx
================================================
FILE: ppdet/modeling/heads/retina_head.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Normal, Constant
from ppdet.modeling.bbox_utils import bbox2delta, delta2bbox
from ppdet.modeling.heads.fcos_head import FCOSFeat
from ppdet.core.workspace import register
__all__ = ['RetinaHead']
@register
class RetinaFeat(FCOSFeat):
"""We use FCOSFeat to construct conv layers in RetinaNet.
We rename FCOSFeat to RetinaFeat to avoid confusion.
"""
pass
@register
class RetinaHead(nn.Layer):
"""Used in RetinaNet proposed in paper https://arxiv.org/pdf/1708.02002.pdf
"""
__shared__ = ['num_classes']
__inject__ = [
'conv_feat', 'anchor_generator', 'bbox_assigner', 'loss_class',
'loss_bbox', 'nms'
]
def __init__(self,
num_classes=80,
conv_feat='RetinaFeat',
anchor_generator='RetinaAnchorGenerator',
bbox_assigner='MaxIoUAssigner',
loss_class='FocalLoss',
loss_bbox='SmoothL1Loss',
nms='MultiClassNMS',
prior_prob=0.01,
nms_pre=1000,
weights=[1., 1., 1., 1.]):
super(RetinaHead, self).__init__()
self.num_classes = num_classes
self.conv_feat = conv_feat
self.anchor_generator = anchor_generator
self.bbox_assigner = bbox_assigner
self.loss_class = loss_class
self.loss_bbox = loss_bbox
self.nms = nms
self.nms_pre = nms_pre
self.weights = weights
bias_init_value = -math.log((1 - prior_prob) / prior_prob)
num_anchors = self.anchor_generator.num_anchors
self.retina_cls = nn.Conv2D(
in_channels=self.conv_feat.feat_out,
out_channels=self.num_classes * num_anchors,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0.0, std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=bias_init_value)))
self.retina_reg = nn.Conv2D(
in_channels=self.conv_feat.feat_out,
out_channels=4 * num_anchors,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0.0, std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=0)))
def forward(self, neck_feats, targets=None):
cls_logits_list = []
bboxes_reg_list = []
for neck_feat in neck_feats:
conv_cls_feat, conv_reg_feat = self.conv_feat(neck_feat)
cls_logits = self.retina_cls(conv_cls_feat)
bbox_reg = self.retina_reg(conv_reg_feat)
cls_logits_list.append(cls_logits)
bboxes_reg_list.append(bbox_reg)
if self.training:
return self.get_loss([cls_logits_list, bboxes_reg_list], targets)
else:
return [cls_logits_list, bboxes_reg_list]
def get_loss(self, head_outputs, targets):
"""Here we calculate loss for a batch of images.
We assign anchors to gts in each image and gather all the assigned
        positive and negative samples. Then loss is calculated on the gathered
samples.
"""
cls_logits_list, bboxes_reg_list = head_outputs
anchors = self.anchor_generator(cls_logits_list)
anchors = paddle.concat(anchors)
# matches: contain gt_inds
# match_labels: -1(ignore), 0(neg) or 1(pos)
matches_list, match_labels_list = [], []
# assign anchors to gts, no sampling is involved
for gt_bbox in targets['gt_bbox']:
matches, match_labels = self.bbox_assigner(anchors, gt_bbox)
matches_list.append(matches)
match_labels_list.append(match_labels)
# reshape network outputs
cls_logits = [
_.transpose([0, 2, 3, 1]).reshape([0, -1, self.num_classes])
for _ in cls_logits_list
]
bboxes_reg = [
_.transpose([0, 2, 3, 1]).reshape([0, -1, 4])
for _ in bboxes_reg_list
]
cls_logits = paddle.concat(cls_logits, axis=1)
bboxes_reg = paddle.concat(bboxes_reg, axis=1)
cls_pred_list, cls_tar_list = [], []
reg_pred_list, reg_tar_list = [], []
# find and gather preds and targets in each image
for matches, match_labels, cls_logit, bbox_reg, gt_bbox, gt_class in \
zip(matches_list, match_labels_list, cls_logits, bboxes_reg,
targets['gt_bbox'], targets['gt_class']):
pos_mask = (match_labels == 1)
neg_mask = (match_labels == 0)
chosen_mask = paddle.logical_or(pos_mask, neg_mask)
gt_class = gt_class.reshape([-1])
bg_class = paddle.to_tensor(
[self.num_classes], dtype=gt_class.dtype)
            # a trick: append the background class so negative anchors can index label num_classes
gt_class = paddle.concat([gt_class, bg_class], axis=-1)
matches = paddle.where(neg_mask,
paddle.full_like(matches, gt_class.size - 1),
matches)
cls_pred = cls_logit[chosen_mask]
cls_tar = gt_class[matches[chosen_mask]]
reg_pred = bbox_reg[pos_mask].reshape([-1, 4])
reg_tar = gt_bbox[matches[pos_mask]].reshape([-1, 4])
reg_tar = bbox2delta(anchors[pos_mask], reg_tar, self.weights)
cls_pred_list.append(cls_pred)
cls_tar_list.append(cls_tar)
reg_pred_list.append(reg_pred)
reg_tar_list.append(reg_tar)
cls_pred = paddle.concat(cls_pred_list)
cls_tar = paddle.concat(cls_tar_list)
reg_pred = paddle.concat(reg_pred_list)
reg_tar = paddle.concat(reg_tar_list)
avg_factor = max(1.0, reg_pred.shape[0])
cls_loss = self.loss_class(
cls_pred, cls_tar, reduction='sum') / avg_factor
if reg_pred.shape[0] == 0:
reg_loss = paddle.zeros([])
reg_loss.stop_gradient = False
else:
reg_loss = self.loss_bbox(
reg_pred, reg_tar, reduction='sum') / avg_factor
loss = cls_loss + reg_loss
out_dict = {
'loss_cls': cls_loss,
'loss_reg': reg_loss,
'loss': loss,
}
return out_dict
def get_bboxes_single(self,
anchors,
cls_scores_list,
bbox_preds_list,
im_shape,
scale_factor,
rescale=True):
assert len(cls_scores_list) == len(bbox_preds_list)
mlvl_bboxes = []
mlvl_scores = []
for anchor, cls_score, bbox_pred in zip(anchors, cls_scores_list,
bbox_preds_list):
cls_score = cls_score.reshape([-1, self.num_classes])
bbox_pred = bbox_pred.reshape([-1, 4])
if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre:
max_score = cls_score.max(axis=1)
_, topk_inds = max_score.topk(self.nms_pre)
bbox_pred = bbox_pred.gather(topk_inds)
anchor = anchor.gather(topk_inds)
cls_score = cls_score.gather(topk_inds)
bbox_pred = delta2bbox(bbox_pred, anchor, self.weights).squeeze()
mlvl_bboxes.append(bbox_pred)
mlvl_scores.append(F.sigmoid(cls_score))
mlvl_bboxes = paddle.concat(mlvl_bboxes)
mlvl_bboxes = paddle.squeeze(mlvl_bboxes)
if rescale:
mlvl_bboxes = mlvl_bboxes / paddle.concat(
[scale_factor[::-1], scale_factor[::-1]])
mlvl_scores = paddle.concat(mlvl_scores)
mlvl_scores = mlvl_scores.transpose([1, 0])
return mlvl_bboxes, mlvl_scores
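    # Per level, only the top nms_pre anchors ranked by their max class score
    # are decoded and kept, bounding the NMS input size. Sketch with assumed
    # shapes:
    #   cls_score: [HW * A, num_classes] -> max over classes -> topk(nms_pre)
    #   then gather bbox_pred, anchor and cls_score with the same indices.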
def decode(self, anchors, cls_logits, bboxes_reg, im_shape, scale_factor):
batch_bboxes = []
batch_scores = []
for img_id in range(cls_logits[0].shape[0]):
num_lvls = len(cls_logits)
cls_scores_list = [cls_logits[i][img_id] for i in range(num_lvls)]
bbox_preds_list = [bboxes_reg[i][img_id] for i in range(num_lvls)]
bboxes, scores = self.get_bboxes_single(
anchors, cls_scores_list, bbox_preds_list, im_shape[img_id],
scale_factor[img_id])
batch_bboxes.append(bboxes)
batch_scores.append(scores)
batch_bboxes = paddle.stack(batch_bboxes, axis=0)
batch_scores = paddle.stack(batch_scores, axis=0)
return batch_bboxes, batch_scores
def post_process(self, head_outputs, im_shape, scale_factor):
cls_logits_list, bboxes_reg_list = head_outputs
anchors = self.anchor_generator(cls_logits_list)
cls_logits = [_.transpose([0, 2, 3, 1]) for _ in cls_logits_list]
bboxes_reg = [_.transpose([0, 2, 3, 1]) for _ in bboxes_reg_list]
bboxes, scores = self.decode(anchors, cls_logits, bboxes_reg, im_shape,
scale_factor)
bbox_pred, bbox_num, nms_keep_idx = self.nms(bboxes, scores)
return bbox_pred, bbox_num, nms_keep_idx
def get_scores_single(self, cls_scores_list):
mlvl_logits = []
for cls_score in cls_scores_list:
cls_score = cls_score.reshape([-1, self.num_classes])
if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre:
max_score = cls_score.max(axis=1)
_, topk_inds = max_score.topk(self.nms_pre)
cls_score = cls_score.gather(topk_inds)
mlvl_logits.append(cls_score)
mlvl_logits = paddle.concat(mlvl_logits)
mlvl_logits = mlvl_logits.transpose([1, 0])
return mlvl_logits
def decode_cls_logits(self, cls_logits_list):
cls_logits = [_.transpose([0, 2, 3, 1]) for _ in cls_logits_list]
batch_logits = []
for img_id in range(cls_logits[0].shape[0]):
num_lvls = len(cls_logits)
cls_scores_list = [cls_logits[i][img_id] for i in range(num_lvls)]
logits = self.get_scores_single(cls_scores_list)
batch_logits.append(logits)
batch_logits = paddle.stack(batch_logits, axis=0)
return batch_logits
================================================
FILE: ppdet/modeling/heads/roi_extractor.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from ppdet.core.workspace import register
from ppdet.modeling import ops
import paddle.nn as nn
def _to_list(v):
if not isinstance(v, (list, tuple)):
return [v]
return v
@register
class RoIAlign(nn.Layer):
"""
RoI Align module
    For more details, please refer to the documentation of roi_align
    in https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/vision/ops.py
Args:
resolution (int): The output size, default 14
spatial_scale (float): Multiplicative spatial scale factor to translate
ROI coords from their input scale to the scale used when pooling.
default 0.0625
sampling_ratio (int): The number of sampling points in the interpolation
grid, default 0
        canconical_level (int): The canonical FPN level used when mapping
            RoIs to pyramid levels. default 4
        canonical_size (int): The canonical RoI scale corresponding to
            canconical_level. default 224
start_level (int): The start level of FPN layer to extract RoI feature,
default 0
end_level (int): The end level of FPN layer to extract RoI feature,
default 3
aligned (bool): Whether to add offset to rois' coord in roi_align.
default false
"""
def __init__(self,
resolution=14,
spatial_scale=0.0625,
sampling_ratio=0,
canconical_level=4,
canonical_size=224,
start_level=0,
end_level=3,
aligned=False):
super(RoIAlign, self).__init__()
self.resolution = resolution
self.spatial_scale = _to_list(spatial_scale)
self.sampling_ratio = sampling_ratio
self.canconical_level = canconical_level
self.canonical_size = canonical_size
self.start_level = start_level
self.end_level = end_level
        self.aligned = False  # TODO: the NPU kernel does not support aligned=True
@classmethod
def from_config(cls, cfg, input_shape):
return {'spatial_scale': [1. / i.stride for i in input_shape]}
def forward(self, feats, roi, rois_num):
roi = paddle.concat(roi) if len(roi) > 1 else roi[0]
if len(feats) == 1:
rois_feat = paddle.vision.ops.roi_align(
x=feats[self.start_level],
boxes=roi,
boxes_num=rois_num,
output_size=self.resolution,
spatial_scale=self.spatial_scale[0],
aligned=self.aligned)
else:
offset = 2
k_min = self.start_level + offset
k_max = self.end_level + offset
if hasattr(paddle.vision.ops, "distribute_fpn_proposals"):
distribute_fpn_proposals = getattr(paddle.vision.ops,
"distribute_fpn_proposals")
else:
distribute_fpn_proposals = ops.distribute_fpn_proposals
rois_dist, restore_index, rois_num_dist = distribute_fpn_proposals(
roi,
k_min,
k_max,
self.canconical_level,
self.canonical_size,
rois_num=rois_num)
rois_feat_list = []
for lvl in range(self.start_level, self.end_level + 1):
roi_feat = paddle.vision.ops.roi_align(
x=feats[lvl],
boxes=rois_dist[lvl],
boxes_num=rois_num_dist[lvl],
output_size=self.resolution,
spatial_scale=self.spatial_scale[lvl],
sampling_ratio=self.sampling_ratio,
aligned=self.aligned)
rois_feat_list.append(roi_feat)
rois_feat_shuffle = paddle.concat(rois_feat_list)
rois_feat = paddle.gather(rois_feat_shuffle, restore_index)
return rois_feat
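# An assumption about distribute_fpn_proposals' internals (matching the FPN
# paper's heuristic): each RoI is assigned to pyramid level
#   lvl = floor(canconical_level + log2(sqrt(w * h) / canonical_size)),
# clipped to [k_min, k_max]. With the defaults, a 224x224 RoI maps to level 4
# and a 112x112 RoI to level 3.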
================================================
FILE: ppdet/modeling/heads/s2anet_head.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The code is based on https://github.com/csuhan/s2anet/blob/master/mmdet/models/anchor_heads_rotated/s2anet_head.py
import paddle
from paddle import ParamAttr
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import Normal, Constant
from ppdet.core.workspace import register
from ppdet.modeling.proposal_generator.target_layer import RBoxAssigner
from ppdet.modeling.proposal_generator.anchor_generator import S2ANetAnchorGenerator
from ppdet.modeling.layers import AlignConv
from ..cls_utils import _get_class_default_kwargs
import numpy as np
@register
class S2ANetHead(nn.Layer):
"""
S2Anet head
Args:
        stacked_convs (int): number of stacked convs in the FAM/ODM branches
        feat_in (int): input channels of the feature
        feat_out (int): output channels of the feature
        num_classes (int): number of classes
        anchor_strides (list): strides of the anchors
        anchor_scales (list): scales of the anchors
        anchor_ratios (list): aspect ratios of the anchors
        target_means (list): means used to normalize regression targets
        target_stds (list): stds used to normalize regression targets
        align_conv_type (str): type of the align conv, one of
            ['AlignConv', 'Conv', 'DCN']
        align_conv_size (int): kernel size of the align conv
        use_sigmoid_cls (bool): whether to use sigmoid classification
        reg_loss_weight (list): loss weights for regression
"""
__shared__ = ['num_classes']
__inject__ = ['anchor_assign', 'nms']
def __init__(self,
stacked_convs=2,
feat_in=256,
feat_out=256,
num_classes=15,
anchor_strides=[8, 16, 32, 64, 128],
anchor_scales=[4],
anchor_ratios=[1.0],
target_means=0.0,
target_stds=1.0,
align_conv_type='AlignConv',
align_conv_size=3,
use_sigmoid_cls=True,
anchor_assign=_get_class_default_kwargs(RBoxAssigner),
reg_loss_weight=[1.0, 1.0, 1.0, 1.0, 1.1],
cls_loss_weight=[1.1, 1.05],
reg_loss_type='l1',
nms_pre=2000,
nms='MultiClassNMS'):
super(S2ANetHead, self).__init__()
self.stacked_convs = stacked_convs
self.feat_in = feat_in
self.feat_out = feat_out
self.anchor_list = None
self.anchor_scales = anchor_scales
self.anchor_ratios = anchor_ratios
        self.anchor_strides = paddle.to_tensor(anchor_strides)
self.anchor_base_sizes = list(anchor_strides)
self.means = paddle.ones(shape=[5]) * target_means
self.stds = paddle.ones(shape=[5]) * target_stds
assert align_conv_type in ['AlignConv', 'Conv', 'DCN']
self.align_conv_type = align_conv_type
self.align_conv_size = align_conv_size
self.use_sigmoid_cls = use_sigmoid_cls
self.cls_out_channels = num_classes if self.use_sigmoid_cls else num_classes + 1
self.sampling = False
self.anchor_assign = anchor_assign
self.reg_loss_weight = reg_loss_weight
self.cls_loss_weight = cls_loss_weight
self.alpha = 1.0
self.beta = 1.0
self.reg_loss_type = reg_loss_type
self.nms_pre = nms_pre
self.nms = nms
self.fake_bbox = paddle.to_tensor(
np.array(
[[-1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
dtype='float32'))
self.fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32'))
# anchor
self.anchor_generators = []
for anchor_base in self.anchor_base_sizes:
self.anchor_generators.append(
S2ANetAnchorGenerator(anchor_base, anchor_scales,
anchor_ratios))
self.anchor_generators = nn.LayerList(self.anchor_generators)
self.fam_cls_convs = nn.Sequential()
self.fam_reg_convs = nn.Sequential()
for i in range(self.stacked_convs):
chan_in = self.feat_in if i == 0 else self.feat_out
self.fam_cls_convs.add_sublayer(
'fam_cls_conv_{}'.format(i),
nn.Conv2D(
in_channels=chan_in,
out_channels=self.feat_out,
kernel_size=3,
padding=1,
weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),
bias_attr=ParamAttr(initializer=Constant(0))))
self.fam_cls_convs.add_sublayer('fam_cls_conv_{}_act'.format(i),
nn.ReLU())
self.fam_reg_convs.add_sublayer(
'fam_reg_conv_{}'.format(i),
nn.Conv2D(
in_channels=chan_in,
out_channels=self.feat_out,
kernel_size=3,
padding=1,
weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),
bias_attr=ParamAttr(initializer=Constant(0))))
self.fam_reg_convs.add_sublayer('fam_reg_conv_{}_act'.format(i),
nn.ReLU())
self.fam_reg = nn.Conv2D(
self.feat_out,
5,
1,
weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),
bias_attr=ParamAttr(initializer=Constant(0)))
prior_prob = 0.01
bias_init = float(-np.log((1 - prior_prob) / prior_prob))
self.fam_cls = nn.Conv2D(
self.feat_out,
self.cls_out_channels,
1,
weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),
bias_attr=ParamAttr(initializer=Constant(bias_init)))
if self.align_conv_type == "AlignConv":
self.align_conv = AlignConv(self.feat_out, self.feat_out,
self.align_conv_size)
elif self.align_conv_type == "Conv":
self.align_conv = nn.Conv2D(
self.feat_out,
self.feat_out,
self.align_conv_size,
padding=(self.align_conv_size - 1) // 2,
bias_attr=ParamAttr(initializer=Constant(0)))
elif self.align_conv_type == "DCN":
self.align_conv_offset = nn.Conv2D(
self.feat_out,
2 * self.align_conv_size**2,
1,
weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),
bias_attr=ParamAttr(initializer=Constant(0)))
self.align_conv = paddle.vision.ops.DeformConv2D(
self.feat_out,
self.feat_out,
self.align_conv_size,
padding=(self.align_conv_size - 1) // 2,
weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),
bias_attr=False)
self.or_conv = nn.Conv2D(
self.feat_out,
self.feat_out,
kernel_size=3,
padding=1,
weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),
bias_attr=ParamAttr(initializer=Constant(0)))
# ODM
self.odm_cls_convs = nn.Sequential()
self.odm_reg_convs = nn.Sequential()
for i in range(self.stacked_convs):
ch_in = self.feat_out
# ch_in = int(self.feat_out / 8) if i == 0 else self.feat_out
self.odm_cls_convs.add_sublayer(
'odm_cls_conv_{}'.format(i),
nn.Conv2D(
in_channels=ch_in,
out_channels=self.feat_out,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),
bias_attr=ParamAttr(initializer=Constant(0))))
self.odm_cls_convs.add_sublayer('odm_cls_conv_{}_act'.format(i),
nn.ReLU())
self.odm_reg_convs.add_sublayer(
'odm_reg_conv_{}'.format(i),
nn.Conv2D(
in_channels=self.feat_out,
out_channels=self.feat_out,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),
bias_attr=ParamAttr(initializer=Constant(0))))
self.odm_reg_convs.add_sublayer('odm_reg_conv_{}_act'.format(i),
nn.ReLU())
self.odm_cls = nn.Conv2D(
self.feat_out,
self.cls_out_channels,
3,
padding=1,
weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),
bias_attr=ParamAttr(initializer=Constant(bias_init)))
self.odm_reg = nn.Conv2D(
self.feat_out,
5,
3,
padding=1,
weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)),
bias_attr=ParamAttr(initializer=Constant(0)))
def forward(self, feats, targets=None):
fam_reg_list, fam_cls_list = [], []
odm_reg_list, odm_cls_list = [], []
num_anchors_list, base_anchors_list, refine_anchors_list = [], [], []
for i, feat in enumerate(feats):
# get shape
B = feat.shape[0]
H, W = feat.shape[2], feat.shape[3]
NA = H * W
num_anchors_list.append(NA)
fam_cls_feat = self.fam_cls_convs(feat)
fam_cls = self.fam_cls(fam_cls_feat)
# [N, CLS, H, W] --> [N, H, W, CLS]
fam_cls = fam_cls.transpose([0, 2, 3, 1]).reshape(
[B, NA, self.cls_out_channels])
fam_cls_list.append(fam_cls)
fam_reg_feat = self.fam_reg_convs(feat)
fam_reg = self.fam_reg(fam_reg_feat)
# [N, 5, H, W] --> [N, H, W, 5]
fam_reg = fam_reg.transpose([0, 2, 3, 1]).reshape([B, NA, 5])
fam_reg_list.append(fam_reg)
# prepare anchor
init_anchors = self.anchor_generators[i]((H, W),
self.anchor_strides[i])
init_anchors = init_anchors.reshape([1, NA, 5])
base_anchors_list.append(init_anchors.squeeze(0))
if self.training:
refine_anchor = self.bbox_decode(fam_reg.detach(), init_anchors)
else:
refine_anchor = self.bbox_decode(fam_reg, init_anchors)
refine_anchors_list.append(refine_anchor)
if self.align_conv_type == 'AlignConv':
align_feat = self.align_conv(feat,
refine_anchor.clone(), (H, W),
self.anchor_strides[i])
elif self.align_conv_type == 'DCN':
align_offset = self.align_conv_offset(feat)
align_feat = self.align_conv(feat, align_offset)
elif self.align_conv_type == 'Conv':
align_feat = self.align_conv(feat)
or_feat = self.or_conv(align_feat)
odm_reg_feat = or_feat
odm_cls_feat = or_feat
odm_reg_feat = self.odm_reg_convs(odm_reg_feat)
odm_cls_feat = self.odm_cls_convs(odm_cls_feat)
odm_cls = self.odm_cls(odm_cls_feat)
# [N, CLS, H, W] --> [N, H, W, CLS]
odm_cls = odm_cls.transpose([0, 2, 3, 1]).reshape(
[B, NA, self.cls_out_channels])
odm_cls_list.append(odm_cls)
odm_reg = self.odm_reg(odm_reg_feat)
# [N, 5, H, W] --> [N, H, W, 5]
odm_reg = odm_reg.transpose([0, 2, 3, 1]).reshape([B, NA, 5])
odm_reg_list.append(odm_reg)
if self.training:
return self.get_loss([
fam_cls_list, fam_reg_list, odm_cls_list, odm_reg_list,
num_anchors_list, base_anchors_list, refine_anchors_list
], targets)
else:
odm_bboxes_list = []
for odm_reg, refine_anchor in zip(odm_reg_list,
refine_anchors_list):
odm_bboxes = self.bbox_decode(odm_reg, refine_anchor)
odm_bboxes_list.append(odm_bboxes)
return [odm_bboxes_list, odm_cls_list]
def get_bboxes(self, head_outs):
        pred_bboxes_list, pred_scores_list = head_outs
batch = pred_scores_list[0].shape[0]
bboxes, bbox_num = [], []
for i in range(batch):
pred_scores_per_image = [t[i] for t in pred_scores_list]
            pred_bboxes_per_image = [t[i] for t in pred_bboxes_list]
bbox_per_image, bbox_num_per_image = self.get_bboxes_single(
pred_scores_per_image, pred_bboxes_per_image)
bboxes.append(bbox_per_image)
bbox_num.append(bbox_num_per_image)
bboxes = paddle.concat(bboxes)
bbox_num = paddle.concat(bbox_num)
return bboxes, bbox_num
def get_pred(self, bboxes, bbox_num, im_shape, scale_factor):
"""
Rescale, clip and filter the bbox from the output of NMS to
get final prediction.
Args:
            bboxes (Tensor): bboxes [N, 10]
            bbox_num (Tensor): number of bboxes per image
            im_shape (Tensor): [1, 2]
            scale_factor (Tensor): [1, 2]
        Returns:
            bbox_pred (Tensor): The prediction with shape [N, 10], including
                labels, scores and polygon bboxes, rescaled to the size of
                the original image.
"""
origin_shape = paddle.floor(im_shape / scale_factor + 0.5)
origin_shape_list = []
scale_factor_list = []
# scale_factor: scale_y, scale_x
for i in range(bbox_num.shape[0]):
expand_shape = paddle.expand(origin_shape[i:i + 1, :],
[bbox_num[i], 2])
scale_y, scale_x = scale_factor[i, 0:1], scale_factor[i, 1:2]
scale = paddle.concat([
scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x,
scale_y
])
expand_scale = paddle.expand(scale, [bbox_num[i], 8])
origin_shape_list.append(expand_shape)
scale_factor_list.append(expand_scale)
origin_shape_list = paddle.concat(origin_shape_list)
scale_factor_list = paddle.concat(scale_factor_list)
# bboxes: [N, 10], label, score, bbox
pred_label_score = bboxes[:, 0:2]
pred_bbox = bboxes[:, 2:]
# rescale bbox to original image
pred_bbox = pred_bbox.reshape([-1, 8])
scaled_bbox = pred_bbox / scale_factor_list
origin_h = origin_shape_list[:, 0]
origin_w = origin_shape_list[:, 1]
bboxes = scaled_bbox
zeros = paddle.zeros_like(origin_h)
x1 = paddle.maximum(paddle.minimum(bboxes[:, 0], origin_w - 1), zeros)
y1 = paddle.maximum(paddle.minimum(bboxes[:, 1], origin_h - 1), zeros)
x2 = paddle.maximum(paddle.minimum(bboxes[:, 2], origin_w - 1), zeros)
y2 = paddle.maximum(paddle.minimum(bboxes[:, 3], origin_h - 1), zeros)
x3 = paddle.maximum(paddle.minimum(bboxes[:, 4], origin_w - 1), zeros)
y3 = paddle.maximum(paddle.minimum(bboxes[:, 5], origin_h - 1), zeros)
x4 = paddle.maximum(paddle.minimum(bboxes[:, 6], origin_w - 1), zeros)
y4 = paddle.maximum(paddle.minimum(bboxes[:, 7], origin_h - 1), zeros)
pred_bbox = paddle.stack([x1, y1, x2, y2, x3, y3, x4, y4], axis=-1)
pred_result = paddle.concat([pred_label_score, pred_bbox], axis=1)
return pred_result
def get_bboxes_single(self, cls_score_list, bbox_pred_list):
mlvl_bboxes = []
mlvl_scores = []
for cls_score, bbox_pred in zip(cls_score_list, bbox_pred_list):
if self.use_sigmoid_cls:
scores = F.sigmoid(cls_score)
else:
scores = F.softmax(cls_score, axis=-1)
if scores.shape[0] > self.nms_pre:
# Get maximum scores for foreground classes.
if self.use_sigmoid_cls:
max_scores = paddle.max(scores, axis=1)
else:
max_scores = paddle.max(scores[:, :-1], axis=1)
topk_val, topk_inds = paddle.topk(max_scores, self.nms_pre)
bbox_pred = paddle.gather(bbox_pred, topk_inds)
scores = paddle.gather(scores, topk_inds)
mlvl_bboxes.append(bbox_pred)
mlvl_scores.append(scores)
mlvl_bboxes = paddle.concat(mlvl_bboxes)
mlvl_scores = paddle.concat(mlvl_scores)
mlvl_polys = self.rbox2poly(mlvl_bboxes).unsqueeze(0)
mlvl_scores = paddle.transpose(mlvl_scores, [1, 0]).unsqueeze(0)
bbox, bbox_num, _ = self.nms(mlvl_polys, mlvl_scores)
if bbox.shape[0] <= 0:
bbox = self.fake_bbox
bbox_num = self.fake_bbox_num
return bbox, bbox_num
def smooth_l1_loss(self, pred, label, delta=1.0 / 9.0):
"""
Args:
pred: pred score
label: label
delta: delta
Returns: loss
"""
assert pred.shape == label.shape and label.numel() > 0
assert delta > 0
diff = paddle.abs(pred - label)
loss = paddle.where(diff < delta, 0.5 * diff * diff / delta,
diff - 0.5 * delta)
return loss
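    # Worked example (assumed values, delta = 1/9): a residual of 0.05 is in
    # the quadratic branch, 0.5 * 0.05**2 / (1/9) = 0.01125; a residual of
    # 1.0 is in the linear branch, 1.0 - 0.5 * (1/9) ~= 0.944, so large
    # residuals grow linearly rather than quadratically.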
def get_fam_loss(self, fam_target, s2anet_head_out, reg_loss_type='l1'):
(labels, label_weights, bbox_targets, bbox_weights, bbox_gt_bboxes,
pos_inds, neg_inds) = fam_target
fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list, num_anchors_list = s2anet_head_out
fam_cls_losses = []
fam_bbox_losses = []
st_idx = 0
num_total_samples = len(pos_inds) + len(
neg_inds) if self.sampling else len(pos_inds)
num_total_samples = max(1, num_total_samples)
for idx, feat_anchor_num in enumerate(num_anchors_list):
# step1: get data
feat_labels = labels[st_idx:st_idx + feat_anchor_num]
feat_label_weights = label_weights[st_idx:st_idx + feat_anchor_num]
feat_bbox_targets = bbox_targets[st_idx:st_idx + feat_anchor_num, :]
feat_bbox_weights = bbox_weights[st_idx:st_idx + feat_anchor_num, :]
# step2: calc cls loss
feat_labels = feat_labels.reshape(-1)
feat_label_weights = feat_label_weights.reshape(-1)
fam_cls_score = fam_cls_branch_list[idx]
fam_cls_score = paddle.squeeze(fam_cls_score, axis=0)
fam_cls_score1 = fam_cls_score
feat_labels = paddle.to_tensor(feat_labels)
feat_labels_one_hot = paddle.nn.functional.one_hot(
feat_labels, self.cls_out_channels + 1)
feat_labels_one_hot = feat_labels_one_hot[:, 1:]
feat_labels_one_hot.stop_gradient = True
num_total_samples = paddle.to_tensor(
num_total_samples, dtype='float32', stop_gradient=True)
fam_cls = F.sigmoid_focal_loss(
fam_cls_score1,
feat_labels_one_hot,
normalizer=num_total_samples,
reduction='none')
feat_label_weights = feat_label_weights.reshape(
feat_label_weights.shape[0], 1)
feat_label_weights = np.repeat(
feat_label_weights, self.cls_out_channels, axis=1)
feat_label_weights = paddle.to_tensor(
feat_label_weights, stop_gradient=True)
fam_cls = fam_cls * feat_label_weights
fam_cls_total = paddle.sum(fam_cls)
fam_cls_losses.append(fam_cls_total)
# step3: regression loss
feat_bbox_targets = paddle.to_tensor(
feat_bbox_targets, dtype='float32', stop_gradient=True)
feat_bbox_targets = paddle.reshape(feat_bbox_targets, [-1, 5])
fam_bbox_pred = fam_reg_branch_list[idx]
fam_bbox_pred = paddle.squeeze(fam_bbox_pred, axis=0)
fam_bbox_pred = paddle.reshape(fam_bbox_pred, [-1, 5])
fam_bbox = self.smooth_l1_loss(fam_bbox_pred, feat_bbox_targets)
loss_weight = paddle.to_tensor(
self.reg_loss_weight, dtype='float32', stop_gradient=True)
fam_bbox = paddle.multiply(fam_bbox, loss_weight)
feat_bbox_weights = paddle.to_tensor(
feat_bbox_weights, stop_gradient=True)
fam_bbox = fam_bbox * feat_bbox_weights
fam_bbox_total = paddle.sum(fam_bbox) / num_total_samples
fam_bbox_losses.append(fam_bbox_total)
st_idx += feat_anchor_num
fam_cls_loss = paddle.add_n(fam_cls_losses)
fam_cls_loss_weight = paddle.to_tensor(
self.cls_loss_weight[0], dtype='float32', stop_gradient=True)
fam_cls_loss = fam_cls_loss * fam_cls_loss_weight
fam_reg_loss = paddle.add_n(fam_bbox_losses)
return fam_cls_loss, fam_reg_loss
def get_odm_loss(self, odm_target, s2anet_head_out, reg_loss_type='l1'):
(labels, label_weights, bbox_targets, bbox_weights, bbox_gt_bboxes,
pos_inds, neg_inds) = odm_target
fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list, num_anchors_list = s2anet_head_out
odm_cls_losses = []
odm_bbox_losses = []
st_idx = 0
num_total_samples = len(pos_inds) + len(
neg_inds) if self.sampling else len(pos_inds)
num_total_samples = max(1, num_total_samples)
for idx, feat_anchor_num in enumerate(num_anchors_list):
# step1: get data
feat_labels = labels[st_idx:st_idx + feat_anchor_num]
feat_label_weights = label_weights[st_idx:st_idx + feat_anchor_num]
feat_bbox_targets = bbox_targets[st_idx:st_idx + feat_anchor_num, :]
feat_bbox_weights = bbox_weights[st_idx:st_idx + feat_anchor_num, :]
# step2: calc cls loss
feat_labels = feat_labels.reshape(-1)
feat_label_weights = feat_label_weights.reshape(-1)
odm_cls_score = odm_cls_branch_list[idx]
odm_cls_score = paddle.squeeze(odm_cls_score, axis=0)
odm_cls_score1 = odm_cls_score
feat_labels = paddle.to_tensor(feat_labels)
feat_labels_one_hot = paddle.nn.functional.one_hot(
feat_labels, self.cls_out_channels + 1)
feat_labels_one_hot = feat_labels_one_hot[:, 1:]
feat_labels_one_hot.stop_gradient = True
num_total_samples = paddle.to_tensor(
num_total_samples, dtype='float32', stop_gradient=True)
odm_cls = F.sigmoid_focal_loss(
odm_cls_score1,
feat_labels_one_hot,
normalizer=num_total_samples,
reduction='none')
feat_label_weights = feat_label_weights.reshape(
feat_label_weights.shape[0], 1)
feat_label_weights = np.repeat(
feat_label_weights, self.cls_out_channels, axis=1)
feat_label_weights = paddle.to_tensor(feat_label_weights)
feat_label_weights.stop_gradient = True
odm_cls = odm_cls * feat_label_weights
odm_cls_total = paddle.sum(odm_cls)
odm_cls_losses.append(odm_cls_total)
            # step3: regression loss
feat_bbox_targets = paddle.to_tensor(
feat_bbox_targets, dtype='float32')
feat_bbox_targets = paddle.reshape(feat_bbox_targets, [-1, 5])
feat_bbox_targets.stop_gradient = True
odm_bbox_pred = odm_reg_branch_list[idx]
odm_bbox_pred = paddle.squeeze(odm_bbox_pred, axis=0)
odm_bbox_pred = paddle.reshape(odm_bbox_pred, [-1, 5])
odm_bbox = self.smooth_l1_loss(odm_bbox_pred, feat_bbox_targets)
loss_weight = paddle.to_tensor(
self.reg_loss_weight, dtype='float32', stop_gradient=True)
odm_bbox = paddle.multiply(odm_bbox, loss_weight)
feat_bbox_weights = paddle.to_tensor(
feat_bbox_weights, stop_gradient=True)
odm_bbox = odm_bbox * feat_bbox_weights
odm_bbox_total = paddle.sum(odm_bbox) / num_total_samples
odm_bbox_losses.append(odm_bbox_total)
st_idx += feat_anchor_num
odm_cls_loss = paddle.add_n(odm_cls_losses)
odm_cls_loss_weight = paddle.to_tensor(
self.cls_loss_weight[1], dtype='float32', stop_gradient=True)
odm_cls_loss = odm_cls_loss * odm_cls_loss_weight
odm_reg_loss = paddle.add_n(odm_bbox_losses)
return odm_cls_loss, odm_reg_loss
def get_loss(self, head_outs, inputs):
fam_cls_list, fam_reg_list, odm_cls_list, odm_reg_list, \
num_anchors_list, base_anchors_list, refine_anchors_list = head_outs
# compute loss
fam_cls_loss_lst = []
fam_reg_loss_lst = []
odm_cls_loss_lst = []
odm_reg_loss_lst = []
batch = len(inputs['gt_rbox'])
for i in range(batch):
# data_format: (xc, yc, w, h, theta)
gt_mask = inputs['pad_gt_mask'][i, :, 0]
gt_idx = paddle.nonzero(gt_mask).squeeze(-1)
gt_bboxes = paddle.gather(inputs['gt_rbox'][i], gt_idx).numpy()
gt_labels = paddle.gather(inputs['gt_class'][i], gt_idx).numpy()
is_crowd = paddle.gather(inputs['is_crowd'][i], gt_idx).numpy()
gt_labels = gt_labels + 1
anchors_per_image = np.concatenate(base_anchors_list)
fam_cls_per_image = [t[i] for t in fam_cls_list]
fam_reg_per_image = [t[i] for t in fam_reg_list]
odm_cls_per_image = [t[i] for t in odm_cls_list]
odm_reg_per_image = [t[i] for t in odm_reg_list]
im_s2anet_head_out = (fam_cls_per_image, fam_reg_per_image,
odm_cls_per_image, odm_reg_per_image,
num_anchors_list)
# FAM
im_fam_target = self.anchor_assign(anchors_per_image, gt_bboxes,
gt_labels, is_crowd)
if im_fam_target is not None:
im_fam_cls_loss, im_fam_reg_loss = self.get_fam_loss(
im_fam_target, im_s2anet_head_out, self.reg_loss_type)
fam_cls_loss_lst.append(im_fam_cls_loss)
fam_reg_loss_lst.append(im_fam_reg_loss)
# ODM
refine_anchors_per_image = [t[i] for t in refine_anchors_list]
refine_anchors_per_image = paddle.concat(
refine_anchors_per_image).numpy()
im_odm_target = self.anchor_assign(refine_anchors_per_image,
gt_bboxes, gt_labels, is_crowd)
if im_odm_target is not None:
im_odm_cls_loss, im_odm_reg_loss = self.get_odm_loss(
im_odm_target, im_s2anet_head_out, self.reg_loss_type)
odm_cls_loss_lst.append(im_odm_cls_loss)
odm_reg_loss_lst.append(im_odm_reg_loss)
fam_cls_loss = paddle.add_n(fam_cls_loss_lst) / batch
fam_reg_loss = paddle.add_n(fam_reg_loss_lst) / batch
odm_cls_loss = paddle.add_n(odm_cls_loss_lst) / batch
odm_reg_loss = paddle.add_n(odm_reg_loss_lst) / batch
loss = fam_cls_loss + fam_reg_loss + odm_cls_loss + odm_reg_loss
return {
'loss': loss,
'fam_cls_loss': fam_cls_loss,
'fam_reg_loss': fam_reg_loss,
'odm_cls_loss': odm_cls_loss,
'odm_reg_loss': odm_reg_loss
}
def bbox_decode(self, preds, anchors, wh_ratio_clip=1e-6):
"""decode bbox from deltas
Args:
preds: [B, L, 5]
anchors: [1, L, 5]
return:
bboxes: [B, L, 5]
"""
preds = paddle.add(paddle.multiply(preds, self.stds), self.means)
dx, dy, dw, dh, dangle = paddle.split(preds, 5, axis=-1)
max_ratio = np.abs(np.log(wh_ratio_clip))
dw = paddle.clip(dw, min=-max_ratio, max=max_ratio)
dh = paddle.clip(dh, min=-max_ratio, max=max_ratio)
rroi_x, rroi_y, rroi_w, rroi_h, rroi_angle = paddle.split(
anchors, 5, axis=-1)
gx = dx * rroi_w * paddle.cos(rroi_angle) - dy * rroi_h * paddle.sin(
rroi_angle) + rroi_x
gy = dx * rroi_w * paddle.sin(rroi_angle) + dy * rroi_h * paddle.cos(
rroi_angle) + rroi_y
gw = rroi_w * dw.exp()
gh = rroi_h * dh.exp()
ga = np.pi * dangle + rroi_angle
ga = (ga + np.pi / 4) % np.pi - np.pi / 4
bboxes = paddle.concat([gx, gy, gw, gh, ga], axis=-1)
return bboxes
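    # Sanity check (assumed values): all-zero deltas decode back to the
    # anchor, since gx = rroi_x, gy = rroi_y, gw = rroi_w * e**0,
    # gh = rroi_h * e**0 and ga = rroi_angle, up to the wrap of the angle
    # into [-pi/4, 3*pi/4) on the last line.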
def rbox2poly(self, rboxes):
"""
rboxes: [x_ctr,y_ctr,w,h,angle]
to
polys: [x0,y0,x1,y1,x2,y2,x3,y3]
"""
N = rboxes.shape[0]
x_ctr = rboxes[:, 0]
y_ctr = rboxes[:, 1]
width = rboxes[:, 2]
height = rboxes[:, 3]
angle = rboxes[:, 4]
tl_x, tl_y, br_x, br_y = -width * 0.5, -height * 0.5, width * 0.5, height * 0.5
normal_rects = paddle.stack(
[tl_x, br_x, br_x, tl_x, tl_y, tl_y, br_y, br_y], axis=0)
normal_rects = paddle.reshape(normal_rects, [2, 4, N])
normal_rects = paddle.transpose(normal_rects, [2, 0, 1])
sin, cos = paddle.sin(angle), paddle.cos(angle)
# M: [N,2,2]
M = paddle.stack([cos, -sin, sin, cos], axis=0)
M = paddle.reshape(M, [2, 2, N])
M = paddle.transpose(M, [2, 0, 1])
# polys: [N,8]
polys = paddle.matmul(M, normal_rects)
polys = paddle.transpose(polys, [2, 1, 0])
polys = paddle.reshape(polys, [-1, N])
polys = paddle.transpose(polys, [1, 0])
tmp = paddle.stack(
[x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr], axis=1)
polys = polys + tmp
return polys
================================================
FILE: ppdet/modeling/heads/simota_head.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The code is based on:
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/yolox_head.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
from functools import partial
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Normal, Constant
from ppdet.core.workspace import register
from ppdet.modeling.bbox_utils import distance2bbox, bbox2distance
from ppdet.data.transform.atss_assigner import bbox_overlaps
from .gfl_head import GFLHead
@register
class OTAHead(GFLHead):
"""
OTAHead
Args:
conv_feat (object): Instance of 'FCOSFeat'
num_classes (int): Number of classes
fpn_stride (list): The stride of each FPN Layer
prior_prob (float): Used to set the bias init for the class prediction layer
loss_qfl (object): Instance of QualityFocalLoss.
loss_dfl (object): Instance of DistributionFocalLoss.
loss_bbox (object): Instance of bbox loss.
assigner (object): Instance of label assigner.
        reg_max: Max value of integral set :math:`{0, ..., reg_max}`
            in QFL setting. Default: 16.
"""
__inject__ = [
'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox',
'assigner', 'nms'
]
__shared__ = ['num_classes']
def __init__(self,
conv_feat='FCOSFeat',
dgqp_module=None,
num_classes=80,
fpn_stride=[8, 16, 32, 64, 128],
prior_prob=0.01,
loss_class='QualityFocalLoss',
loss_dfl='DistributionFocalLoss',
loss_bbox='GIoULoss',
assigner='SimOTAAssigner',
reg_max=16,
feat_in_chan=256,
nms=None,
nms_pre=1000,
cell_offset=0):
super(OTAHead, self).__init__(
conv_feat=conv_feat,
dgqp_module=dgqp_module,
num_classes=num_classes,
fpn_stride=fpn_stride,
prior_prob=prior_prob,
loss_class=loss_class,
loss_dfl=loss_dfl,
loss_bbox=loss_bbox,
reg_max=reg_max,
feat_in_chan=feat_in_chan,
nms=nms,
nms_pre=nms_pre,
cell_offset=cell_offset)
self.conv_feat = conv_feat
self.dgqp_module = dgqp_module
self.num_classes = num_classes
self.fpn_stride = fpn_stride
self.prior_prob = prior_prob
self.loss_qfl = loss_class
self.loss_dfl = loss_dfl
self.loss_bbox = loss_bbox
self.reg_max = reg_max
self.feat_in_chan = feat_in_chan
self.nms = nms
self.nms_pre = nms_pre
self.cell_offset = cell_offset
self.use_sigmoid = self.loss_qfl.use_sigmoid
self.assigner = assigner
def _get_target_single(self, flatten_cls_pred, flatten_center_and_stride,
flatten_bbox, gt_bboxes, gt_labels):
"""Compute targets for priors in a single image.
"""
pos_num, label, label_weight, bbox_target = self.assigner(
F.sigmoid(flatten_cls_pred), flatten_center_and_stride,
flatten_bbox, gt_bboxes, gt_labels)
return (pos_num, label, label_weight, bbox_target)
def get_loss(self, head_outs, gt_meta):
cls_scores, bbox_preds = head_outs
num_level_anchors = [
featmap.shape[-2] * featmap.shape[-1] for featmap in cls_scores
]
num_imgs = gt_meta['im_id'].shape[0]
featmap_sizes = [[featmap.shape[-2], featmap.shape[-1]]
for featmap in cls_scores]
decode_bbox_preds = []
center_and_strides = []
for featmap_size, stride, bbox_pred in zip(featmap_sizes,
self.fpn_stride, bbox_preds):
# center in origin image
yy, xx = self.get_single_level_center_point(featmap_size, stride,
self.cell_offset)
center_and_stride = paddle.stack([xx, yy, stride, stride], -1).tile(
[num_imgs, 1, 1])
center_and_strides.append(center_and_stride)
center_in_feature = center_and_stride.reshape(
[-1, 4])[:, :-2] / stride
bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape(
[num_imgs, -1, 4 * (self.reg_max + 1)])
pred_distances = self.distribution_project(bbox_pred)
decode_bbox_pred_wo_stride = distance2bbox(
center_in_feature, pred_distances).reshape([num_imgs, -1, 4])
decode_bbox_preds.append(decode_bbox_pred_wo_stride * stride)
flatten_cls_preds = [
cls_pred.transpose([0, 2, 3, 1]).reshape(
[num_imgs, -1, self.cls_out_channels])
for cls_pred in cls_scores
]
flatten_cls_preds = paddle.concat(flatten_cls_preds, axis=1)
flatten_bboxes = paddle.concat(decode_bbox_preds, axis=1)
flatten_center_and_strides = paddle.concat(center_and_strides, axis=1)
gt_boxes, gt_labels = gt_meta['gt_bbox'], gt_meta['gt_class']
pos_num_l, label_l, label_weight_l, bbox_target_l = [], [], [], []
for flatten_cls_pred,flatten_center_and_stride,flatten_bbox,gt_box, gt_label \
in zip(flatten_cls_preds.detach(),flatten_center_and_strides.detach(), \
flatten_bboxes.detach(),gt_boxes, gt_labels):
pos_num, label, label_weight, bbox_target = self._get_target_single(
flatten_cls_pred, flatten_center_and_stride, flatten_bbox,
gt_box, gt_label)
pos_num_l.append(pos_num)
label_l.append(label)
label_weight_l.append(label_weight)
bbox_target_l.append(bbox_target)
labels = paddle.to_tensor(np.stack(label_l, axis=0))
label_weights = paddle.to_tensor(np.stack(label_weight_l, axis=0))
bbox_targets = paddle.to_tensor(np.stack(bbox_target_l, axis=0))
center_and_strides_list = self._images_to_levels(
flatten_center_and_strides, num_level_anchors)
labels_list = self._images_to_levels(labels, num_level_anchors)
label_weights_list = self._images_to_levels(label_weights,
num_level_anchors)
bbox_targets_list = self._images_to_levels(bbox_targets,
num_level_anchors)
num_total_pos = sum(pos_num_l)
try:
paddle.distributed.all_reduce(paddle.to_tensor(num_total_pos))
num_total_pos = paddle.clip(
num_total_pos / paddle.distributed.get_world_size(), min=1.)
except:
num_total_pos = max(num_total_pos, 1)
loss_bbox_list, loss_dfl_list, loss_qfl_list, avg_factor = [], [], [], []
for cls_score, bbox_pred, center_and_strides, labels, label_weights, bbox_targets, stride in zip(
cls_scores, bbox_preds, center_and_strides_list, labels_list,
label_weights_list, bbox_targets_list, self.fpn_stride):
center_and_strides = center_and_strides.reshape([-1, 4])
cls_score = cls_score.transpose([0, 2, 3, 1]).reshape(
[-1, self.cls_out_channels])
bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape(
[-1, 4 * (self.reg_max + 1)])
bbox_targets = bbox_targets.reshape([-1, 4])
labels = labels.reshape([-1])
label_weights = label_weights.reshape([-1])
bg_class_ind = self.num_classes
pos_inds = paddle.nonzero(
paddle.logical_and((labels >= 0), (labels < bg_class_ind)),
as_tuple=False).squeeze(1)
score = np.zeros(labels.shape)
if len(pos_inds) > 0:
pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0)
pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0)
pos_centers = paddle.gather(
center_and_strides[:, :-2], pos_inds, axis=0) / stride
weight_targets = F.sigmoid(cls_score.detach())
weight_targets = paddle.gather(
weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0)
pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred)
pos_decode_bbox_pred = distance2bbox(pos_centers,
pos_bbox_pred_corners)
pos_decode_bbox_targets = pos_bbox_targets / stride
bbox_iou = bbox_overlaps(
pos_decode_bbox_pred.detach().numpy(),
pos_decode_bbox_targets.detach().numpy(),
is_aligned=True)
score[pos_inds.numpy()] = bbox_iou
pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1])
target_corners = bbox2distance(pos_centers,
pos_decode_bbox_targets,
self.reg_max).reshape([-1])
# regression loss
loss_bbox = paddle.sum(
self.loss_bbox(pos_decode_bbox_pred,
pos_decode_bbox_targets) * weight_targets)
# dfl loss
loss_dfl = self.loss_dfl(
pred_corners,
target_corners,
weight=weight_targets.expand([-1, 4]).reshape([-1]),
avg_factor=4.0)
else:
loss_bbox = bbox_pred.sum() * 0
loss_dfl = bbox_pred.sum() * 0
weight_targets = paddle.to_tensor([0], dtype='float32')
# qfl loss
score = paddle.to_tensor(score)
loss_qfl = self.loss_qfl(
cls_score, (labels, score),
weight=label_weights,
avg_factor=num_total_pos)
loss_bbox_list.append(loss_bbox)
loss_dfl_list.append(loss_dfl)
loss_qfl_list.append(loss_qfl)
avg_factor.append(weight_targets.sum())
avg_factor = sum(avg_factor)
        try:
            # all_reduce works in place; reduce avg_factor itself so the
            # synchronized value is actually used
            paddle.distributed.all_reduce(avg_factor)
            avg_factor = paddle.clip(
                avg_factor / paddle.distributed.get_world_size(), min=1)
        except Exception:
            avg_factor = max(avg_factor.item(), 1)
if avg_factor <= 0:
loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)
loss_bbox = paddle.to_tensor(
0, dtype='float32', stop_gradient=False)
loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)
else:
losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list))
losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list))
loss_qfl = sum(loss_qfl_list)
loss_bbox = sum(losses_bbox)
loss_dfl = sum(losses_dfl)
loss_states = dict(
loss_qfl=loss_qfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl)
return loss_states
@register
class OTAVFLHead(OTAHead):
__inject__ = [
'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox',
'assigner', 'nms'
]
__shared__ = ['num_classes']
def __init__(self,
conv_feat='FCOSFeat',
dgqp_module=None,
num_classes=80,
fpn_stride=[8, 16, 32, 64, 128],
prior_prob=0.01,
loss_class='VarifocalLoss',
loss_dfl='DistributionFocalLoss',
loss_bbox='GIoULoss',
assigner='SimOTAAssigner',
reg_max=16,
feat_in_chan=256,
nms=None,
nms_pre=1000,
cell_offset=0):
super(OTAVFLHead, self).__init__(
conv_feat=conv_feat,
dgqp_module=dgqp_module,
num_classes=num_classes,
fpn_stride=fpn_stride,
prior_prob=prior_prob,
loss_class=loss_class,
loss_dfl=loss_dfl,
loss_bbox=loss_bbox,
reg_max=reg_max,
feat_in_chan=feat_in_chan,
nms=nms,
nms_pre=nms_pre,
cell_offset=cell_offset)
self.conv_feat = conv_feat
self.dgqp_module = dgqp_module
self.num_classes = num_classes
self.fpn_stride = fpn_stride
self.prior_prob = prior_prob
self.loss_vfl = loss_class
self.loss_dfl = loss_dfl
self.loss_bbox = loss_bbox
self.reg_max = reg_max
self.feat_in_chan = feat_in_chan
self.nms = nms
self.nms_pre = nms_pre
self.cell_offset = cell_offset
self.use_sigmoid = self.loss_vfl.use_sigmoid
self.assigner = assigner
def get_loss(self, head_outs, gt_meta):
cls_scores, bbox_preds = head_outs
num_level_anchors = [
featmap.shape[-2] * featmap.shape[-1] for featmap in cls_scores
]
num_imgs = gt_meta['im_id'].shape[0]
featmap_sizes = [[featmap.shape[-2], featmap.shape[-1]]
for featmap in cls_scores]
decode_bbox_preds = []
center_and_strides = []
for featmap_size, stride, bbox_pred in zip(featmap_sizes,
self.fpn_stride, bbox_preds):
            # centers in the original image
yy, xx = self.get_single_level_center_point(featmap_size, stride,
self.cell_offset)
strides = paddle.full((len(xx), ), stride)
center_and_stride = paddle.stack([xx, yy, strides, strides],
-1).tile([num_imgs, 1, 1])
center_and_strides.append(center_and_stride)
center_in_feature = center_and_stride.reshape(
[-1, 4])[:, :-2] / stride
bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape(
[num_imgs, -1, 4 * (self.reg_max + 1)])
pred_distances = self.distribution_project(bbox_pred)
decode_bbox_pred_wo_stride = distance2bbox(
center_in_feature, pred_distances).reshape([num_imgs, -1, 4])
decode_bbox_preds.append(decode_bbox_pred_wo_stride * stride)
flatten_cls_preds = [
cls_pred.transpose([0, 2, 3, 1]).reshape(
[num_imgs, -1, self.cls_out_channels])
for cls_pred in cls_scores
]
flatten_cls_preds = paddle.concat(flatten_cls_preds, axis=1)
flatten_bboxes = paddle.concat(decode_bbox_preds, axis=1)
flatten_center_and_strides = paddle.concat(center_and_strides, axis=1)
gt_boxes, gt_labels = gt_meta['gt_bbox'], gt_meta['gt_class']
pos_num_l, label_l, label_weight_l, bbox_target_l = [], [], [], []
        for flatten_cls_pred, flatten_center_and_stride, flatten_bbox, gt_box, gt_label \
                in zip(flatten_cls_preds.detach(), flatten_center_and_strides.detach(),
                       flatten_bboxes.detach(), gt_boxes, gt_labels):
pos_num, label, label_weight, bbox_target = self._get_target_single(
flatten_cls_pred, flatten_center_and_stride, flatten_bbox,
gt_box, gt_label)
pos_num_l.append(pos_num)
label_l.append(label)
label_weight_l.append(label_weight)
bbox_target_l.append(bbox_target)
labels = paddle.to_tensor(np.stack(label_l, axis=0))
label_weights = paddle.to_tensor(np.stack(label_weight_l, axis=0))
bbox_targets = paddle.to_tensor(np.stack(bbox_target_l, axis=0))
center_and_strides_list = self._images_to_levels(
flatten_center_and_strides, num_level_anchors)
labels_list = self._images_to_levels(labels, num_level_anchors)
label_weights_list = self._images_to_levels(label_weights,
num_level_anchors)
bbox_targets_list = self._images_to_levels(bbox_targets,
num_level_anchors)
num_total_pos = sum(pos_num_l)
        try:
            # all_reduce works in place, so reduce a tensor we keep instead of
            # a throwaway copy; otherwise the synchronized count is discarded
            reduced_pos_num = paddle.to_tensor(num_total_pos, dtype='float32')
            paddle.distributed.all_reduce(reduced_pos_num)
            num_total_pos = paddle.clip(
                reduced_pos_num / paddle.distributed.get_world_size(), min=1.)
        except Exception:
            num_total_pos = max(num_total_pos, 1)
loss_bbox_list, loss_dfl_list, loss_vfl_list, avg_factor = [], [], [], []
for cls_score, bbox_pred, center_and_strides, labels, label_weights, bbox_targets, stride in zip(
cls_scores, bbox_preds, center_and_strides_list, labels_list,
label_weights_list, bbox_targets_list, self.fpn_stride):
center_and_strides = center_and_strides.reshape([-1, 4])
cls_score = cls_score.transpose([0, 2, 3, 1]).reshape(
[-1, self.cls_out_channels])
bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape(
[-1, 4 * (self.reg_max + 1)])
bbox_targets = bbox_targets.reshape([-1, 4])
labels = labels.reshape([-1])
bg_class_ind = self.num_classes
pos_inds = paddle.nonzero(
paddle.logical_and((labels >= 0), (labels < bg_class_ind)),
as_tuple=False).squeeze(1)
# vfl
vfl_score = np.zeros(cls_score.shape)
if len(pos_inds) > 0:
pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0)
pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0)
pos_centers = paddle.gather(
center_and_strides[:, :-2], pos_inds, axis=0) / stride
weight_targets = F.sigmoid(cls_score.detach())
weight_targets = paddle.gather(
weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0)
pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred)
pos_decode_bbox_pred = distance2bbox(pos_centers,
pos_bbox_pred_corners)
pos_decode_bbox_targets = pos_bbox_targets / stride
bbox_iou = bbox_overlaps(
pos_decode_bbox_pred.detach().numpy(),
pos_decode_bbox_targets.detach().numpy(),
is_aligned=True)
# vfl
pos_labels = paddle.gather(labels, pos_inds, axis=0)
vfl_score[pos_inds.numpy(), pos_labels] = bbox_iou
pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1])
target_corners = bbox2distance(pos_centers,
pos_decode_bbox_targets,
self.reg_max).reshape([-1])
# regression loss
loss_bbox = paddle.sum(
self.loss_bbox(pos_decode_bbox_pred,
pos_decode_bbox_targets) * weight_targets)
# dfl loss
loss_dfl = self.loss_dfl(
pred_corners,
target_corners,
weight=weight_targets.expand([-1, 4]).reshape([-1]),
avg_factor=4.0)
else:
loss_bbox = bbox_pred.sum() * 0
loss_dfl = bbox_pred.sum() * 0
weight_targets = paddle.to_tensor([0], dtype='float32')
# vfl loss
num_pos_avg_per_gpu = num_total_pos
vfl_score = paddle.to_tensor(vfl_score)
loss_vfl = self.loss_vfl(
cls_score, vfl_score, avg_factor=num_pos_avg_per_gpu)
loss_bbox_list.append(loss_bbox)
loss_dfl_list.append(loss_dfl)
loss_vfl_list.append(loss_vfl)
avg_factor.append(weight_targets.sum())
avg_factor = sum(avg_factor)
        try:
            # all_reduce works in place; reduce avg_factor itself so the
            # synchronized value is actually used
            paddle.distributed.all_reduce(avg_factor)
            avg_factor = paddle.clip(
                avg_factor / paddle.distributed.get_world_size(), min=1)
        except Exception:
            avg_factor = max(avg_factor.item(), 1)
if avg_factor <= 0:
loss_vfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)
loss_bbox = paddle.to_tensor(
0, dtype='float32', stop_gradient=False)
loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False)
else:
losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list))
losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list))
loss_vfl = sum(loss_vfl_list)
loss_bbox = sum(losses_bbox)
loss_dfl = sum(losses_dfl)
loss_states = dict(
loss_vfl=loss_vfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl)
return loss_states
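# Minimal usage sketch (illustrative values, assuming this module runs
# standalone with paddle installed): the DFL/QFL targets above rely on
# bbox2distance and distance2bbox being inverses for centers inside the box.
if __name__ == '__main__':
    centers = paddle.to_tensor([[16., 16.]])         # one anchor center
    boxes = paddle.to_tensor([[8., 10., 30., 28.]])  # one target box, x1y1x2y2
    dists = bbox2distance(centers, boxes, 16.)       # (l, t, r, b) offsets
    print(distance2bbox(centers, dists))             # recovers the box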
================================================
FILE: ppdet/modeling/heads/solov2_head.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from paddle import ParamAttr
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import Normal, Constant
from ppdet.modeling.layers import ConvNormLayer, MaskMatrixNMS, DropBlock
from ppdet.core.workspace import register
from six.moves import zip
import numpy as np
__all__ = ['SOLOv2Head']
@register
class SOLOv2MaskHead(nn.Layer):
"""
MaskHead of SOLOv2.
    The code of this class is based on:
        https://github.com/WXinlong/SOLO/blob/master/mmdet/models/mask_heads/mask_feat_head.py
    Args:
        in_channels (int): The channel number of the input Tensor.
        mid_channels (int): The channel number of the intermediate Tensors.
        out_channels (int): The channel number of the output Tensor.
        start_level (int): The position where the input starts.
        end_level (int): The position where the input ends.
        use_dcn_in_tower (bool): Whether to use DCN in the tower or not.
"""
__shared__ = ['norm_type']
def __init__(self,
in_channels=256,
mid_channels=128,
out_channels=256,
start_level=0,
end_level=3,
use_dcn_in_tower=False,
norm_type='gn'):
super(SOLOv2MaskHead, self).__init__()
assert start_level >= 0 and end_level >= start_level
self.in_channels = in_channels
self.out_channels = out_channels
self.mid_channels = mid_channels
self.use_dcn_in_tower = use_dcn_in_tower
self.range_level = end_level - start_level + 1
        self.use_dcn = bool(self.use_dcn_in_tower)
self.convs_all_levels = []
self.norm_type = norm_type
for i in range(start_level, end_level + 1):
conv_feat_name = 'mask_feat_head.convs_all_levels.{}'.format(i)
conv_pre_feat = nn.Sequential()
if i == start_level:
conv_pre_feat.add_sublayer(
conv_feat_name + '.conv' + str(i),
ConvNormLayer(
ch_in=self.in_channels,
ch_out=self.mid_channels,
filter_size=3,
stride=1,
use_dcn=self.use_dcn,
norm_type=self.norm_type))
self.add_sublayer('conv_pre_feat' + str(i), conv_pre_feat)
self.convs_all_levels.append(conv_pre_feat)
else:
                for j in range(i):
                    if j == 0:
                        ch_in = (self.in_channels + 2
                                 if i == end_level else self.in_channels)
                    else:
                        ch_in = self.mid_channels
conv_pre_feat.add_sublayer(
conv_feat_name + '.conv' + str(j),
ConvNormLayer(
ch_in=ch_in,
ch_out=self.mid_channels,
filter_size=3,
stride=1,
use_dcn=self.use_dcn,
norm_type=self.norm_type))
conv_pre_feat.add_sublayer(
conv_feat_name + '.conv' + str(j) + 'act', nn.ReLU())
conv_pre_feat.add_sublayer(
'upsample' + str(i) + str(j),
nn.Upsample(
scale_factor=2, mode='bilinear'))
self.add_sublayer('conv_pre_feat' + str(i), conv_pre_feat)
self.convs_all_levels.append(conv_pre_feat)
conv_pred_name = 'mask_feat_head.conv_pred.0'
self.conv_pred = self.add_sublayer(
conv_pred_name,
ConvNormLayer(
ch_in=self.mid_channels,
ch_out=self.out_channels,
filter_size=1,
stride=1,
use_dcn=self.use_dcn,
norm_type=self.norm_type))
def forward(self, inputs):
"""
Get SOLOv2MaskHead output.
Args:
            inputs (list[Tensor]): Feature maps from the neck, each with shape [N, C, H, W]
        Returns:
            ins_pred (Tensor): Output mask features of SOLOv2MaskHead
"""
feat_all_level = F.relu(self.convs_all_levels[0](inputs[0]))
for i in range(1, self.range_level):
input_p = inputs[i]
if i == (self.range_level - 1):
input_feat = input_p
x_range = paddle.linspace(
-1, 1, input_feat.shape[-1], dtype='float32')
y_range = paddle.linspace(
-1, 1, input_feat.shape[-2], dtype='float32')
y, x = paddle.meshgrid([y_range, x_range])
x = paddle.unsqueeze(x, [0, 1])
y = paddle.unsqueeze(y, [0, 1])
y = paddle.expand(
y, shape=[input_feat.shape[0], 1, -1, -1])
x = paddle.expand(
x, shape=[input_feat.shape[0], 1, -1, -1])
coord_feat = paddle.concat([x, y], axis=1)
input_p = paddle.concat([input_p, coord_feat], axis=1)
feat_all_level = paddle.add(feat_all_level,
self.convs_all_levels[i](input_p))
ins_pred = F.relu(self.conv_pred(feat_all_level))
return ins_pred
@register
class SOLOv2Head(nn.Layer):
"""
Head block for SOLOv2 network
Args:
num_classes (int): Number of output classes.
in_channels (int): Number of input channels.
        seg_feat_channels (int): Number of filters in the kernel & category branch convolutions.
        stacked_convs (int): Number of stacked convolutions.
        num_grids (list[int]): List of feature-map grid sizes.
        kernel_out_channels (int): Number of output channels in the kernel branch.
        dcn_v2_stages (list): Stages that use DCNv2 in the tower; values lie in [0, stacked_convs).
        segm_strides (list[int]): List of segmentation strides.
        solov2_loss (object): SOLOv2Loss instance.
        score_threshold (float): Threshold on category scores.
        mask_threshold (float): Threshold for binarizing predicted masks.
mask_nms (object): MaskMatrixNMS instance.
"""
__inject__ = ['solov2_loss', 'mask_nms']
__shared__ = ['norm_type', 'num_classes']
def __init__(self,
num_classes=80,
in_channels=256,
seg_feat_channels=256,
stacked_convs=4,
num_grids=[40, 36, 24, 16, 12],
kernel_out_channels=256,
dcn_v2_stages=[],
segm_strides=[8, 8, 16, 32, 32],
solov2_loss=None,
score_threshold=0.1,
mask_threshold=0.5,
mask_nms=None,
norm_type='gn',
drop_block=False):
super(SOLOv2Head, self).__init__()
self.num_classes = num_classes
self.in_channels = in_channels
self.seg_num_grids = num_grids
self.cate_out_channels = self.num_classes
self.seg_feat_channels = seg_feat_channels
self.stacked_convs = stacked_convs
self.kernel_out_channels = kernel_out_channels
self.dcn_v2_stages = dcn_v2_stages
self.segm_strides = segm_strides
self.solov2_loss = solov2_loss
self.mask_nms = mask_nms
self.score_threshold = score_threshold
self.mask_threshold = mask_threshold
self.norm_type = norm_type
self.drop_block = drop_block
self.kernel_pred_convs = []
self.cate_pred_convs = []
for i in range(self.stacked_convs):
            use_dcn = i in self.dcn_v2_stages
ch_in = self.in_channels + 2 if i == 0 else self.seg_feat_channels
kernel_conv = self.add_sublayer(
'bbox_head.kernel_convs.' + str(i),
ConvNormLayer(
ch_in=ch_in,
ch_out=self.seg_feat_channels,
filter_size=3,
stride=1,
use_dcn=use_dcn,
norm_type=self.norm_type))
self.kernel_pred_convs.append(kernel_conv)
ch_in = self.in_channels if i == 0 else self.seg_feat_channels
cate_conv = self.add_sublayer(
'bbox_head.cate_convs.' + str(i),
ConvNormLayer(
ch_in=ch_in,
ch_out=self.seg_feat_channels,
filter_size=3,
stride=1,
use_dcn=use_dcn,
norm_type=self.norm_type))
self.cate_pred_convs.append(cate_conv)
self.solo_kernel = self.add_sublayer(
'bbox_head.solo_kernel',
nn.Conv2D(
self.seg_feat_channels,
self.kernel_out_channels,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=True))
self.solo_cate = self.add_sublayer(
'bbox_head.solo_cate',
nn.Conv2D(
self.seg_feat_channels,
self.cate_out_channels,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(initializer=Constant(
value=float(-np.log((1 - 0.01) / 0.01))))))
if self.drop_block and self.training:
self.drop_block_fun = DropBlock(
block_size=3, keep_prob=0.9, name='solo_cate.dropblock')
def _points_nms(self, heat, kernel_size=2):
hmax = F.max_pool2d(heat, kernel_size=kernel_size, stride=1, padding=1)
keep = paddle.cast((hmax[:, :, :-1, :-1] == heat), 'float32')
return heat * keep
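    # Note on _points_nms: max_pool2d with kernel 2, stride 1, padding 1
    # computes each cell's 2x2-neighborhood maximum; the [:-1, :-1] slice
    # realigns that map with `heat`, so `keep` is 1 only where a cell equals
    # its local maximum. Duplicate peaks on the category map are suppressed
    # without running a full NMS.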
def _split_feats(self, feats):
return (F.interpolate(
feats[0],
scale_factor=0.5,
align_corners=False,
align_mode=0,
mode='bilinear'), feats[1], feats[2], feats[3], F.interpolate(
feats[4],
size=feats[3].shape[-2:],
mode='bilinear',
align_corners=False,
align_mode=0))
def forward(self, input):
"""
Get SOLOv2 head output
Args:
input (list): List of Tensors, output of backbone or neck stages
Returns:
cate_pred_list (list): Tensors of each category branch layer
kernel_pred_list (list): Tensors of each kernel branch layer
"""
feats = self._split_feats(input)
cate_pred_list = []
kernel_pred_list = []
for idx in range(len(self.seg_num_grids)):
cate_pred, kernel_pred = self._get_output_single(feats[idx], idx)
cate_pred_list.append(cate_pred)
kernel_pred_list.append(kernel_pred)
return cate_pred_list, kernel_pred_list
def _get_output_single(self, input, idx):
ins_kernel_feat = input
# CoordConv
x_range = paddle.linspace(
-1, 1, ins_kernel_feat.shape[-1], dtype='float32')
y_range = paddle.linspace(
-1, 1, ins_kernel_feat.shape[-2], dtype='float32')
y, x = paddle.meshgrid([y_range, x_range])
x = paddle.unsqueeze(x, [0, 1])
y = paddle.unsqueeze(y, [0, 1])
y = paddle.expand(
y, shape=[ins_kernel_feat.shape[0], 1, -1, -1])
x = paddle.expand(
x, shape=[ins_kernel_feat.shape[0], 1, -1, -1])
coord_feat = paddle.concat([x, y], axis=1)
ins_kernel_feat = paddle.concat([ins_kernel_feat, coord_feat], axis=1)
# kernel branch
kernel_feat = ins_kernel_feat
seg_num_grid = self.seg_num_grids[idx]
kernel_feat = F.interpolate(
kernel_feat,
size=[seg_num_grid, seg_num_grid],
mode='bilinear',
align_corners=False,
align_mode=0)
cate_feat = kernel_feat[:, :-2, :, :]
for kernel_layer in self.kernel_pred_convs:
kernel_feat = F.relu(kernel_layer(kernel_feat))
if self.drop_block and self.training:
kernel_feat = self.drop_block_fun(kernel_feat)
kernel_pred = self.solo_kernel(kernel_feat)
# cate branch
for cate_layer in self.cate_pred_convs:
cate_feat = F.relu(cate_layer(cate_feat))
if self.drop_block and self.training:
cate_feat = self.drop_block_fun(cate_feat)
cate_pred = self.solo_cate(cate_feat)
if not self.training:
cate_pred = self._points_nms(F.sigmoid(cate_pred), kernel_size=2)
cate_pred = paddle.transpose(cate_pred, [0, 2, 3, 1])
return cate_pred, kernel_pred
def get_loss(self, cate_preds, kernel_preds, ins_pred, ins_labels,
cate_labels, grid_order_list, fg_num):
"""
        Get the loss of the SOLOv2 network.
        Args:
            cate_preds (list): Tensor list of category branch outputs.
            kernel_preds (list): Tensor list of kernel branch outputs.
            ins_pred (list): Tensor list of instance branch outputs.
            ins_labels (list): List of instance labels per batch.
            cate_labels (list): List of category labels per batch.
            grid_order_list (list): List of gathered indices per grid.
fg_num (int): Number of positive samples in a mini-batch.
Returns:
loss_ins (Tensor): The instance loss Tensor of SOLOv2 network.
loss_cate (Tensor): The category loss Tensor of SOLOv2 network.
"""
batch_size = grid_order_list[0].shape[0]
ins_pred_list = []
for kernel_preds_level, grid_orders_level in zip(kernel_preds,
grid_order_list):
if grid_orders_level.shape[1] == 0:
ins_pred_list.append(None)
continue
grid_orders_level = paddle.reshape(grid_orders_level, [-1])
reshape_pred = paddle.reshape(
kernel_preds_level,
shape=(kernel_preds_level.shape[0],
kernel_preds_level.shape[1], -1))
reshape_pred = paddle.transpose(reshape_pred, [0, 2, 1])
reshape_pred = paddle.reshape(
reshape_pred, shape=(-1, reshape_pred.shape[2]))
gathered_pred = paddle.gather(reshape_pred, index=grid_orders_level)
gathered_pred = paddle.reshape(
gathered_pred,
shape=[batch_size, -1, gathered_pred.shape[1]])
cur_ins_pred = ins_pred
cur_ins_pred = paddle.reshape(
cur_ins_pred,
shape=(cur_ins_pred.shape[0],
cur_ins_pred.shape[1], -1))
ins_pred_conv = paddle.matmul(gathered_pred, cur_ins_pred)
cur_ins_pred = paddle.reshape(
ins_pred_conv,
shape=(-1, ins_pred.shape[-2],
ins_pred.shape[-1]))
ins_pred_list.append(cur_ins_pred)
num_ins = paddle.sum(fg_num)
cate_preds = [
paddle.reshape(
paddle.transpose(cate_pred, [0, 2, 3, 1]),
shape=(-1, self.cate_out_channels)) for cate_pred in cate_preds
]
flatten_cate_preds = paddle.concat(cate_preds)
new_cate_labels = []
for cate_label in cate_labels:
new_cate_labels.append(paddle.reshape(cate_label, shape=[-1]))
cate_labels = paddle.concat(new_cate_labels)
loss_ins, loss_cate = self.solov2_loss(
ins_pred_list, ins_labels, flatten_cate_preds, cate_labels, num_ins)
return {'loss_ins': loss_ins, 'loss_cate': loss_cate}
def get_prediction(self, cate_preds, kernel_preds, seg_pred, im_shape,
scale_factor):
"""
        Get the prediction results of the SOLOv2 network.
        Args:
            cate_preds (list): List of Variables, output of the category branch.
            kernel_preds (list): List of Variables, output of the kernel branch.
            seg_pred (list): List of Variables, output of the mask head stages.
            im_shape (Variables): [h, w] of the input images.
            scale_factor (Variables): [scale, scale] of the input images.
        Returns:
            seg_masks (Tensor): The predicted segmentation masks.
            cate_labels (Tensor): The predicted category label of each segmentation.
            cate_scores (Tensor): The predicted score of each segmentation.
"""
num_levels = len(cate_preds)
featmap_size = seg_pred.shape[-2:]
seg_masks_list = []
cate_labels_list = []
cate_scores_list = []
cate_preds = [cate_pred * 1.0 for cate_pred in cate_preds]
kernel_preds = [kernel_pred * 1.0 for kernel_pred in kernel_preds]
# Currently only supports batch size == 1
for idx in range(1):
cate_pred_list = [
paddle.reshape(
cate_preds[i][idx], shape=(-1, self.cate_out_channels))
for i in range(num_levels)
]
seg_pred_list = seg_pred
kernel_pred_list = [
paddle.reshape(
paddle.transpose(kernel_preds[i][idx], [1, 2, 0]),
shape=(-1, self.kernel_out_channels))
for i in range(num_levels)
]
cate_pred_list = paddle.concat(cate_pred_list, axis=0)
kernel_pred_list = paddle.concat(kernel_pred_list, axis=0)
seg_masks, cate_labels, cate_scores = self.get_seg_single(
cate_pred_list, seg_pred_list, kernel_pred_list, featmap_size,
im_shape[idx], scale_factor[idx][0])
bbox_num = cate_labels.shape[0:1]
return seg_masks, cate_labels, cate_scores, bbox_num
def get_seg_single(self, cate_preds, seg_preds, kernel_preds, featmap_size,
im_shape, scale_factor):
"""
The code of this function is based on:
https://github.com/WXinlong/SOLO/blob/master/mmdet/models/anchor_heads/solov2_head.py#L385
"""
h = paddle.cast(im_shape[0], 'int32')
w = paddle.cast(im_shape[1], 'int32')
upsampled_size_out = [featmap_size[0] * 4, featmap_size[1] * 4]
y = paddle.zeros(shape=cate_preds.shape, dtype='float32')
inds = paddle.where(cate_preds > self.score_threshold, cate_preds, y)
inds = paddle.nonzero(inds)
cate_preds = paddle.reshape(cate_preds, shape=[-1])
        # Append fake data to prevent empty tensors
ind_a = paddle.cast(paddle.shape(kernel_preds)[0:1], 'int64')
ind_b = paddle.zeros(shape=[1], dtype='int64')
inds_end = paddle.unsqueeze(paddle.concat([ind_a, ind_b]), 0)
inds = paddle.concat([inds, inds_end])
kernel_preds_end = paddle.ones(
shape=[1, self.kernel_out_channels], dtype='float32')
kernel_preds = paddle.concat([kernel_preds, kernel_preds_end])
cate_preds = paddle.concat(
[cate_preds, paddle.zeros(
shape=[1], dtype='float32')])
# cate_labels & kernel_preds
cate_labels = inds[:, 1]
kernel_preds = paddle.gather(kernel_preds, index=inds[:, 0])
cate_score_idx = paddle.add(inds[:, 0] * self.cate_out_channels,
cate_labels)
cate_scores = paddle.gather(cate_preds, index=cate_score_idx)
size_trans = np.power(self.seg_num_grids, 2)
strides = []
for _ind in range(len(self.segm_strides)):
strides.append(
paddle.full(
shape=[int(size_trans[_ind])],
fill_value=self.segm_strides[_ind],
dtype="int32"))
strides = paddle.concat(strides)
strides = paddle.concat(
[strides, paddle.zeros(
shape=[1], dtype='int32')])
strides = paddle.gather(strides, index=inds[:, 0])
# mask encoding.
kernel_preds = paddle.unsqueeze(kernel_preds, [2, 3])
seg_preds = F.conv2d(seg_preds, kernel_preds)
seg_preds = F.sigmoid(paddle.squeeze(seg_preds, [0]))
seg_masks = seg_preds > self.mask_threshold
seg_masks = paddle.cast(seg_masks, 'float32')
sum_masks = paddle.sum(seg_masks, axis=[1, 2])
y = paddle.zeros(shape=paddle.shape(sum_masks), dtype='float32')
keep = paddle.where(sum_masks > strides.cast(sum_masks.dtype), sum_masks, y)
keep = paddle.nonzero(keep)
keep = paddle.squeeze(keep, axis=[1])
        # Append fake data to prevent empty tensors
keep_other = paddle.concat(
[keep, paddle.cast(paddle.shape(sum_masks)[0:1] - 1, 'int64')])
keep_scores = paddle.concat(
[keep, paddle.cast(paddle.shape(sum_masks)[0:1], 'int64')])
cate_scores_end = paddle.zeros(shape=[1], dtype='float32')
cate_scores = paddle.concat([cate_scores, cate_scores_end])
seg_masks = paddle.gather(seg_masks, index=keep_other)
seg_preds = paddle.gather(seg_preds, index=keep_other)
sum_masks = paddle.gather(sum_masks, index=keep_other)
cate_labels = paddle.gather(cate_labels, index=keep_other)
cate_scores = paddle.gather(cate_scores, index=keep_scores)
# mask scoring.
seg_mul = paddle.cast(seg_preds * seg_masks, 'float32')
seg_scores = paddle.sum(seg_mul, axis=[1, 2]) / sum_masks
cate_scores *= seg_scores
# Matrix NMS
seg_preds, cate_scores, cate_labels = self.mask_nms(
seg_preds, seg_masks, cate_labels, cate_scores, sum_masks=sum_masks)
ori_shape = im_shape[:2] / scale_factor + 0.5
ori_shape = paddle.cast(ori_shape, 'int32')
seg_preds = F.interpolate(
paddle.unsqueeze(seg_preds, 0),
size=upsampled_size_out,
mode='bilinear',
align_corners=False,
align_mode=0)
seg_preds = paddle.slice(
seg_preds, axes=[2, 3], starts=[0, 0], ends=[h, w])
seg_masks = paddle.squeeze(
F.interpolate(
seg_preds,
size=ori_shape[:2],
mode='bilinear',
align_corners=False,
align_mode=0),
axis=[0])
seg_masks = paddle.cast(seg_masks > self.mask_threshold, 'uint8')
return seg_masks, cate_labels, cate_scores
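# Minimal usage sketch (illustrative shapes, not values from a real model):
# the dynamic-convolution step in get_seg_single applies each predicted
# kernel as a 1x1 conv over the shared mask features.
if __name__ == '__main__':
    emb, h, w, n_inst = 256, 64, 64, 5
    seg_feats = paddle.rand([1, emb, h, w])  # SOLOv2MaskHead output
    kernels = paddle.rand([n_inst, emb])     # one predicted kernel per instance
    masks = F.conv2d(seg_feats, paddle.unsqueeze(kernels, [2, 3]))
    print(masks.shape)                       # [1, n_inst, h, w]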
================================================
FILE: ppdet/modeling/heads/sparse_roi_head.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This code is referenced from: https://github.com/open-mmlab/mmdetection
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import paddle
from paddle import nn
from ppdet.core.workspace import register
from ppdet.modeling import initializer as init
from .roi_extractor import RoIAlign
from ..bbox_utils import delta2bbox_v2
from ..cls_utils import _get_class_default_kwargs
from ..layers import MultiHeadAttention
__all__ = ['SparseRoIHead', 'DIIHead', 'DynamicMaskHead']
class DynamicConv(nn.Layer):
def __init__(self,
in_channels=256,
feature_channels=64,
out_channels=None,
roi_resolution=7,
with_proj=True):
super(DynamicConv, self).__init__()
self.in_channels = in_channels
self.feature_channels = feature_channels
self.out_channels = out_channels if out_channels else in_channels
self.num_params_in = self.in_channels * self.feature_channels
self.num_params_out = self.out_channels * self.feature_channels
self.dynamic_layer = nn.Linear(self.in_channels,
self.num_params_in + self.num_params_out)
self.norm_in = nn.LayerNorm(self.feature_channels)
self.norm_out = nn.LayerNorm(self.out_channels)
self.activation = nn.ReLU()
self.with_proj = with_proj
if self.with_proj:
num_output = self.out_channels * roi_resolution**2
self.fc_layer = nn.Linear(num_output, self.out_channels)
self.fc_norm = nn.LayerNorm(self.out_channels)
def forward(self, param_feature, input_feature):
        # [N, C, H, W] -> [N, H*W, C]
        input_feature = input_feature.flatten(2).transpose([0, 2, 1])
parameters = self.dynamic_layer(param_feature)
param_in = parameters[:, :self.num_params_in].reshape(
[-1, self.in_channels, self.feature_channels])
param_out = parameters[:, -self.num_params_out:].reshape(
[-1, self.feature_channels, self.out_channels])
features = paddle.bmm(input_feature, param_in)
features = self.norm_in(features)
features = self.activation(features)
features = paddle.bmm(features, param_out)
features = self.norm_out(features)
features = self.activation(features)
if self.with_proj:
features = features.flatten(1)
features = self.fc_layer(features)
features = self.fc_norm(features)
features = self.activation(features)
return features
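# DynamicConv mixes each proposal embedding with its RoI feature map through
# two batched matmuls: the embedding is projected into per-proposal input and
# output filters (param_in, param_out), which are then applied to the
# flattened RoI pixels, yielding a proposal-conditioned transform of the RoI
# features.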
class FFN(nn.Layer):
def __init__(self,
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
ffn_drop=0.0,
add_identity=True):
super(FFN, self).__init__()
layers = []
in_channels = embed_dims
for _ in range(num_fcs - 1):
layers.append(
nn.Sequential(
nn.Linear(in_channels, feedforward_channels),
nn.ReLU(), nn.Dropout(ffn_drop)))
in_channels = feedforward_channels
layers.append(nn.Linear(feedforward_channels, embed_dims))
layers.append(nn.Dropout(ffn_drop))
self.layers = nn.Sequential(*layers)
self.add_identity = add_identity
def forward(self, x):
identity = x
out = self.layers(x)
if not self.add_identity:
return out
else:
return out + identity
@register
class DynamicMaskHead(nn.Layer):
__shared__ = ['num_classes', 'proposal_embedding_dim', 'norm_type']
def __init__(self,
num_classes=80,
proposal_embedding_dim=256,
dynamic_feature_channels=64,
roi_resolution=14,
num_convs=4,
conv_kernel_size=3,
conv_channels=256,
upsample_method='deconv',
upsample_scale_factor=2,
norm_type='bn'):
super(DynamicMaskHead, self).__init__()
self.d_model = proposal_embedding_dim
self.instance_interactive_conv = DynamicConv(
self.d_model,
dynamic_feature_channels,
roi_resolution=roi_resolution,
with_proj=False)
self.convs = nn.LayerList()
for i in range(num_convs):
self.convs.append(
nn.Sequential(
nn.Conv2D(
self.d_model if i == 0 else conv_channels,
conv_channels,
conv_kernel_size,
padding='same',
bias_attr=False),
nn.BatchNorm2D(conv_channels),
nn.ReLU()))
if norm_type == 'sync_bn':
self.convs = nn.SyncBatchNorm.convert_sync_batchnorm(self.convs)
self.upsample_method = upsample_method
if upsample_method is None:
self.upsample = None
elif upsample_method == 'deconv':
self.upsample = nn.Conv2DTranspose(
conv_channels if num_convs > 0 else self.d_model,
conv_channels,
upsample_scale_factor,
stride=upsample_scale_factor)
self.relu = nn.ReLU()
else:
self.upsample = nn.Upsample(None, upsample_scale_factor)
cls_in_channels = conv_channels if num_convs > 0 else self.d_model
cls_in_channels = conv_channels if upsample_method == 'deconv' else cls_in_channels
self.conv_cls = nn.Conv2D(cls_in_channels, num_classes, 1)
self._init_weights()
def _init_weights(self):
for p in self.parameters():
if p.dim() > 1:
init.xavier_uniform_(p)
init.constant_(self.conv_cls.bias, 0.)
def forward(self, roi_features, attn_features):
attn_features = attn_features.reshape([-1, self.d_model])
attn_features_iic = self.instance_interactive_conv(attn_features,
roi_features)
x = attn_features_iic.transpose([0, 2, 1]).reshape(roi_features.shape)
for conv in self.convs:
x = conv(x)
if self.upsample is not None:
x = self.upsample(x)
if self.upsample_method == 'deconv':
x = self.relu(x)
mask_pred = self.conv_cls(x)
return mask_pred
@register
class DIIHead(nn.Layer):
__shared__ = ['num_classes', 'proposal_embedding_dim']
def __init__(self,
num_classes=80,
proposal_embedding_dim=256,
feedforward_channels=2048,
dynamic_feature_channels=64,
roi_resolution=7,
num_attn_heads=8,
dropout=0.0,
num_ffn_fcs=2,
num_cls_fcs=1,
num_reg_fcs=3):
super(DIIHead, self).__init__()
self.num_classes = num_classes
self.d_model = proposal_embedding_dim
self.attention = MultiHeadAttention(self.d_model, num_attn_heads,
dropout)
self.attention_norm = nn.LayerNorm(self.d_model)
self.instance_interactive_conv = DynamicConv(
self.d_model,
dynamic_feature_channels,
roi_resolution=roi_resolution,
with_proj=True)
self.instance_interactive_conv_dropout = nn.Dropout(dropout)
self.instance_interactive_conv_norm = nn.LayerNorm(self.d_model)
self.ffn = FFN(self.d_model, feedforward_channels, num_ffn_fcs, dropout)
self.ffn_norm = nn.LayerNorm(self.d_model)
self.cls_fcs = nn.LayerList()
for _ in range(num_cls_fcs):
self.cls_fcs.append(
nn.Linear(
self.d_model, self.d_model, bias_attr=False))
self.cls_fcs.append(nn.LayerNorm(self.d_model))
self.cls_fcs.append(nn.ReLU())
self.fc_cls = nn.Linear(self.d_model, self.num_classes)
self.reg_fcs = nn.LayerList()
for _ in range(num_reg_fcs):
self.reg_fcs.append(
nn.Linear(
self.d_model, self.d_model, bias_attr=False))
self.reg_fcs.append(nn.LayerNorm(self.d_model))
self.reg_fcs.append(nn.ReLU())
self.fc_reg = nn.Linear(self.d_model, 4)
self._init_weights()
def _init_weights(self):
for p in self.parameters():
if p.dim() > 1:
init.xavier_uniform_(p)
bias_init = init.bias_init_with_prob(0.01)
init.constant_(self.fc_cls.bias, bias_init)
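    # Forward pipeline: self-attention over the proposal embeddings, then the
    # DynamicConv instance interaction with the pooled RoI features, then an
    # FFN; separate FC stacks branch off for class logits and box deltas.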
def forward(self, roi_features, proposal_features):
N, num_proposals = proposal_features.shape[:2]
proposal_features = proposal_features + self.attention(
proposal_features)
attn_features = self.attention_norm(proposal_features)
proposal_features = attn_features.reshape([-1, self.d_model])
proposal_features_iic = self.instance_interactive_conv(
proposal_features, roi_features)
proposal_features = proposal_features + self.instance_interactive_conv_dropout(
proposal_features_iic)
obj_features = self.instance_interactive_conv_norm(proposal_features)
obj_features = self.ffn(obj_features)
obj_features = self.ffn_norm(obj_features)
cls_feature = obj_features.clone()
reg_feature = obj_features.clone()
for cls_layer in self.cls_fcs:
cls_feature = cls_layer(cls_feature)
class_logits = self.fc_cls(cls_feature)
for reg_layer in self.reg_fcs:
reg_feature = reg_layer(reg_feature)
bbox_deltas = self.fc_reg(reg_feature)
class_logits = class_logits.reshape(
[N, num_proposals, self.num_classes])
bbox_deltas = bbox_deltas.reshape([N, num_proposals, 4])
obj_features = obj_features.reshape([N, num_proposals, self.d_model])
return class_logits, bbox_deltas, obj_features, attn_features
@staticmethod
def refine_bboxes(proposal_bboxes, bbox_deltas):
pred_bboxes = delta2bbox_v2(
bbox_deltas.reshape([-1, 4]),
proposal_bboxes.reshape([-1, 4]),
delta_mean=[0.0, 0.0, 0.0, 0.0],
delta_std=[0.5, 0.5, 1.0, 1.0],
ctr_clip=None)
return pred_bboxes.reshape(proposal_bboxes.shape)
@register
class SparseRoIHead(nn.Layer):
__inject__ = ['bbox_head', 'mask_head', 'loss_func']
def __init__(self,
num_stages=6,
bbox_roi_extractor=_get_class_default_kwargs(RoIAlign),
mask_roi_extractor=_get_class_default_kwargs(RoIAlign),
bbox_head='DIIHead',
mask_head='DynamicMaskHead',
loss_func='QueryInstLoss'):
super(SparseRoIHead, self).__init__()
self.num_stages = num_stages
self.bbox_roi_extractor = bbox_roi_extractor
self.mask_roi_extractor = mask_roi_extractor
if isinstance(bbox_roi_extractor, dict):
self.bbox_roi_extractor = RoIAlign(**bbox_roi_extractor)
if isinstance(mask_roi_extractor, dict):
self.mask_roi_extractor = RoIAlign(**mask_roi_extractor)
self.bbox_heads = nn.LayerList(
[copy.deepcopy(bbox_head) for _ in range(num_stages)])
self.mask_heads = nn.LayerList(
[copy.deepcopy(mask_head) for _ in range(num_stages)])
self.loss_helper = loss_func
@classmethod
def from_config(cls, cfg, input_shape):
bbox_roi_extractor = cfg['bbox_roi_extractor']
mask_roi_extractor = cfg['mask_roi_extractor']
assert isinstance(bbox_roi_extractor, dict)
assert isinstance(mask_roi_extractor, dict)
kwargs = RoIAlign.from_config(cfg, input_shape)
bbox_roi_extractor.update(kwargs)
mask_roi_extractor.update(kwargs)
return {
'bbox_roi_extractor': bbox_roi_extractor,
'mask_roi_extractor': mask_roi_extractor
}
@staticmethod
def get_roi_features(features, bboxes, roi_extractor):
rois_list = [
bboxes[i] for i in range(len(bboxes)) if len(bboxes[i]) > 0
]
rois_num = paddle.to_tensor(
[len(bboxes[i]) for i in range(len(bboxes))], dtype='int32')
pos_ids = paddle.cast(rois_num, dtype='bool')
if pos_ids.sum() != len(rois_num):
rois_num = rois_num[pos_ids]
features = [features[i][pos_ids] for i in range(len(features))]
return roi_extractor(features, rois_list, rois_num)
def _forward_train(self, body_feats, pro_bboxes, pro_feats, targets):
all_stage_losses = {}
for stage in range(self.num_stages):
bbox_head = self.bbox_heads[stage]
mask_head = self.mask_heads[stage]
roi_feats = self.get_roi_features(body_feats, pro_bboxes,
self.bbox_roi_extractor)
class_logits, bbox_deltas, pro_feats, attn_feats = bbox_head(
roi_feats, pro_feats)
bbox_pred = self.bbox_heads[stage].refine_bboxes(pro_bboxes,
bbox_deltas)
indices = self.loss_helper.matcher({
'pred_logits': class_logits.detach(),
'pred_boxes': bbox_pred.detach()
}, targets)
avg_factor = paddle.to_tensor(
[sum(len(tgt['labels']) for tgt in targets)], dtype='float32')
if paddle.distributed.get_world_size() > 1:
paddle.distributed.all_reduce(avg_factor)
avg_factor /= paddle.distributed.get_world_size()
avg_factor = paddle.clip(avg_factor, min=1.)
loss_classes = self.loss_helper.loss_classes(class_logits, targets,
indices, avg_factor)
if sum(len(v['labels']) for v in targets) == 0:
loss_bboxes = {
'loss_bbox': paddle.to_tensor([0.]),
'loss_giou': paddle.to_tensor([0.])
}
loss_masks = {'loss_mask': paddle.to_tensor([0.])}
else:
loss_bboxes = self.loss_helper.loss_bboxes(bbox_pred, targets,
indices, avg_factor)
pos_attn_feats = paddle.concat([
paddle.gather(
src, src_idx, axis=0)
for src, (src_idx, _) in zip(attn_feats, indices)
])
pos_bbox_pred = [
paddle.gather(
src, src_idx, axis=0)
for src, (src_idx, _) in zip(bbox_pred.detach(), indices)
]
pos_roi_feats = self.get_roi_features(body_feats, pos_bbox_pred,
self.mask_roi_extractor)
mask_logits = mask_head(pos_roi_feats, pos_attn_feats)
loss_masks = self.loss_helper.loss_masks(
pos_bbox_pred, mask_logits, targets, indices, avg_factor)
for loss in [loss_classes, loss_bboxes, loss_masks]:
for key in loss.keys():
all_stage_losses[f'stage{stage}_{key}'] = loss[key]
pro_bboxes = bbox_pred.detach()
return all_stage_losses
def _forward_test(self, body_feats, pro_bboxes, pro_feats):
for stage in range(self.num_stages):
roi_feats = self.get_roi_features(body_feats, pro_bboxes,
self.bbox_roi_extractor)
class_logits, bbox_deltas, pro_feats, attn_feats = self.bbox_heads[
stage](roi_feats, pro_feats)
bbox_pred = self.bbox_heads[stage].refine_bboxes(pro_bboxes,
bbox_deltas)
pro_bboxes = bbox_pred.detach()
roi_feats = self.get_roi_features(body_feats, bbox_pred,
self.mask_roi_extractor)
mask_logits = self.mask_heads[stage](roi_feats, attn_feats)
return {
'class_logits': class_logits,
'bbox_pred': bbox_pred,
'mask_logits': mask_logits
}
def forward(self,
body_features,
proposal_bboxes,
proposal_features,
targets=None):
if self.training:
return self._forward_train(body_features, proposal_bboxes,
proposal_features, targets)
else:
return self._forward_test(body_features, proposal_bboxes,
proposal_features)
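# Minimal shape sketch for DynamicConv (illustrative values: 3 proposals,
# 256-d embeddings, 7x7 RoI features):
if __name__ == '__main__':
    dconv = DynamicConv(in_channels=256, feature_channels=64, roi_resolution=7)
    param_feat = paddle.rand([3, 256])        # one embedding per proposal
    roi_feat = paddle.rand([3, 256, 7, 7])    # pooled RoI features
    print(dconv(param_feat, roi_feat).shape)  # [3, 256] with with_proj=True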
================================================
FILE: ppdet/modeling/heads/sparsercnn_head.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on https://github.com/PeizeSun/SparseR-CNN/blob/main/projects/SparseRCNN/sparsercnn/head.py
The copyright of PeizeSun/SparseR-CNN is as follows:
MIT License [see LICENSE for details]
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import copy
import paddle
import paddle.nn as nn
from ppdet.core.workspace import register
from ppdet.modeling.heads.roi_extractor import RoIAlign
from ppdet.modeling.bbox_utils import delta2bbox
from .. import initializer as init
_DEFAULT_SCALE_CLAMP = math.log(100000. / 16)
class DynamicConv(nn.Layer):
def __init__(
self,
head_hidden_dim,
head_dim_dynamic,
head_num_dynamic, ):
super().__init__()
self.hidden_dim = head_hidden_dim
self.dim_dynamic = head_dim_dynamic
self.num_dynamic = head_num_dynamic
self.num_params = self.hidden_dim * self.dim_dynamic
self.dynamic_layer = nn.Linear(self.hidden_dim,
self.num_dynamic * self.num_params)
self.norm1 = nn.LayerNorm(self.dim_dynamic)
self.norm2 = nn.LayerNorm(self.hidden_dim)
self.activation = nn.ReLU()
pooler_resolution = 7
num_output = self.hidden_dim * pooler_resolution**2
self.out_layer = nn.Linear(num_output, self.hidden_dim)
self.norm3 = nn.LayerNorm(self.hidden_dim)
def forward(self, pro_features, roi_features):
'''
pro_features: (1, N * nr_boxes, self.d_model)
roi_features: (49, N * nr_boxes, self.d_model)
'''
features = roi_features.transpose(perm=[1, 0, 2])
parameters = self.dynamic_layer(pro_features).transpose(perm=[1, 0, 2])
param1 = parameters[:, :, :self.num_params].reshape(
[-1, self.hidden_dim, self.dim_dynamic])
param2 = parameters[:, :, self.num_params:].reshape(
[-1, self.dim_dynamic, self.hidden_dim])
features = paddle.bmm(features, param1)
features = self.norm1(features)
features = self.activation(features)
features = paddle.bmm(features, param2)
features = self.norm2(features)
features = self.activation(features)
features = features.flatten(1)
features = self.out_layer(features)
features = self.norm3(features)
features = self.activation(features)
return features
class RCNNHead(nn.Layer):
def __init__(
self,
d_model,
num_classes,
dim_feedforward,
nhead,
dropout,
head_cls,
head_reg,
head_dim_dynamic,
head_num_dynamic,
scale_clamp: float=_DEFAULT_SCALE_CLAMP,
bbox_weights=(2.0, 2.0, 1.0, 1.0), ):
super().__init__()
self.d_model = d_model
# dynamic.
self.self_attn = nn.MultiHeadAttention(d_model, nhead, dropout=dropout)
self.inst_interact = DynamicConv(d_model, head_dim_dynamic,
head_num_dynamic)
self.linear1 = nn.Linear(d_model, dim_feedforward)
self.dropout = nn.Dropout(dropout)
self.linear2 = nn.Linear(dim_feedforward, d_model)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.norm3 = nn.LayerNorm(d_model)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
self.dropout3 = nn.Dropout(dropout)
self.activation = nn.ReLU()
# cls.
num_cls = head_cls
cls_module = list()
for _ in range(num_cls):
cls_module.append(nn.Linear(d_model, d_model, bias_attr=False))
cls_module.append(nn.LayerNorm(d_model))
cls_module.append(nn.ReLU())
self.cls_module = nn.LayerList(cls_module)
# reg.
num_reg = head_reg
reg_module = list()
for _ in range(num_reg):
reg_module.append(nn.Linear(d_model, d_model, bias_attr=False))
reg_module.append(nn.LayerNorm(d_model))
reg_module.append(nn.ReLU())
self.reg_module = nn.LayerList(reg_module)
# pred.
self.class_logits = nn.Linear(d_model, num_classes)
self.bboxes_delta = nn.Linear(d_model, 4)
self.scale_clamp = scale_clamp
self.bbox_weights = bbox_weights
def forward(self, features, bboxes, pro_features, pooler):
"""
:param bboxes: (N, nr_boxes, 4)
:param pro_features: (N, nr_boxes, d_model)
"""
N, nr_boxes = bboxes.shape[:2]
proposal_boxes = list()
for b in range(N):
proposal_boxes.append(bboxes[b])
roi_num = paddle.full([N], nr_boxes).astype("int32")
roi_features = pooler(features, proposal_boxes, roi_num)
roi_features = roi_features.reshape(
[N * nr_boxes, self.d_model, -1]).transpose(perm=[2, 0, 1])
# self_att.
pro_features = pro_features.reshape([N, nr_boxes, self.d_model])
pro_features2 = self.self_attn(
pro_features, pro_features, value=pro_features)
pro_features = pro_features.transpose(perm=[1, 0, 2]) + self.dropout1(
pro_features2.transpose(perm=[1, 0, 2]))
pro_features = self.norm1(pro_features)
# inst_interact.
pro_features = pro_features.reshape(
[nr_boxes, N, self.d_model]).transpose(perm=[1, 0, 2]).reshape(
[1, N * nr_boxes, self.d_model])
pro_features2 = self.inst_interact(pro_features, roi_features)
pro_features = pro_features + self.dropout2(pro_features2)
obj_features = self.norm2(pro_features)
# obj_feature.
obj_features2 = self.linear2(
self.dropout(self.activation(self.linear1(obj_features))))
obj_features = obj_features + self.dropout3(obj_features2)
obj_features = self.norm3(obj_features)
fc_feature = obj_features.transpose(perm=[1, 0, 2]).reshape(
[N * nr_boxes, -1])
cls_feature = fc_feature.clone()
reg_feature = fc_feature.clone()
for cls_layer in self.cls_module:
cls_feature = cls_layer(cls_feature)
for reg_layer in self.reg_module:
reg_feature = reg_layer(reg_feature)
class_logits = self.class_logits(cls_feature)
bboxes_deltas = self.bboxes_delta(reg_feature)
pred_bboxes = delta2bbox(bboxes_deltas,
bboxes.reshape([-1, 4]), self.bbox_weights)
return class_logits.reshape([N, nr_boxes, -1]), pred_bboxes.reshape(
[N, nr_boxes, -1]), obj_features
@register
class SparseRCNNHead(nn.Layer):
'''
    SparseRCNNHead
    Args:
        roi_input_shape (list[ShapeSpec]): The output shapes of the FPN.
        num_classes (int): Number of classes.
        head_hidden_dim (int): Hidden dimension of MultiHeadAttention.
        head_dim_feedforward (int): Feed-forward dimension of MultiHeadAttention.
        nhead (int): Number of attention heads in MultiHeadAttention.
        head_dropout (float): Dropout probability.
        head_cls (int): Number of layers in the classification head.
        head_reg (int): Number of layers in the regression head.
        head_dim_dynamic (int): Intermediate channel dimension of DynamicConv.
        head_num_dynamic (int): Number of dynamic parameter groups in DynamicConv.
        head_num_heads (int): Number of stacked RCNNHead stages.
        deep_supervision (bool): Whether to supervise the intermediate results.
        num_proposals (int): The number of proposal boxes and features.
'''
__inject__ = ['loss_func']
__shared__ = ['num_classes']
def __init__(
self,
head_hidden_dim,
head_dim_feedforward,
nhead,
head_dropout,
head_cls,
head_reg,
head_dim_dynamic,
head_num_dynamic,
head_num_heads,
deep_supervision,
num_proposals,
num_classes=80,
loss_func="SparseRCNNLoss",
roi_input_shape=None, ):
super().__init__()
assert head_num_heads > 0, \
            f'At least one RoI head is required, but got {head_num_heads}.'
# Build RoI.
box_pooler = self._init_box_pooler(roi_input_shape)
self.box_pooler = box_pooler
# Build heads.
rcnn_head = RCNNHead(
head_hidden_dim,
num_classes,
head_dim_feedforward,
nhead,
head_dropout,
head_cls,
head_reg,
head_dim_dynamic,
head_num_dynamic, )
self.head_series = nn.LayerList(
[copy.deepcopy(rcnn_head) for i in range(head_num_heads)])
self.return_intermediate = deep_supervision
self.num_classes = num_classes
# build init proposal
self.init_proposal_features = nn.Embedding(num_proposals,
head_hidden_dim)
self.init_proposal_boxes = nn.Embedding(num_proposals, 4)
self.lossfunc = loss_func
# Init parameters.
init.reset_initialized_parameter(self)
self._reset_parameters()
def _reset_parameters(self):
# init all parameters.
prior_prob = 0.01
bias_value = -math.log((1 - prior_prob) / prior_prob)
for m in self.sublayers():
if isinstance(m, nn.Linear):
init.xavier_normal_(m.weight, reverse=True)
elif not isinstance(m, nn.Embedding) and hasattr(
m, "weight") and m.weight.dim() > 1:
init.xavier_normal_(m.weight, reverse=False)
if hasattr(m, "bias") and m.bias is not None and m.bias.shape[
-1] == self.num_classes:
init.constant_(m.bias, bias_value)
init_bboxes = paddle.empty_like(self.init_proposal_boxes.weight)
init_bboxes[:, :2] = 0.5
init_bboxes[:, 2:] = 1.0
self.init_proposal_boxes.weight.set_value(init_bboxes)
@staticmethod
def _init_box_pooler(input_shape):
pooler_resolution = 7
sampling_ratio = 2
if input_shape is not None:
pooler_scales = tuple(1.0 / input_shape[k].stride
for k in range(len(input_shape)))
in_channels = [
input_shape[f].channels for f in range(len(input_shape))
]
end_level = len(input_shape) - 1
# Check all channel counts are equal
assert len(set(in_channels)) == 1, in_channels
else:
pooler_scales = [1.0 / 4.0, 1.0 / 8.0, 1.0 / 16.0, 1.0 / 32.0]
end_level = 3
aligned = True
if paddle.device.is_compiled_with_custom_device('npu'):
aligned = False
box_pooler = RoIAlign(
resolution=pooler_resolution,
spatial_scale=pooler_scales,
sampling_ratio=sampling_ratio,
end_level=end_level,
aligned=aligned)
return box_pooler
def forward(self, features, input_whwh):
bs = len(features[0])
bboxes = box_cxcywh_to_xyxy(self.init_proposal_boxes.weight.clone(
)).unsqueeze(0)
bboxes = bboxes * input_whwh.unsqueeze(-2)
init_features = self.init_proposal_features.weight.unsqueeze(0).tile(
[1, bs, 1])
proposal_features = init_features.clone()
inter_class_logits = []
inter_pred_bboxes = []
for stage, rcnn_head in enumerate(self.head_series):
class_logits, pred_bboxes, proposal_features = rcnn_head(
features, bboxes, proposal_features, self.box_pooler)
if self.return_intermediate or stage == len(self.head_series) - 1:
inter_class_logits.append(class_logits)
inter_pred_bboxes.append(pred_bboxes)
bboxes = pred_bboxes.detach()
output = {
'pred_logits': inter_class_logits[-1],
'pred_boxes': inter_pred_bboxes[-1]
}
if self.return_intermediate:
output['aux_outputs'] = [{
'pred_logits': a,
'pred_boxes': b
} for a, b in zip(inter_class_logits[:-1], inter_pred_bboxes[:-1])]
return output
def get_loss(self, outputs, targets):
losses = self.lossfunc(outputs, targets)
weight_dict = self.lossfunc.weight_dict
for k in losses.keys():
if k in weight_dict:
losses[k] *= weight_dict[k]
return losses
def box_cxcywh_to_xyxy(x):
x_c, y_c, w, h = x.unbind(-1)
b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]
return paddle.stack(b, axis=-1)
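# Minimal sketch: the learned init_proposal_boxes default to (0.5, 0.5, 1, 1)
# in cxcywh form, i.e. one whole-image box before scaling by input_whwh.
if __name__ == '__main__':
    boxes = paddle.to_tensor([[0.5, 0.5, 1.0, 1.0]])
    print(box_cxcywh_to_xyxy(boxes))  # [[0., 0., 1., 1.]]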
================================================
FILE: ppdet/modeling/heads/ssd_head.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from paddle.regularizer import L2Decay
from paddle import ParamAttr
from ..layers import AnchorGeneratorSSD
from ..cls_utils import _get_class_default_kwargs
class SepConvLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size=3,
padding=1,
conv_decay=0.):
super(SepConvLayer, self).__init__()
self.dw_conv = nn.Conv2D(
in_channels=in_channels,
out_channels=in_channels,
kernel_size=kernel_size,
stride=1,
padding=padding,
groups=in_channels,
weight_attr=ParamAttr(regularizer=L2Decay(conv_decay)),
bias_attr=False)
self.bn = nn.BatchNorm2D(
in_channels,
weight_attr=ParamAttr(regularizer=L2Decay(0.)),
bias_attr=ParamAttr(regularizer=L2Decay(0.)))
self.pw_conv = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=1,
padding=0,
weight_attr=ParamAttr(regularizer=L2Decay(conv_decay)),
bias_attr=False)
def forward(self, x):
x = self.dw_conv(x)
x = F.relu6(self.bn(x))
x = self.pw_conv(x)
return x
class SSDExtraHead(nn.Layer):
def __init__(self,
in_channels=256,
out_channels=([256, 512], [256, 512], [128, 256], [128, 256],
[128, 256]),
strides=(2, 2, 2, 1, 1),
paddings=(1, 1, 1, 0, 0)):
super(SSDExtraHead, self).__init__()
self.convs = nn.LayerList()
for out_channel, stride, padding in zip(out_channels, strides,
paddings):
self.convs.append(
self._make_layers(in_channels, out_channel[0], out_channel[1],
stride, padding))
in_channels = out_channel[-1]
def _make_layers(self, c_in, c_hidden, c_out, stride_3x3, padding_3x3):
return nn.Sequential(
nn.Conv2D(c_in, c_hidden, 1),
nn.ReLU(),
nn.Conv2D(c_hidden, c_out, 3, stride_3x3, padding_3x3), nn.ReLU())
def forward(self, x):
out = [x]
for conv_layer in self.convs:
out.append(conv_layer(out[-1]))
return out
@register
class SSDHead(nn.Layer):
"""
SSDHead
Args:
num_classes (int): Number of classes
in_channels (list): Number of channels per input feature
anchor_generator (dict): Configuration of 'AnchorGeneratorSSD' instance
kernel_size (int): Conv kernel size
padding (int): Conv padding
use_sepconv (bool): Use SepConvLayer if true
conv_decay (float): Conv regularization coeff
loss (object): 'SSDLoss' instance
        use_extra_head (bool): Set to True when using ResNet34 as the backbone
"""
__shared__ = ['num_classes']
__inject__ = ['anchor_generator', 'loss']
def __init__(self,
num_classes=80,
in_channels=(512, 1024, 512, 256, 256, 256),
anchor_generator=_get_class_default_kwargs(AnchorGeneratorSSD),
kernel_size=3,
padding=1,
use_sepconv=False,
conv_decay=0.,
loss='SSDLoss',
use_extra_head=False):
super(SSDHead, self).__init__()
# add background class
self.num_classes = num_classes + 1
self.in_channels = in_channels
self.anchor_generator = anchor_generator
self.loss = loss
self.use_extra_head = use_extra_head
if self.use_extra_head:
self.ssd_extra_head = SSDExtraHead()
self.in_channels = [256, 512, 512, 256, 256, 256]
if isinstance(anchor_generator, dict):
self.anchor_generator = AnchorGeneratorSSD(**anchor_generator)
self.num_priors = self.anchor_generator.num_priors
self.box_convs = []
self.score_convs = []
for i, num_prior in enumerate(self.num_priors):
box_conv_name = "boxes{}".format(i)
if not use_sepconv:
box_conv = self.add_sublayer(
box_conv_name,
nn.Conv2D(
in_channels=self.in_channels[i],
out_channels=num_prior * 4,
kernel_size=kernel_size,
padding=padding))
else:
box_conv = self.add_sublayer(
box_conv_name,
SepConvLayer(
in_channels=self.in_channels[i],
out_channels=num_prior * 4,
kernel_size=kernel_size,
padding=padding,
conv_decay=conv_decay))
self.box_convs.append(box_conv)
score_conv_name = "scores{}".format(i)
if not use_sepconv:
score_conv = self.add_sublayer(
score_conv_name,
nn.Conv2D(
in_channels=self.in_channels[i],
out_channels=num_prior * self.num_classes,
kernel_size=kernel_size,
padding=padding))
else:
score_conv = self.add_sublayer(
score_conv_name,
SepConvLayer(
in_channels=self.in_channels[i],
out_channels=num_prior * self.num_classes,
kernel_size=kernel_size,
padding=padding,
conv_decay=conv_decay))
self.score_convs.append(score_conv)
@classmethod
def from_config(cls, cfg, input_shape):
return {'in_channels': [i.channels for i in input_shape], }
def forward(self, feats, image, gt_bbox=None, gt_class=None):
if self.use_extra_head:
assert len(feats) == 1, \
("If you set use_extra_head=True, backbone feature "
"list length should be 1.")
feats = self.ssd_extra_head(feats[0])
box_preds = []
cls_scores = []
for feat, box_conv, score_conv in zip(feats, self.box_convs,
self.score_convs):
box_pred = box_conv(feat)
box_pred = paddle.transpose(box_pred, [0, 2, 3, 1])
box_pred = paddle.reshape(box_pred, [0, -1, 4])
box_preds.append(box_pred)
cls_score = score_conv(feat)
cls_score = paddle.transpose(cls_score, [0, 2, 3, 1])
cls_score = paddle.reshape(cls_score, [0, -1, self.num_classes])
cls_scores.append(cls_score)
prior_boxes = self.anchor_generator(feats, image)
if self.training:
return self.get_loss(box_preds, cls_scores, gt_bbox, gt_class,
prior_boxes)
else:
return (box_preds, cls_scores), prior_boxes
def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes):
return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes)
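# Minimal shape sketch for SepConvLayer (depthwise 3x3 + BN/ReLU6 + pointwise
# 1x1), using illustrative channel sizes:
if __name__ == '__main__':
    sep = SepConvLayer(in_channels=32, out_channels=16)
    x = paddle.rand([2, 32, 38, 38])
    print(sep(x).shape)  # [2, 16, 38, 38]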
================================================
FILE: ppdet/modeling/heads/tood_head.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Constant
from ppdet.core.workspace import register
from ..initializer import normal_, constant_, bias_init_with_prob
from ppdet.modeling.bbox_utils import bbox_center, batch_distance2bbox
from ..losses import GIoULoss
from ppdet.modeling.layers import ConvNormLayer
from ppdet.modeling.ops import get_static_shape
from ppdet.modeling.assigners.utils import generate_anchors_for_grid_cell
class ScaleReg(nn.Layer):
"""
Parameter for scaling the regression outputs.
"""
def __init__(self, init_scale=1.):
super(ScaleReg, self).__init__()
self.scale_reg = self.create_parameter(
shape=[1],
attr=ParamAttr(initializer=Constant(value=init_scale)),
dtype="float32")
def forward(self, inputs):
out = inputs * self.scale_reg
return out
class TaskDecomposition(nn.Layer):
"""This code is based on
https://github.com/fcjian/TOOD/blob/master/mmdet/models/dense_heads/tood_head.py
"""
def __init__(
self,
feat_channels,
stacked_convs,
la_down_rate=8,
norm_type='gn',
norm_groups=32, ):
super(TaskDecomposition, self).__init__()
self.feat_channels = feat_channels
self.stacked_convs = stacked_convs
self.norm_type = norm_type
self.norm_groups = norm_groups
self.in_channels = self.feat_channels * self.stacked_convs
self.la_conv1 = nn.Conv2D(self.in_channels,
self.in_channels // la_down_rate, 1)
self.la_conv2 = nn.Conv2D(self.in_channels // la_down_rate,
self.stacked_convs, 1)
self.reduction_conv = ConvNormLayer(
self.in_channels,
self.feat_channels,
filter_size=1,
stride=1,
norm_type=self.norm_type,
norm_groups=self.norm_groups)
self._init_weights()
def _init_weights(self):
normal_(self.la_conv1.weight, std=0.001)
normal_(self.la_conv2.weight, std=0.001)
def forward(self, feat, avg_feat):
feat_shape = get_static_shape(feat)
b = feat_shape[0:1]
h = feat_shape[2:3]
w = feat_shape[3:4]
weight = F.relu(self.la_conv1(avg_feat))
weight = F.sigmoid(self.la_conv2(weight)).unsqueeze(-1)
feat = paddle.reshape(
feat, [b, self.stacked_convs, self.feat_channels, h, w]) * weight
feat = self.reduction_conv(feat.flatten(1, 2))
feat = F.relu(feat)
return feat
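# Shape walkthrough for TaskDecomposition.forward (illustrative):
#   feat:     [b, stacked_convs * C, h, w]  -- concatenated tower features
#   avg_feat: [b, stacked_convs * C, 1, 1]  -- global average pool of feat
#   weight:   [b, stacked_convs, 1, 1] -> unsqueeze(-1) -> [b, stacked_convs, 1, 1, 1]
# feat is reshaped to [b, stacked_convs, C, h, w], reweighted per tower layer,
# flattened back to [b, stacked_convs * C, h, w], then reduced to [b, C, h, w]
# by the 1x1 reduction conv.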
@register
class TOODHead(nn.Layer):
"""This code is based on
https://github.com/fcjian/TOOD/blob/master/mmdet/models/dense_heads/tood_head.py
"""
__inject__ = ['nms', 'static_assigner', 'assigner']
__shared__ = ['num_classes']
def __init__(self,
num_classes=80,
feat_channels=256,
stacked_convs=6,
fpn_strides=(8, 16, 32, 64, 128),
grid_cell_scale=8,
grid_cell_offset=0.5,
norm_type='gn',
norm_groups=32,
static_assigner_epoch=4,
use_align_head=True,
loss_weight={
'class': 1.0,
'bbox': 1.0,
'iou': 2.0,
},
nms='MultiClassNMS',
static_assigner='ATSSAssigner',
assigner='TaskAlignedAssigner'):
super(TOODHead, self).__init__()
self.num_classes = num_classes
self.feat_channels = feat_channels
self.stacked_convs = stacked_convs
self.fpn_strides = fpn_strides
self.grid_cell_scale = grid_cell_scale
self.grid_cell_offset = grid_cell_offset
self.static_assigner_epoch = static_assigner_epoch
self.use_align_head = use_align_head
self.nms = nms
self.static_assigner = static_assigner
self.assigner = assigner
self.loss_weight = loss_weight
self.giou_loss = GIoULoss()
self.inter_convs = nn.LayerList()
for i in range(self.stacked_convs):
self.inter_convs.append(
ConvNormLayer(
self.feat_channels,
self.feat_channels,
filter_size=3,
stride=1,
norm_type=norm_type,
norm_groups=norm_groups))
self.cls_decomp = TaskDecomposition(
self.feat_channels,
self.stacked_convs,
self.stacked_convs * 8,
norm_type=norm_type,
norm_groups=norm_groups)
self.reg_decomp = TaskDecomposition(
self.feat_channels,
self.stacked_convs,
self.stacked_convs * 8,
norm_type=norm_type,
norm_groups=norm_groups)
self.tood_cls = nn.Conv2D(
self.feat_channels, self.num_classes, 3, padding=1)
self.tood_reg = nn.Conv2D(self.feat_channels, 4, 3, padding=1)
if self.use_align_head:
self.cls_prob_conv1 = nn.Conv2D(self.feat_channels *
self.stacked_convs,
self.feat_channels // 4, 1)
self.cls_prob_conv2 = nn.Conv2D(
self.feat_channels // 4, 1, 3, padding=1)
self.reg_offset_conv1 = nn.Conv2D(self.feat_channels *
self.stacked_convs,
self.feat_channels // 4, 1)
self.reg_offset_conv2 = nn.Conv2D(
self.feat_channels // 4, 4 * 2, 3, padding=1)
self.scales_regs = nn.LayerList([ScaleReg() for _ in self.fpn_strides])
self._init_weights()
@classmethod
def from_config(cls, cfg, input_shape):
return {
'feat_channels': input_shape[0].channels,
'fpn_strides': [i.stride for i in input_shape],
}
def _init_weights(self):
bias_cls = bias_init_with_prob(0.01)
normal_(self.tood_cls.weight, std=0.01)
constant_(self.tood_cls.bias, bias_cls)
normal_(self.tood_reg.weight, std=0.01)
if self.use_align_head:
normal_(self.cls_prob_conv1.weight, std=0.01)
normal_(self.cls_prob_conv2.weight, std=0.01)
constant_(self.cls_prob_conv2.bias, bias_cls)
normal_(self.reg_offset_conv1.weight, std=0.001)
constant_(self.reg_offset_conv2.weight)
constant_(self.reg_offset_conv2.bias)
def _reg_grid_sample(self, feat, offset, anchor_points):
feat_shape = get_static_shape(feat)
b = feat_shape[0:1]
h = feat_shape[2:3]
w = feat_shape[3:4]
feat = paddle.reshape(feat, [-1, 1, h, w])
offset = paddle.reshape(offset, [-1, 2, h, w]).transpose([0, 2, 3, 1])
grid_shape = paddle.concat([w, h]).astype('float32')
grid = (offset + anchor_points) / grid_shape
grid = 2 * grid.clip(0., 1.) - 1
feat = F.grid_sample(feat, grid)
feat = paddle.reshape(feat, [b, -1, h, w])
return feat
def forward(self, feats):
        assert len(feats) == len(self.fpn_strides), \
            "The size of feats is not equal to the size of fpn_strides"
anchors, anchor_points, num_anchors_list, stride_tensor =\
generate_anchors_for_grid_cell(
feats, self.fpn_strides, self.grid_cell_scale,
self.grid_cell_offset)
anchor_centers_split = paddle.split(anchor_points / stride_tensor,
num_anchors_list)
cls_score_list, bbox_pred_list = [], []
for feat, scale_reg, anchor_centers, stride in zip(
feats, self.scales_regs, anchor_centers_split,
self.fpn_strides):
b, _, h, w = get_static_shape(feat)
inter_feats = []
for inter_conv in self.inter_convs:
feat = F.relu(inter_conv(feat))
inter_feats.append(feat)
feat = paddle.concat(inter_feats, axis=1)
# task decomposition
avg_feat = F.adaptive_avg_pool2d(feat, (1, 1))
cls_feat = self.cls_decomp(feat, avg_feat)
reg_feat = self.reg_decomp(feat, avg_feat)
# cls prediction and alignment
cls_logits = self.tood_cls(cls_feat)
if self.use_align_head:
cls_prob = F.relu(self.cls_prob_conv1(feat))
cls_prob = F.sigmoid(self.cls_prob_conv2(cls_prob))
cls_score = (F.sigmoid(cls_logits) * cls_prob).sqrt()
else:
cls_score = F.sigmoid(cls_logits)
cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1]))
# reg prediction and alignment
reg_dist = scale_reg(self.tood_reg(reg_feat).exp())
reg_dist = reg_dist.flatten(2).transpose([0, 2, 1])
reg_bbox = batch_distance2bbox(
anchor_centers.unsqueeze(0), reg_dist)
if self.use_align_head:
reg_offset = F.relu(self.reg_offset_conv1(feat))
reg_offset = self.reg_offset_conv2(reg_offset)
reg_bbox = reg_bbox.transpose([0, 2, 1]).reshape([b, 4, h, w])
anchor_centers = anchor_centers.reshape([1, h, w, 2])
bbox_pred = self._reg_grid_sample(reg_bbox, reg_offset,
anchor_centers)
bbox_pred = bbox_pred.flatten(2).transpose([0, 2, 1])
else:
bbox_pred = reg_bbox
if not self.training:
bbox_pred *= stride
bbox_pred_list.append(bbox_pred)
cls_score_list = paddle.concat(cls_score_list, axis=1)
bbox_pred_list = paddle.concat(bbox_pred_list, axis=1)
return cls_score_list, bbox_pred_list, anchors, num_anchors_list, stride_tensor
@staticmethod
def _focal_loss(score, label, alpha=0.25, gamma=2.0):
weight = (score - label).pow(gamma)
if alpha > 0:
alpha_t = alpha * label + (1 - alpha) * (1 - label)
weight *= alpha_t
loss = F.binary_cross_entropy(
score, label, weight=weight, reduction='sum')
return loss
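    # NOTE: `label` above is a soft target (the assigner's alignment metric
    # scattered into class slots), so (score - label)^gamma acts as a
    # quality-focal-style modulating factor rather than the hard-label focal
    # term; alpha weighting is disabled (alpha=-1) once the task-aligned
    # assigner takes over after `static_assigner_epoch`.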
def get_loss(self, head_outs, gt_meta):
pred_scores, pred_bboxes, anchors, \
num_anchors_list, stride_tensor = head_outs
gt_labels = gt_meta['gt_class']
gt_bboxes = gt_meta['gt_bbox']
pad_gt_mask = gt_meta['pad_gt_mask']
# label assignment
if gt_meta['epoch_id'] < self.static_assigner_epoch:
assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner(
anchors,
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index=self.num_classes)
alpha_l = 0.25
else:
assigned_labels, assigned_bboxes, assigned_scores = self.assigner(
pred_scores.detach(),
pred_bboxes.detach() * stride_tensor,
bbox_center(anchors),
num_anchors_list,
gt_labels,
gt_bboxes,
pad_gt_mask,
bg_index=self.num_classes)
alpha_l = -1
# rescale bbox
assigned_bboxes /= stride_tensor
# classification loss
loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha=alpha_l)
# select positive samples mask
mask_positive = (assigned_labels != self.num_classes)
num_pos = mask_positive.astype(paddle.float32).sum()
# bbox regression loss
if num_pos > 0:
bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4])
pred_bboxes_pos = paddle.masked_select(pred_bboxes,
bbox_mask).reshape([-1, 4])
assigned_bboxes_pos = paddle.masked_select(
assigned_bboxes, bbox_mask).reshape([-1, 4])
bbox_weight = paddle.masked_select(
assigned_scores.sum(-1), mask_positive).unsqueeze(-1)
# iou loss
loss_iou = self.giou_loss(pred_bboxes_pos,
assigned_bboxes_pos) * bbox_weight
loss_iou = loss_iou.sum() / bbox_weight.sum()
# l1 loss
loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos)
else:
loss_iou = paddle.zeros([])
loss_l1 = paddle.zeros([])
loss_cls /= assigned_scores.sum().clip(min=1)
loss = self.loss_weight['class'] * loss_cls + self.loss_weight[
'iou'] * loss_iou
return {
'loss': loss,
'loss_class': loss_cls,
'loss_iou': loss_iou,
'loss_l1': loss_l1
}
def post_process(self, head_outs, img_shape, scale_factor):
pred_scores, pred_bboxes, _, _, _ = head_outs
pred_scores = pred_scores.transpose([0, 2, 1])
for i in range(len(pred_bboxes)):
pred_bboxes[i, :, 0] = pred_bboxes[i, :, 0].clip(
min=0, max=img_shape[i, 1])
pred_bboxes[i, :, 1] = pred_bboxes[i, :, 1].clip(
min=0, max=img_shape[i, 0])
pred_bboxes[i, :, 2] = pred_bboxes[i, :, 2].clip(
min=0, max=img_shape[i, 1])
pred_bboxes[i, :, 3] = pred_bboxes[i, :, 3].clip(
min=0, max=img_shape[i, 0])
# scale bbox to origin
scale_factor = scale_factor.flip([1]).tile([1, 2]).unsqueeze(1)
pred_bboxes /= scale_factor
bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)
return bbox_pred, bbox_num
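# A minimal eval-mode smoke test (a sketch, not part of the config-driven
# pipeline; run with `python -m ppdet.modeling.heads.tood_head` so the
# relative imports resolve). Constructed directly, `nms`, `static_assigner`
# and `assigner` stay uninjected config strings, which is fine because the
# eval forward() never touches them. Shapes assume a hypothetical 256x256
# input with three FPN levels.
if __name__ == '__main__':
    head = TOODHead(
        num_classes=80,
        feat_channels=64,
        stacked_convs=2,
        fpn_strides=(8, 16, 32))
    head.eval()
    feats = [paddle.rand([2, 64, 256 // s, 256 // s]) for s in (8, 16, 32)]
    cls_scores, bbox_preds, _, _, _ = head(feats)
    print(cls_scores.shape, bbox_preds.shape)  # [2, 1344, 80] [2, 1344, 4]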
================================================
FILE: ppdet/modeling/heads/ttf_head.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Constant, Normal
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register
from ppdet.modeling.layers import DeformableConvV2, LiteConv
import numpy as np
@register
class HMHead(nn.Layer):
"""
Args:
ch_in (int): The channel number of input Tensor.
ch_out (int): The channel number of output Tensor.
num_classes (int): Number of classes.
        conv_num (int): The number of convolution layers in hm_feat.
        dcn_head (bool): whether to use DCN in the head. False by default.
        lite_head (bool): whether to use the lite version. False by default.
        norm_type (str): norm type, one of 'sync_bn', 'bn' and 'gn'.
            'bn' by default.
Return:
Heatmap head output
"""
__shared__ = ['num_classes', 'norm_type']
def __init__(
self,
ch_in,
ch_out=128,
num_classes=80,
conv_num=2,
dcn_head=False,
lite_head=False,
norm_type='bn', ):
super(HMHead, self).__init__()
head_conv = nn.Sequential()
for i in range(conv_num):
name = 'conv.{}'.format(i)
if lite_head:
lite_name = 'hm.' + name
head_conv.add_sublayer(
lite_name,
LiteConv(
in_channels=ch_in if i == 0 else ch_out,
out_channels=ch_out,
norm_type=norm_type))
else:
if dcn_head:
head_conv.add_sublayer(
name,
DeformableConvV2(
in_channels=ch_in if i == 0 else ch_out,
out_channels=ch_out,
kernel_size=3,
weight_attr=ParamAttr(initializer=Normal(0, 0.01))))
else:
head_conv.add_sublayer(
name,
nn.Conv2D(
in_channels=ch_in if i == 0 else ch_out,
out_channels=ch_out,
kernel_size=3,
padding=1,
weight_attr=ParamAttr(initializer=Normal(0, 0.01)),
bias_attr=ParamAttr(
learning_rate=2., regularizer=L2Decay(0.))))
head_conv.add_sublayer(name + '.act', nn.ReLU())
self.feat = head_conv
bias_init = float(-np.log((1 - 0.01) / 0.01))
weight_attr = None if lite_head else ParamAttr(initializer=Normal(0,
0.01))
self.head = nn.Conv2D(
in_channels=ch_out,
out_channels=num_classes,
kernel_size=1,
weight_attr=weight_attr,
bias_attr=ParamAttr(
learning_rate=2.,
regularizer=L2Decay(0.),
initializer=Constant(bias_init)))
def forward(self, feat):
out = self.feat(feat)
out = self.head(out)
return out
@register
class WHHead(nn.Layer):
"""
Args:
ch_in (int): The channel number of input Tensor.
ch_out (int): The channel number of output Tensor.
        conv_num (int): The number of convolution layers in wh_feat.
        dcn_head (bool): whether to use DCN in the head. False by default.
        lite_head (bool): whether to use the lite version. False by default.
        norm_type (str): norm type, one of 'sync_bn', 'bn' and 'gn'.
            'bn' by default.
Return:
Width & Height head output
"""
__shared__ = ['norm_type']
def __init__(self,
ch_in,
ch_out=64,
conv_num=2,
dcn_head=False,
lite_head=False,
norm_type='bn'):
super(WHHead, self).__init__()
head_conv = nn.Sequential()
for i in range(conv_num):
name = 'conv.{}'.format(i)
if lite_head:
lite_name = 'wh.' + name
head_conv.add_sublayer(
lite_name,
LiteConv(
in_channels=ch_in if i == 0 else ch_out,
out_channels=ch_out,
norm_type=norm_type))
else:
if dcn_head:
head_conv.add_sublayer(
name,
DeformableConvV2(
in_channels=ch_in if i == 0 else ch_out,
out_channels=ch_out,
kernel_size=3,
weight_attr=ParamAttr(initializer=Normal(0, 0.01))))
else:
head_conv.add_sublayer(
name,
nn.Conv2D(
in_channels=ch_in if i == 0 else ch_out,
out_channels=ch_out,
kernel_size=3,
padding=1,
weight_attr=ParamAttr(initializer=Normal(0, 0.01)),
bias_attr=ParamAttr(
learning_rate=2., regularizer=L2Decay(0.))))
head_conv.add_sublayer(name + '.act', nn.ReLU())
weight_attr = None if lite_head else ParamAttr(initializer=Normal(0,
0.01))
self.feat = head_conv
self.head = nn.Conv2D(
in_channels=ch_out,
out_channels=4,
kernel_size=1,
weight_attr=weight_attr,
bias_attr=ParamAttr(
learning_rate=2., regularizer=L2Decay(0.)))
def forward(self, feat):
out = self.feat(feat)
out = self.head(out)
out = F.relu(out)
return out
@register
class TTFHead(nn.Layer):
"""
TTFHead
Args:
in_channels (int): the channel number of input to TTFHead.
num_classes (int): the number of classes, 80 by default.
hm_head_planes (int): the channel number in heatmap head,
128 by default.
wh_head_planes (int): the channel number in width & height head,
64 by default.
hm_head_conv_num (int): the number of convolution in heatmap head,
2 by default.
wh_head_conv_num (int): the number of convolution in width & height
head, 2 by default.
hm_loss (object): Instance of 'CTFocalLoss'.
wh_loss (object): Instance of 'GIoULoss'.
wh_offset_base (float): the base offset of width and height,
16.0 by default.
down_ratio (int): the actual down_ratio is calculated by base_down_ratio
(default 16) and the number of upsample layers.
        lite_head (bool): whether to use the lite version. False by default.
        norm_type (str): norm type, one of 'sync_bn', 'bn' and 'gn'.
            'bn' by default.
        ags_module (bool): whether to use the AGS module to reweight location
            features. False by default.
"""
__shared__ = ['num_classes', 'down_ratio', 'norm_type']
__inject__ = ['hm_loss', 'wh_loss']
def __init__(self,
in_channels,
num_classes=80,
hm_head_planes=128,
wh_head_planes=64,
hm_head_conv_num=2,
wh_head_conv_num=2,
hm_loss='CTFocalLoss',
wh_loss='GIoULoss',
wh_offset_base=16.,
down_ratio=4,
dcn_head=False,
lite_head=False,
norm_type='bn',
ags_module=False):
super(TTFHead, self).__init__()
self.in_channels = in_channels
self.hm_head = HMHead(in_channels, hm_head_planes, num_classes,
hm_head_conv_num, dcn_head, lite_head, norm_type)
self.wh_head = WHHead(in_channels, wh_head_planes, wh_head_conv_num,
dcn_head, lite_head, norm_type)
self.hm_loss = hm_loss
self.wh_loss = wh_loss
self.wh_offset_base = wh_offset_base
self.down_ratio = down_ratio
self.ags_module = ags_module
@classmethod
def from_config(cls, cfg, input_shape):
if isinstance(input_shape, (list, tuple)):
input_shape = input_shape[0]
return {'in_channels': input_shape.channels, }
def forward(self, feats):
hm = self.hm_head(feats)
wh = self.wh_head(feats) * self.wh_offset_base
return hm, wh
def filter_box_by_weight(self, pred, target, weight):
"""
Filter out boxes where ttf_reg_weight is 0, only keep positive samples.
"""
index = paddle.nonzero(weight > 0)
index.stop_gradient = True
weight = paddle.gather_nd(weight, index)
pred = paddle.gather_nd(pred, index)
target = paddle.gather_nd(target, index)
return pred, target, weight
def filter_loc_by_weight(self, score, weight):
index = paddle.nonzero(weight > 0)
index.stop_gradient = True
score = paddle.gather_nd(score, index)
return score
def get_loss(self, pred_hm, pred_wh, target_hm, box_target, target_weight):
pred_hm = paddle.clip(F.sigmoid(pred_hm), 1e-4, 1 - 1e-4)
hm_loss = self.hm_loss(pred_hm, target_hm)
H, W = target_hm.shape[2:]
mask = paddle.reshape(target_weight, [-1, H, W])
avg_factor = paddle.sum(mask) + 1e-4
base_step = self.down_ratio
shifts_x = paddle.arange(0, W * base_step, base_step, dtype='int32')
shifts_y = paddle.arange(0, H * base_step, base_step, dtype='int32')
shift_y, shift_x = paddle.tensor.meshgrid([shifts_y, shifts_x])
base_loc = paddle.stack([shift_x, shift_y], axis=0)
base_loc.stop_gradient = True
        pred_boxes = paddle.concat(
            [
                0 - pred_wh[:, 0:2, :, :] + base_loc.astype(pred_wh.dtype),
                pred_wh[:, 2:4, :, :] + base_loc.astype(pred_wh.dtype)
            ],
            axis=1)
pred_boxes = paddle.transpose(pred_boxes, [0, 2, 3, 1])
boxes = paddle.transpose(box_target, [0, 2, 3, 1])
boxes.stop_gradient = True
if self.ags_module:
pred_hm_max = paddle.max(pred_hm, axis=1, keepdim=True)
pred_hm_max_softmax = F.softmax(pred_hm_max, axis=1)
pred_hm_max_softmax = paddle.transpose(pred_hm_max_softmax,
[0, 2, 3, 1])
pred_hm_max_softmax = self.filter_loc_by_weight(pred_hm_max_softmax,
mask)
else:
pred_hm_max_softmax = None
pred_boxes, boxes, mask = self.filter_box_by_weight(pred_boxes, boxes,
mask)
mask.stop_gradient = True
wh_loss = self.wh_loss(
pred_boxes,
boxes,
iou_weight=mask.unsqueeze(1),
loc_reweight=pred_hm_max_softmax)
wh_loss = wh_loss / avg_factor
ttf_loss = {'hm_loss': hm_loss, 'wh_loss': wh_loss}
return ttf_loss
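# A minimal forward smoke test (a sketch; constructed directly, `hm_loss` and
# `wh_loss` stay uninjected config strings -- forward() does not use them).
# The 76x76 input size is an arbitrary example.
if __name__ == '__main__':
    head = TTFHead(in_channels=64)
    feat = paddle.rand([2, 64, 76, 76])
    hm, wh = head(feat)
    print(hm.shape, wh.shape)  # [2, 80, 76, 76] [2, 4, 76, 76]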
================================================
FILE: ppdet/modeling/heads/vitpose_head.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ppdet.modeling.keypoint_utils import resize, flip_back
from paddle.nn.initializer import TruncatedNormal, Constant, Normal
from ppdet.modeling.layers import ConvTranspose2d, BatchNorm2d
trunc_normal_ = TruncatedNormal(std=.02)
normal_ = Normal(std=0.001)
zeros_ = Constant(value=0.)
ones_ = Constant(value=1.)
__all__ = ['TopdownHeatmapSimpleHead']
@register
class TopdownHeatmapSimpleHead(nn.Layer):
def __init__(self,
in_channels=768,
out_channels=17,
num_deconv_layers=3,
num_deconv_filters=(256, 256, 256),
num_deconv_kernels=(4, 4, 4),
extra=None,
in_index=0,
input_transform=None,
align_corners=False,
upsample=0,
flip_pairs=None,
shift_heatmap=False,
target_type='GaussianHeatmap'):
super(TopdownHeatmapSimpleHead, self).__init__()
self.in_channels = in_channels
self.upsample = upsample
self.flip_pairs = flip_pairs
self.shift_heatmap = shift_heatmap
self.target_type = target_type
self._init_inputs(in_channels, in_index, input_transform)
self.in_index = in_index
self.align_corners = align_corners
if extra is not None and not isinstance(extra, dict):
raise TypeError('extra should be dict or None.')
if num_deconv_layers > 0:
self.deconv_layers = self._make_deconv_layer(
num_deconv_layers,
num_deconv_filters,
num_deconv_kernels, )
elif num_deconv_layers == 0:
self.deconv_layers = nn.Identity()
else:
            raise ValueError(
                f'num_deconv_layers ({num_deconv_layers}) should be >= 0.')
identity_final_layer = False
if extra is not None and 'final_conv_kernel' in extra:
assert extra['final_conv_kernel'] in [0, 1, 3]
if extra['final_conv_kernel'] == 3:
padding = 1
elif extra['final_conv_kernel'] == 1:
padding = 0
else:
# 0 for Identity mapping.
identity_final_layer = True
kernel_size = extra['final_conv_kernel']
else:
kernel_size = 1
padding = 0
if identity_final_layer:
self.final_layer = nn.Identity()
else:
conv_channels = num_deconv_filters[
-1] if num_deconv_layers > 0 else self.in_channels
layers = []
if extra is not None:
num_conv_layers = extra.get('num_conv_layers', 0)
num_conv_kernels = extra.get('num_conv_kernels',
[1] * num_conv_layers)
for i in range(num_conv_layers):
layers.append(
nn.Conv2D(
in_channels=conv_channels,
out_channels=conv_channels,
kernel_size=num_conv_kernels[i],
stride=1,
padding=(num_conv_kernels[i] - 1) // 2))
layers.append(nn.BatchNorm2D(conv_channels))
layers.append(nn.ReLU())
layers.append(
nn.Conv2D(
in_channels=conv_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=1,
padding=(padding, padding)))
if len(layers) > 1:
self.final_layer = nn.Sequential(*layers)
else:
self.final_layer = layers[0]
self.init_weights()
@staticmethod
def _get_deconv_cfg(deconv_kernel):
"""Get configurations for deconv layers."""
if deconv_kernel == 4:
padding = 1
output_padding = 0
elif deconv_kernel == 3:
padding = 1
output_padding = 1
elif deconv_kernel == 2:
padding = 0
output_padding = 0
else:
            raise ValueError(
                f'Unsupported deconv kernel size ({deconv_kernel}).')
return deconv_kernel, padding, output_padding
def _init_inputs(self, in_channels, in_index, input_transform):
"""Check and initialize input transforms.
"""
if input_transform is not None:
assert input_transform in ['resize_concat', 'multiple_select']
self.input_transform = input_transform
self.in_index = in_index
if input_transform is not None:
assert isinstance(in_channels, (list, tuple))
assert isinstance(in_index, (list, tuple))
assert len(in_channels) == len(in_index)
if input_transform == 'resize_concat':
self.in_channels = sum(in_channels)
else:
self.in_channels = in_channels
else:
assert isinstance(in_channels, int)
assert isinstance(in_index, int)
self.in_channels = in_channels
def _transform_inputs(self, inputs):
"""Transform inputs for decoder.
"""
        if not isinstance(inputs, list):
            if self.upsample > 0:
                inputs = resize(
                    input=F.relu(inputs),
                    scale_factor=self.upsample,
                    mode='bilinear',
                    align_corners=self.align_corners)
            return inputs
if self.input_transform == 'resize_concat':
inputs = [inputs[i] for i in self.in_index]
upsampled_inputs = [
resize(
input=x,
size=inputs[0].shape[2:],
mode='bilinear',
align_corners=self.align_corners) for x in inputs
]
            inputs = paddle.concat(upsampled_inputs, axis=1)
elif self.input_transform == 'multiple_select':
inputs = [inputs[i] for i in self.in_index]
else:
inputs = inputs[self.in_index]
return inputs
def forward(self, x):
"""Forward function."""
x = self._transform_inputs(x)
x = self.deconv_layers(x)
x = self.final_layer(x)
return x
def inference_model(self, x, flip_pairs=None):
"""Inference function.
Returns:
output_heatmap (np.ndarray): Output heatmaps.
Args:
x (torch.Tensor[N,K,H,W]): Input features.
flip_pairs (None | list[tuple]):
Pairs of keypoints which are mirrored.
"""
output = self.forward(x)
if flip_pairs is not None:
output_heatmap = flip_back(
output, self.flip_pairs, target_type=self.target_type)
# feature is not aligned, shift flipped heatmap for higher accuracy
if self.shift_heatmap:
output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1]
else:
output_heatmap = output
return output_heatmap
def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
"""Make deconv layers."""
if num_layers != len(num_filters):
error_msg = f'num_layers({num_layers}) ' \
f'!= length of num_filters({len(num_filters)})'
raise ValueError(error_msg)
if num_layers != len(num_kernels):
error_msg = f'num_layers({num_layers}) ' \
f'!= length of num_kernels({len(num_kernels)})'
raise ValueError(error_msg)
layers = []
for i in range(num_layers):
kernel, padding, output_padding = \
self._get_deconv_cfg(num_kernels[i])
planes = num_filters[i]
layers.append(
ConvTranspose2d(
in_channels=self.in_channels,
out_channels=planes,
kernel_size=kernel,
stride=2,
padding=padding,
output_padding=output_padding,
bias=False))
layers.append(nn.BatchNorm2D(planes))
layers.append(nn.ReLU())
self.in_channels = planes
return nn.Sequential(*layers)
def init_weights(self):
"""Initialize model weights."""
if not isinstance(self.deconv_layers, nn.Identity):
for m in self.deconv_layers:
                if isinstance(m, nn.BatchNorm2D):
                    ones_(m.weight)
                    # standard BN init: scale to one, bias to zero
                    zeros_(m.bias)
if not isinstance(self.final_layer, nn.Conv2D):
for m in self.final_layer:
if isinstance(m, nn.Conv2D):
normal_(m.weight)
zeros_(m.bias)
                elif isinstance(m, nn.BatchNorm2D):
                    ones_(m.weight)
                    zeros_(m.bias)
else:
normal_(self.final_layer.weight)
zeros_(self.final_layer.bias)
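# A minimal smoke test (a sketch): with the defaults, a 768-channel 16x12
# ViT patch feature map is upsampled by three stride-2 deconvs into 17
# keypoint heatmaps of size 128x96.
if __name__ == '__main__':
    head = TopdownHeatmapSimpleHead(in_channels=768, out_channels=17)
    x = paddle.rand([1, 768, 16, 12])
    print(head(x).shape)  # [1, 17, 128, 96]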
================================================
FILE: ppdet/modeling/heads/yolo_head.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register
import math
import numpy as np
from ..initializer import bias_init_with_prob, constant_
from ..backbones.csp_darknet import BaseConv, DWConv
from ..losses import IouLoss
from ppdet.modeling.assigners.simota_assigner import SimOTAAssigner
from ppdet.modeling.bbox_utils import bbox_overlaps
from ppdet.modeling.layers import MultiClassNMS
__all__ = ['YOLOv3Head', 'YOLOXHead']
def _de_sigmoid(x, eps=1e-7):
x = paddle.clip(x, eps, 1. / eps)
x = paddle.clip(1. / x - 1., eps, 1. / eps)
x = -paddle.log(x)
return x
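# _de_sigmoid is a clipped logit (inverse sigmoid): for y = sigmoid(x),
# -log(1 / y - 1) recovers x; the clips keep 1 / y - 1 in a safe numeric range.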
@register
class YOLOv3Head(nn.Layer):
__shared__ = ['num_classes', 'data_format']
__inject__ = ['loss']
def __init__(self,
in_channels=[1024, 512, 256],
anchors=[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],
[59, 119], [116, 90], [156, 198], [373, 326]],
anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 2]],
num_classes=80,
loss='YOLOv3Loss',
iou_aware=False,
iou_aware_factor=0.4,
data_format='NCHW'):
"""
Head for YOLOv3 network
Args:
num_classes (int): number of foreground classes
anchors (list): anchors
anchor_masks (list): anchor masks
loss (object): YOLOv3Loss instance
iou_aware (bool): whether to use iou_aware
iou_aware_factor (float): iou aware factor
data_format (str): data format, NCHW or NHWC
"""
super(YOLOv3Head, self).__init__()
        assert len(in_channels) > 0, "in_channels length should be > 0"
self.in_channels = in_channels
self.num_classes = num_classes
self.loss = loss
self.iou_aware = iou_aware
self.iou_aware_factor = iou_aware_factor
self.parse_anchor(anchors, anchor_masks)
self.num_outputs = len(self.anchors)
self.data_format = data_format
self.yolo_outputs = []
for i in range(len(self.anchors)):
if self.iou_aware:
num_filters = len(self.anchors[i]) * (self.num_classes + 6)
else:
num_filters = len(self.anchors[i]) * (self.num_classes + 5)
name = 'yolo_output.{}'.format(i)
conv = nn.Conv2D(
in_channels=self.in_channels[i],
out_channels=num_filters,
kernel_size=1,
stride=1,
padding=0,
data_format=data_format,
bias_attr=ParamAttr(regularizer=L2Decay(0.)))
conv.skip_quant = True
yolo_output = self.add_sublayer(name, conv)
self.yolo_outputs.append(yolo_output)
def parse_anchor(self, anchors, anchor_masks):
self.anchors = [[anchors[i] for i in mask] for mask in anchor_masks]
self.mask_anchors = []
anchor_num = len(anchors)
for masks in anchor_masks:
self.mask_anchors.append([])
for mask in masks:
assert mask < anchor_num, "anchor mask index overflow"
self.mask_anchors[-1].extend(anchors[mask])
def forward(self, feats, targets=None):
assert len(feats) == len(self.anchors)
yolo_outputs = []
for i, feat in enumerate(feats):
yolo_output = self.yolo_outputs[i](feat)
if self.data_format == 'NHWC':
yolo_output = paddle.transpose(yolo_output, [0, 3, 1, 2])
yolo_outputs.append(yolo_output)
if self.training:
return self.loss(yolo_outputs, targets, self.anchors)
else:
if self.iou_aware:
y = []
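                # IoU-aware decode: fuse objectness and predicted IoU as
                # obj_t = obj^(1 - w) * iou^w (w = iou_aware_factor), then map
                # the fused score back to logit space with _de_sigmoid so the
                # downstream YOLO box decoding stays unchanged.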
for i, out in enumerate(yolo_outputs):
na = len(self.anchors[i])
ioup, x = out[:, 0:na, :, :], out[:, na:, :, :]
b, c, h, w = x.shape
no = c // na
x = x.reshape((b, na, no, h * w))
ioup = ioup.reshape((b, na, 1, h * w))
obj = x[:, :, 4:5, :]
ioup = F.sigmoid(ioup)
obj = F.sigmoid(obj)
obj_t = (obj**(1 - self.iou_aware_factor)) * (
ioup**self.iou_aware_factor)
obj_t = _de_sigmoid(obj_t)
loc_t = x[:, :, :4, :]
cls_t = x[:, :, 5:, :]
y_t = paddle.concat([loc_t, obj_t, cls_t], axis=2)
y_t = y_t.reshape((b, c, h, w))
y.append(y_t)
return y
else:
return yolo_outputs
@classmethod
def from_config(cls, cfg, input_shape):
return {'in_channels': [i.channels for i in input_shape], }
@register
class YOLOXHead(nn.Layer):
__shared__ = ['num_classes', 'width_mult', 'act', 'trt', 'exclude_nms']
__inject__ = ['assigner', 'nms']
def __init__(self,
num_classes=80,
width_mult=1.0,
depthwise=False,
in_channels=[256, 512, 1024],
feat_channels=256,
fpn_strides=(8, 16, 32),
l1_epoch=285,
act='silu',
assigner=SimOTAAssigner(use_vfl=False),
nms='MultiClassNMS',
loss_weight={
'cls': 1.0,
'obj': 1.0,
'iou': 5.0,
'l1': 1.0,
},
trt=False,
exclude_nms=False):
super(YOLOXHead, self).__init__()
self._dtype = paddle.framework.get_default_dtype()
self.num_classes = num_classes
        assert len(in_channels) > 0, "in_channels length should be > 0"
self.in_channels = in_channels
feat_channels = int(feat_channels * width_mult)
self.fpn_strides = fpn_strides
self.l1_epoch = l1_epoch
self.assigner = assigner
self.nms = nms
if isinstance(self.nms, MultiClassNMS) and trt:
self.nms.trt = trt
self.exclude_nms = exclude_nms
self.loss_weight = loss_weight
self.iou_loss = IouLoss(loss_weight=1.0) # default loss_weight 2.5
ConvBlock = DWConv if depthwise else BaseConv
self.stem_conv = nn.LayerList()
self.conv_cls = nn.LayerList()
self.conv_reg = nn.LayerList() # reg [x,y,w,h] + obj
for in_c in self.in_channels:
self.stem_conv.append(BaseConv(in_c, feat_channels, 1, 1, act=act))
self.conv_cls.append(
nn.Sequential(* [
ConvBlock(
feat_channels, feat_channels, 3, 1, act=act), ConvBlock(
feat_channels, feat_channels, 3, 1, act=act),
nn.Conv2D(
feat_channels,
self.num_classes,
1,
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
]))
self.conv_reg.append(
nn.Sequential(* [
ConvBlock(
feat_channels, feat_channels, 3, 1, act=act),
ConvBlock(
feat_channels, feat_channels, 3, 1, act=act),
nn.Conv2D(
feat_channels,
4 + 1, # reg [x,y,w,h] + obj
1,
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
]))
self._init_weights()
@classmethod
def from_config(cls, cfg, input_shape):
return {'in_channels': [i.channels for i in input_shape], }
def _init_weights(self):
bias_cls = bias_init_with_prob(0.01)
bias_reg = paddle.full([5], math.log(5.), dtype=self._dtype)
bias_reg[:2] = 0.
bias_reg[-1] = bias_cls
for cls_, reg_ in zip(self.conv_cls, self.conv_reg):
constant_(cls_[-1].weight)
constant_(cls_[-1].bias, bias_cls)
constant_(reg_[-1].weight)
reg_[-1].bias.set_value(bias_reg)
def _generate_anchor_point(self, feat_sizes, strides, offset=0.):
anchor_points, stride_tensor = [], []
num_anchors_list = []
for feat_size, stride in zip(feat_sizes, strides):
h, w = feat_size
x = (paddle.arange(w) + offset) * stride
y = (paddle.arange(h) + offset) * stride
y, x = paddle.meshgrid(y, x)
anchor_points.append(paddle.stack([x, y], axis=-1).reshape([-1, 2]))
stride_tensor.append(
paddle.full(
[len(anchor_points[-1]), 1], stride, dtype=self._dtype))
num_anchors_list.append(len(anchor_points[-1]))
anchor_points = paddle.concat(anchor_points).astype(self._dtype)
anchor_points.stop_gradient = True
stride_tensor = paddle.concat(stride_tensor)
stride_tensor.stop_gradient = True
return anchor_points, stride_tensor, num_anchors_list
def forward(self, feats, targets=None):
        assert len(feats) == len(self.fpn_strides), \
            "The size of feats is not equal to the size of fpn_strides"
feat_sizes = [[f.shape[-2], f.shape[-1]] for f in feats]
cls_score_list, reg_pred_list = [], []
obj_score_list = []
for i, feat in enumerate(feats):
feat = self.stem_conv[i](feat)
cls_logit = self.conv_cls[i](feat)
reg_pred = self.conv_reg[i](feat)
# cls prediction
cls_score = F.sigmoid(cls_logit)
cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1]))
# reg prediction
reg_xywh, obj_logit = paddle.split(reg_pred, [4, 1], axis=1)
reg_xywh = reg_xywh.flatten(2).transpose([0, 2, 1])
reg_pred_list.append(reg_xywh)
# obj prediction
obj_score = F.sigmoid(obj_logit)
obj_score_list.append(obj_score.flatten(2).transpose([0, 2, 1]))
cls_score_list = paddle.concat(cls_score_list, axis=1)
reg_pred_list = paddle.concat(reg_pred_list, axis=1)
obj_score_list = paddle.concat(obj_score_list, axis=1)
# bbox decode
anchor_points, stride_tensor, _ =\
self._generate_anchor_point(feat_sizes, self.fpn_strides)
reg_xy, reg_wh = paddle.split(reg_pred_list, 2, axis=-1)
reg_xy += (anchor_points / stride_tensor)
reg_wh = paddle.exp(reg_wh) * 0.5
bbox_pred_list = paddle.concat(
[reg_xy - reg_wh, reg_xy + reg_wh], axis=-1)
if self.training:
anchor_points, stride_tensor, num_anchors_list =\
self._generate_anchor_point(feat_sizes, self.fpn_strides, 0.5)
yolox_losses = self.get_loss([
cls_score_list, bbox_pred_list, obj_score_list, anchor_points,
stride_tensor, num_anchors_list
], targets)
return yolox_losses
else:
pred_scores = (cls_score_list * obj_score_list).sqrt()
return pred_scores, bbox_pred_list, stride_tensor
def get_loss(self, head_outs, targets):
pred_cls, pred_bboxes, pred_obj,\
anchor_points, stride_tensor, num_anchors_list = head_outs
gt_labels = targets['gt_class']
gt_bboxes = targets['gt_bbox']
pred_scores = (pred_cls * pred_obj).sqrt()
# label assignment
center_and_strides = paddle.concat(
[anchor_points, stride_tensor, stride_tensor], axis=-1)
pos_num_list, label_list, bbox_target_list = [], [], []
for pred_score, pred_bbox, gt_box, gt_label in zip(
pred_scores.detach(),
pred_bboxes.detach() * stride_tensor, gt_bboxes, gt_labels):
pos_num, label, _, bbox_target = self.assigner(
pred_score, center_and_strides, pred_bbox, gt_box, gt_label)
pos_num_list.append(pos_num)
label_list.append(label)
bbox_target_list.append(bbox_target)
labels = paddle.to_tensor(np.stack(label_list, axis=0))
bbox_targets = paddle.to_tensor(np.stack(bbox_target_list, axis=0))
bbox_targets /= stride_tensor # rescale bbox
# 1. obj score loss
mask_positive = (labels != self.num_classes)
loss_obj = F.binary_cross_entropy(
pred_obj,
mask_positive.astype(pred_obj.dtype).unsqueeze(-1),
reduction='sum')
num_pos = sum(pos_num_list)
if num_pos > 0:
num_pos = paddle.to_tensor(num_pos, dtype=self._dtype).clip(min=1)
loss_obj /= num_pos
# 2. iou loss
bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4])
pred_bboxes_pos = paddle.masked_select(pred_bboxes,
bbox_mask).reshape([-1, 4])
assigned_bboxes_pos = paddle.masked_select(
bbox_targets, bbox_mask).reshape([-1, 4])
bbox_iou = bbox_overlaps(pred_bboxes_pos, assigned_bboxes_pos)
bbox_iou = paddle.diag(bbox_iou)
loss_iou = self.iou_loss(
pred_bboxes_pos.split(
4, axis=-1),
assigned_bboxes_pos.split(
4, axis=-1))
loss_iou = loss_iou.sum() / num_pos
# 3. cls loss
cls_mask = mask_positive.unsqueeze(-1).tile(
[1, 1, self.num_classes])
pred_cls_pos = paddle.masked_select(
pred_cls, cls_mask).reshape([-1, self.num_classes])
assigned_cls_pos = paddle.masked_select(labels, mask_positive)
assigned_cls_pos = F.one_hot(assigned_cls_pos,
self.num_classes + 1)[..., :-1]
assigned_cls_pos *= bbox_iou.unsqueeze(-1)
loss_cls = F.binary_cross_entropy(
pred_cls_pos, assigned_cls_pos, reduction='sum')
loss_cls /= num_pos
# 4. l1 loss
if targets['epoch_id'] >= self.l1_epoch:
loss_l1 = F.l1_loss(
pred_bboxes_pos, assigned_bboxes_pos, reduction='sum')
loss_l1 /= num_pos
else:
loss_l1 = paddle.zeros([])
loss_l1.stop_gradient = False
else:
loss_cls = paddle.zeros([])
loss_iou = paddle.zeros([])
loss_l1 = paddle.zeros([])
loss_cls.stop_gradient = False
loss_iou.stop_gradient = False
loss_l1.stop_gradient = False
loss = self.loss_weight['obj'] * loss_obj + \
self.loss_weight['cls'] * loss_cls + \
self.loss_weight['iou'] * loss_iou
if targets['epoch_id'] >= self.l1_epoch:
loss += (self.loss_weight['l1'] * loss_l1)
yolox_losses = {
'loss': loss,
'loss_cls': loss_cls,
'loss_obj': loss_obj,
'loss_iou': loss_iou,
'loss_l1': loss_l1,
}
return yolox_losses
def post_process(self, head_outs, img_shape, scale_factor):
pred_scores, pred_bboxes, stride_tensor = head_outs
pred_scores = pred_scores.transpose([0, 2, 1])
pred_bboxes *= stride_tensor
# scale bbox to origin image
scale_factor = scale_factor.flip(-1).tile([1, 2]).unsqueeze(1)
pred_bboxes /= scale_factor
if self.exclude_nms:
            # `exclude_nms=True` is only used for benchmarking
return pred_bboxes.sum(), pred_scores.sum()
else:
bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)
return bbox_pred, bbox_num
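# A minimal eval-mode smoke test for YOLOv3Head (a sketch; run with
# `python -m ppdet.modeling.heads.yolo_head` so the relative imports resolve).
# Constructed directly, `loss` stays the uninjected config string and is never
# called in eval mode. The 13/26/52 feature sizes assume a hypothetical
# 416x416 input; 255 = 3 anchors * (80 classes + 5).
if __name__ == '__main__':
    head = YOLOv3Head(in_channels=[1024, 512, 256])
    head.eval()
    feats = [
        paddle.rand([2, c, s, s])
        for c, s in zip([1024, 512, 256], [13, 26, 52])
    ]
    for out in head(feats):
        print(out.shape)  # [2, 255, 13, 13] / [2, 255, 26, 26] / [2, 255, 52, 52]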
================================================
FILE: ppdet/modeling/heads/yolof_head.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Normal, Constant
from ppdet.modeling.layers import MultiClassNMS
from ppdet.core.workspace import register
from ppdet.modeling.bbox_utils import delta2bbox_v2
__all__ = ['YOLOFHead']
INF = 1e8
def reduce_mean(tensor):
world_size = paddle.distributed.get_world_size()
if world_size == 1:
return tensor
paddle.distributed.all_reduce(tensor)
return tensor / world_size
def find_inside_anchor(feat_size, stride, num_anchors, im_shape):
feat_h, feat_w = feat_size[:2]
im_h, im_w = im_shape[:2]
inside_h = min(int(np.ceil(im_h / stride)), feat_h)
inside_w = min(int(np.ceil(im_w / stride)), feat_w)
inside_mask = paddle.zeros([feat_h, feat_w], dtype=paddle.bool)
inside_mask[:inside_h, :inside_w] = True
inside_mask = inside_mask.unsqueeze(-1).expand(
[feat_h, feat_w, num_anchors])
return inside_mask.reshape([-1])
@register
class YOLOFFeat(nn.Layer):
def __init__(self,
feat_in=256,
feat_out=256,
num_cls_convs=2,
num_reg_convs=4,
norm_type='bn'):
super(YOLOFFeat, self).__init__()
        assert norm_type == 'bn', "YOLOFFeat only supports BN for now."
self.feat_in = feat_in
self.feat_out = feat_out
self.num_cls_convs = num_cls_convs
self.num_reg_convs = num_reg_convs
self.norm_type = norm_type
cls_subnet, reg_subnet = [], []
for i in range(self.num_cls_convs):
feat_in = self.feat_in if i == 0 else self.feat_out
cls_subnet.append(
nn.Conv2D(
feat_in,
self.feat_out,
3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0.0, std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=0.0))))
cls_subnet.append(
nn.BatchNorm2D(
self.feat_out,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0))))
cls_subnet.append(nn.ReLU())
for i in range(self.num_reg_convs):
feat_in = self.feat_in if i == 0 else self.feat_out
reg_subnet.append(
nn.Conv2D(
feat_in,
self.feat_out,
3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0.0, std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=0.0))))
reg_subnet.append(
nn.BatchNorm2D(
self.feat_out,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0))))
reg_subnet.append(nn.ReLU())
self.cls_subnet = nn.Sequential(*cls_subnet)
self.reg_subnet = nn.Sequential(*reg_subnet)
def forward(self, fpn_feat):
cls_feat = self.cls_subnet(fpn_feat)
reg_feat = self.reg_subnet(fpn_feat)
return cls_feat, reg_feat
@register
class YOLOFHead(nn.Layer):
__shared__ = ['num_classes', 'trt', 'exclude_nms']
__inject__ = [
'conv_feat', 'anchor_generator', 'bbox_assigner', 'loss_class',
'loss_bbox', 'nms'
]
def __init__(self,
num_classes=80,
conv_feat='YOLOFFeat',
anchor_generator='AnchorGenerator',
bbox_assigner='UniformAssigner',
loss_class='FocalLoss',
loss_bbox='GIoULoss',
ctr_clip=32.0,
delta_mean=[0.0, 0.0, 0.0, 0.0],
delta_std=[1.0, 1.0, 1.0, 1.0],
nms='MultiClassNMS',
prior_prob=0.01,
nms_pre=1000,
use_inside_anchor=False,
trt=False,
exclude_nms=False):
super(YOLOFHead, self).__init__()
self.num_classes = num_classes
self.conv_feat = conv_feat
self.anchor_generator = anchor_generator
self.na = self.anchor_generator.num_anchors
self.bbox_assigner = bbox_assigner
self.loss_class = loss_class
self.loss_bbox = loss_bbox
self.ctr_clip = ctr_clip
self.delta_mean = delta_mean
self.delta_std = delta_std
self.nms = nms
self.nms_pre = nms_pre
self.use_inside_anchor = use_inside_anchor
if isinstance(self.nms, MultiClassNMS) and trt:
self.nms.trt = trt
self.exclude_nms = exclude_nms
bias_init_value = -math.log((1 - prior_prob) / prior_prob)
self.cls_score = self.add_sublayer(
'cls_score',
nn.Conv2D(
in_channels=conv_feat.feat_out,
out_channels=self.num_classes * self.na,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0.0, std=0.01)),
bias_attr=ParamAttr(initializer=Constant(
value=bias_init_value))))
self.bbox_pred = self.add_sublayer(
'bbox_pred',
nn.Conv2D(
in_channels=conv_feat.feat_out,
out_channels=4 * self.na,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0.0, std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=0))))
self.object_pred = self.add_sublayer(
'object_pred',
nn.Conv2D(
in_channels=conv_feat.feat_out,
out_channels=self.na,
kernel_size=3,
stride=1,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0.0, std=0.01)),
bias_attr=ParamAttr(initializer=Constant(value=0))))
def forward(self, feats, targets=None):
        assert len(feats) == 1, "YOLOF uses only one feature level."
conv_cls_feat, conv_reg_feat = self.conv_feat(feats[0])
cls_logits = self.cls_score(conv_cls_feat)
objectness = self.object_pred(conv_reg_feat)
bboxes_reg = self.bbox_pred(conv_reg_feat)
N, C, H, W = cls_logits.shape[:]
cls_logits = cls_logits.reshape((N, self.na, self.num_classes, H, W))
objectness = objectness.reshape((N, self.na, 1, H, W))
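        # Implicit objectness fusion: the expression below equals
        # log(e^cls * e^obj / (1 + e^cls + e^obj)), i.e. classification and
        # objectness logits are merged into one normalized logit before
        # sigmoid scoring; the clips guard against exp overflow.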
norm_cls_logits = cls_logits + objectness - paddle.log(
1.0 + paddle.clip(
cls_logits.exp(), max=INF) + paddle.clip(
objectness.exp(), max=INF))
norm_cls_logits = norm_cls_logits.reshape((N, C, H, W))
anchors = self.anchor_generator([norm_cls_logits])
if self.training:
yolof_losses = self.get_loss(
[anchors[0], norm_cls_logits, bboxes_reg], targets)
return yolof_losses
else:
return anchors[0], norm_cls_logits, bboxes_reg
def get_loss(self, head_outs, targets):
anchors, cls_logits, bbox_preds = head_outs
feat_size = cls_logits.shape[-2:]
cls_logits = cls_logits.transpose([0, 2, 3, 1])
cls_logits = cls_logits.reshape([0, -1, self.num_classes])
bbox_preds = bbox_preds.transpose([0, 2, 3, 1])
bbox_preds = bbox_preds.reshape([0, -1, 4])
num_pos_list = []
cls_pred_list, cls_tar_list = [], []
reg_pred_list, reg_tar_list = [], []
# find and gather preds and targets in each image
for cls_logit, bbox_pred, gt_bbox, gt_class, im_shape in zip(
cls_logits, bbox_preds, targets['gt_bbox'], targets['gt_class'],
targets['im_shape']):
if self.use_inside_anchor:
inside_mask = find_inside_anchor(
feat_size, self.anchor_generator.strides[0], self.na,
im_shape.tolist())
cls_logit = cls_logit[inside_mask]
bbox_pred = bbox_pred[inside_mask]
anchors = anchors[inside_mask]
bbox_pred = delta2bbox_v2(
bbox_pred,
anchors,
self.delta_mean,
self.delta_std,
ctr_clip=self.ctr_clip)
bbox_pred = bbox_pred.reshape([-1, bbox_pred.shape[-1]])
# -2:ignore, -1:neg, >=0:pos
match_labels, pos_bbox_pred, pos_bbox_tar = self.bbox_assigner(
bbox_pred, anchors, gt_bbox)
pos_mask = (match_labels >= 0)
neg_mask = (match_labels == -1)
chosen_mask = paddle.logical_or(pos_mask, neg_mask)
gt_class = gt_class.reshape([-1])
bg_class = paddle.to_tensor(
[self.num_classes], dtype=gt_class.dtype)
# a trick to assign num_classes to negative targets
gt_class = paddle.concat([gt_class, bg_class], axis=-1)
match_labels = paddle.where(
neg_mask,
paddle.full_like(match_labels, gt_class.size - 1), match_labels)
num_pos_list.append(max(1.0, pos_mask.sum().item()))
cls_pred_list.append(cls_logit[chosen_mask])
cls_tar_list.append(gt_class[match_labels[chosen_mask]])
reg_pred_list.append(pos_bbox_pred)
reg_tar_list.append(pos_bbox_tar)
num_tot_pos = paddle.to_tensor(sum(num_pos_list))
num_tot_pos = reduce_mean(num_tot_pos).item()
num_tot_pos = max(1.0, num_tot_pos)
cls_pred = paddle.concat(cls_pred_list)
cls_tar = paddle.concat(cls_tar_list)
cls_loss = self.loss_class(
cls_pred, cls_tar, reduction='sum') / num_tot_pos
reg_pred_list = [_ for _ in reg_pred_list if _ is not None]
reg_tar_list = [_ for _ in reg_tar_list if _ is not None]
if len(reg_pred_list) == 0:
reg_loss = bbox_preds.sum() * 0.0
else:
reg_pred = paddle.concat(reg_pred_list)
reg_tar = paddle.concat(reg_tar_list)
reg_loss = self.loss_bbox(reg_pred, reg_tar).sum() / num_tot_pos
yolof_losses = {
'loss': cls_loss + reg_loss,
'loss_cls': cls_loss,
'loss_reg': reg_loss,
}
return yolof_losses
def get_bboxes_single(self,
anchors,
cls_scores,
bbox_preds,
im_shape,
scale_factor,
rescale=True):
assert len(cls_scores) == len(bbox_preds)
mlvl_bboxes = []
mlvl_scores = []
for anchor, cls_score, bbox_pred in zip(anchors, cls_scores,
bbox_preds):
cls_score = cls_score.reshape([-1, self.num_classes])
bbox_pred = bbox_pred.reshape([-1, 4])
if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre:
max_score = cls_score.max(axis=1)
_, topk_inds = max_score.topk(self.nms_pre)
bbox_pred = bbox_pred.gather(topk_inds)
anchor = anchor.gather(topk_inds)
cls_score = cls_score.gather(topk_inds)
bbox_pred = delta2bbox_v2(
bbox_pred,
anchor,
self.delta_mean,
self.delta_std,
max_shape=im_shape,
ctr_clip=self.ctr_clip).squeeze()
mlvl_bboxes.append(bbox_pred)
mlvl_scores.append(F.sigmoid(cls_score))
mlvl_bboxes = paddle.concat(mlvl_bboxes)
mlvl_bboxes = paddle.squeeze(mlvl_bboxes)
if rescale:
mlvl_bboxes = mlvl_bboxes / paddle.concat(
[scale_factor[::-1], scale_factor[::-1]])
mlvl_scores = paddle.concat(mlvl_scores)
mlvl_scores = mlvl_scores.transpose([1, 0])
return mlvl_bboxes, mlvl_scores
def decode(self, anchors, cls_scores, bbox_preds, im_shape, scale_factor):
batch_bboxes = []
batch_scores = []
for img_id in range(cls_scores[0].shape[0]):
num_lvls = len(cls_scores)
cls_score_list = [cls_scores[i][img_id] for i in range(num_lvls)]
bbox_pred_list = [bbox_preds[i][img_id] for i in range(num_lvls)]
bboxes, scores = self.get_bboxes_single(
anchors, cls_score_list, bbox_pred_list, im_shape[img_id],
scale_factor[img_id])
batch_bboxes.append(bboxes)
batch_scores.append(scores)
batch_bboxes = paddle.stack(batch_bboxes, 0)
batch_scores = paddle.stack(batch_scores, 0)
return batch_bboxes, batch_scores
def post_process(self, head_outs, im_shape, scale_factor):
anchors, cls_scores, bbox_preds = head_outs
cls_scores = cls_scores.transpose([0, 2, 3, 1])
bbox_preds = bbox_preds.transpose([0, 2, 3, 1])
pred_bboxes, pred_scores = self.decode(
[anchors], [cls_scores], [bbox_preds], im_shape, scale_factor)
if self.exclude_nms:
            # `exclude_nms=True` is only used for benchmarking
return pred_bboxes.sum(), pred_scores.sum()
else:
bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)
return bbox_pred, bbox_num
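# A tiny sanity check for find_inside_anchor (a sketch with illustrative
# numbers): for a 19x19 feature map at stride 32 whose valid image region is
# 400x600, only ceil(400/32)=13 rows and ceil(600/32)=19 cols survive, i.e.
# 13 * 19 * num_anchors = 1235 valid positions out of 19 * 19 * 5 = 1805.
if __name__ == '__main__':
    mask = find_inside_anchor(
        feat_size=(19, 19), stride=32, num_anchors=5, im_shape=(400, 600))
    print(mask.shape, int(mask.astype('int32').sum()))  # [1805] 1235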
================================================
FILE: ppdet/modeling/initializer.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py
pytorch/pytorch is released under a BSD-style license, as found in its LICENSE file.
"""
import math
import numpy as np
import paddle
import paddle.nn as nn
__all__ = [
'uniform_',
'normal_',
'constant_',
'ones_',
'zeros_',
'xavier_uniform_',
'xavier_normal_',
'kaiming_uniform_',
'kaiming_normal_',
'linear_init_',
'conv_init_',
'reset_initialized_parameter',
]
def _no_grad_uniform_(tensor, a, b):
with paddle.no_grad():
tensor.set_value(
paddle.uniform(
shape=tensor.shape, dtype=tensor.dtype, min=a, max=b))
return tensor
def _no_grad_normal_(tensor, mean=0., std=1.):
with paddle.no_grad():
tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape))
return tensor
def _no_grad_fill_(tensor, value=0.):
with paddle.no_grad():
tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype))
return tensor
def uniform_(tensor, a, b):
"""
    Modify the tensor in-place using a uniform distribution.
Args:
tensor (paddle.Tensor): paddle Tensor
a (float|int): min value.
b (float|int): max value.
Return:
tensor
"""
return _no_grad_uniform_(tensor, a, b)
def normal_(tensor, mean=0., std=1.):
"""
    Modify the tensor in-place using a normal distribution.
Args:
tensor (paddle.Tensor): paddle Tensor
mean (float|int): mean value.
std (float|int): std value.
Return:
tensor
"""
return _no_grad_normal_(tensor, mean, std)
def constant_(tensor, value=0.):
"""
    Modify the tensor in-place by filling it with a constant value.
Args:
tensor (paddle.Tensor): paddle Tensor
value (float|int): value to fill tensor.
Return:
tensor
"""
return _no_grad_fill_(tensor, value)
def ones_(tensor):
"""
    Modify the tensor in-place by filling it with ones.
Args:
tensor (paddle.Tensor): paddle Tensor
Return:
tensor
"""
return _no_grad_fill_(tensor, 1)
def zeros_(tensor):
"""
    Modify the tensor in-place by filling it with zeros.
Args:
tensor (paddle.Tensor): paddle Tensor
Return:
tensor
"""
return _no_grad_fill_(tensor, 0)
def vector_(tensor, vector):
with paddle.no_grad():
tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype))
return tensor
def _calculate_fan_in_and_fan_out(tensor, reverse=False):
"""
    Calculate (fan_in, fan_out) for a tensor.
    Args:
        tensor (Tensor): paddle.Tensor
        reverse (bool): tensor data format order, False by default as
            [fout, fin, ...], e.g. conv.weight [cout, cin, kh, kw] is False;
            linear.weight [cin, cout] is True.
Return:
Tuple[fan_in, fan_out]
"""
if tensor.ndim < 2:
raise ValueError(
"Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
)
if reverse:
num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1]
else:
num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0]
receptive_field_size = 1
if tensor.ndim > 2:
receptive_field_size = np.prod(tensor.shape[2:])
fan_in = num_input_fmaps * receptive_field_size
fan_out = num_output_fmaps * receptive_field_size
return fan_in, fan_out
def xavier_uniform_(tensor, gain=1., reverse=False):
"""
    Modify the tensor in-place using Xavier uniform initialization.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        gain (float): scaling factor, 1. by default.
        reverse (bool): tensor data format order, False by default as
            [fout, fin, ...].
Return:
tensor
"""
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
k = math.sqrt(3.0) * std
return _no_grad_uniform_(tensor, -k, k)
def xavier_normal_(tensor, gain=1., reverse=False):
"""
    Modify the tensor in-place using Xavier normal initialization.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        gain (float): scaling factor, 1. by default.
        reverse (bool): tensor data format order, False by default as
            [fout, fin, ...].
Return:
tensor
"""
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
return _no_grad_normal_(tensor, 0, std)
# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html
def _calculate_correct_fan(tensor, mode, reverse=False):
mode = mode.lower()
valid_modes = ['fan_in', 'fan_out']
if mode not in valid_modes:
raise ValueError("Mode {} not supported, please use one of {}".format(
mode, valid_modes))
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse)
return fan_in if mode == 'fan_in' else fan_out
def _calculate_gain(nonlinearity, param=None):
linear_fns = [
'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d',
'conv_transpose2d', 'conv_transpose3d'
]
if nonlinearity in linear_fns or nonlinearity == 'sigmoid':
return 1
elif nonlinearity == 'tanh':
return 5.0 / 3
elif nonlinearity == 'relu':
return math.sqrt(2.0)
elif nonlinearity == 'leaky_relu':
if param is None:
negative_slope = 0.01
elif not isinstance(param, bool) and isinstance(
param, int) or isinstance(param, float):
# True/False are instances of int, hence check above
negative_slope = param
else:
raise ValueError("negative_slope {} not a valid number".format(
param))
return math.sqrt(2.0 / (1 + negative_slope**2))
elif nonlinearity == 'selu':
return 3.0 / 4
else:
raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
def kaiming_uniform_(tensor,
a=0,
mode='fan_in',
nonlinearity='leaky_relu',
reverse=False):
"""
    Modify the tensor in-place using Kaiming uniform initialization.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        mode (str): one of ['fan_in', 'fan_out'], 'fan_in' by default.
        nonlinearity (str): nonlinearity method name
        reverse (bool): tensor data format order, False by default as
            [fout, fin, ...].
Return:
tensor
"""
fan = _calculate_correct_fan(tensor, mode, reverse)
gain = _calculate_gain(nonlinearity, a)
std = gain / math.sqrt(fan)
k = math.sqrt(3.0) * std
return _no_grad_uniform_(tensor, -k, k)
def kaiming_normal_(tensor,
a=0,
mode='fan_in',
nonlinearity='leaky_relu',
reverse=False):
"""
    Modify the tensor in-place using Kaiming normal initialization.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        mode (str): one of ['fan_in', 'fan_out'], 'fan_in' by default.
        nonlinearity (str): nonlinearity method name
        reverse (bool): tensor data format order, False by default as
            [fout, fin, ...].
Return:
tensor
"""
fan = _calculate_correct_fan(tensor, mode, reverse)
gain = _calculate_gain(nonlinearity, a)
std = gain / math.sqrt(fan)
return _no_grad_normal_(tensor, 0, std)
def linear_init_(module):
bound = 1 / math.sqrt(module.weight.shape[0])
uniform_(module.weight, -bound, bound)
if hasattr(module, "bias") and module.bias is not None:
uniform_(module.bias, -bound, bound)
def conv_init_(module):
bound = 1 / np.sqrt(np.prod(module.weight.shape[1:]))
uniform_(module.weight, -bound, bound)
if module.bias is not None:
uniform_(module.bias, -bound, bound)
def bias_init_with_prob(prior_prob=0.01):
"""initialize conv/fc bias value according to a given probability value."""
bias_init = float(-np.log((1 - prior_prob) / prior_prob))
return bias_init
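# Derivation: with zero-initialized weights, the head's initial foreground
# probability is sigmoid(bias); solving sigmoid(b) = p for b gives
# b = -log((1 - p) / p), so prior_prob=0.01 yields b ~= -4.595.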
@paddle.no_grad()
def reset_initialized_parameter(model, include_self=True):
"""
    Re-initialize the parameters of [conv, linear, embedding, bn] sublayers
    using the default schemes below.
    Args:
        model (paddle.Layer): paddle Layer
        include_self (bool: True): include_self for the Layer.named_sublayers
            method, i.e. whether the model itself is included. True by default.
Return:
None
"""
for _, m in model.named_sublayers(include_self=include_self):
if isinstance(m, nn.Conv2D):
k = float(m._groups) / (m._in_channels * m._kernel_size[0] *
m._kernel_size[1])
k = math.sqrt(k)
_no_grad_uniform_(m.weight, -k, k)
if hasattr(m, 'bias') and getattr(m, 'bias') is not None:
_no_grad_uniform_(m.bias, -k, k)
elif isinstance(m, nn.Linear):
k = math.sqrt(1. / m.weight.shape[0])
_no_grad_uniform_(m.weight, -k, k)
if hasattr(m, 'bias') and getattr(m, 'bias') is not None:
_no_grad_uniform_(m.bias, -k, k)
elif isinstance(m, nn.Embedding):
_no_grad_normal_(m.weight, mean=0., std=1.)
elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)):
_no_grad_fill_(m.weight, 1.)
if hasattr(m, 'bias') and getattr(m, 'bias') is not None:
_no_grad_fill_(m.bias, 0)
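# Example (illustrative sketch, not part of the original module): re-initialize
# a small model with the PyTorch-style defaults implemented above.
def _example_reset_initialized_parameter():
    model = nn.Sequential(
        nn.Conv2D(3, 8, 3, padding=1), nn.BatchNorm2D(8), nn.ReLU())
    reset_initialized_parameter(model)
    return model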
================================================
FILE: ppdet/modeling/keypoint_utils.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
this code is based on https://github.com/open-mmlab/mmpose
"""
import warnings
import cv2
import numpy as np
import paddle.nn.functional as F
def get_affine_mat_kernel(h, w, s, inv=False):
if w < h:
w_ = s
h_ = int(np.ceil((s / w * h) / 64.) * 64)
scale_w = w
scale_h = h_ / w_ * w
else:
h_ = s
w_ = int(np.ceil((s / h * w) / 64.) * 64)
scale_h = h
scale_w = w_ / h_ * h
center = np.array([np.round(w / 2.), np.round(h / 2.)])
size_resized = (w_, h_)
trans = get_affine_transform(
center, np.array([scale_w, scale_h]), 0, size_resized, inv=inv)
return trans, size_resized
def get_affine_transform(center,
input_size,
rot,
output_size,
shift=(0., 0.),
inv=False):
"""Get the affine transform matrix, given the center/scale/rot/output_size.
Args:
center (np.ndarray[2, ]): Center of the bounding box (x, y).
input_size (np.ndarray[2, ]): Size of input feature (width, height).
rot (float): Rotation angle (degree).
output_size (np.ndarray[2, ]): Size of the destination heatmaps.
        shift (tuple[float]): Shift translation ratio w.r.t. the width/height,
            each in the range [0, 1]. Default (0., 0.).
inv (bool): Option to inverse the affine transform direction.
(inv=False: src->dst or inv=True: dst->src)
Returns:
np.ndarray: The transform matrix.
"""
assert len(center) == 2
assert len(output_size) == 2
assert len(shift) == 2
if not isinstance(input_size, (np.ndarray, list)):
input_size = np.array([input_size, input_size], dtype=np.float32)
scale_tmp = input_size
shift = np.array(shift)
src_w = scale_tmp[0]
dst_w = output_size[0]
dst_h = output_size[1]
rot_rad = np.pi * rot / 180
src_dir = rotate_point([0., src_w * -0.5], rot_rad)
dst_dir = np.array([0., dst_w * -0.5])
src = np.zeros((3, 2), dtype=np.float32)
src[0, :] = center + scale_tmp * shift
src[1, :] = center + src_dir + scale_tmp * shift
src[2, :] = _get_3rd_point(src[0, :], src[1, :])
dst = np.zeros((3, 2), dtype=np.float32)
dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
if inv:
trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
else:
trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
return trans
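# Example (illustrative sketch; box sizes are hypothetical): map a box of size
# (w=100, h=200) centered at (320, 240) onto a 192x256 output plane. With
# rot=0 the box center lands exactly on the output center.
def _example_get_affine_transform():
    trans = get_affine_transform(
        center=np.array([320., 240.]),
        input_size=np.array([100., 200.]),
        rot=0,
        output_size=np.array([192, 256]))
    # `affine_transform` is defined later in this module
    assert np.allclose(affine_transform([320., 240.], trans), [96., 128.])
    return trans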
def get_warp_matrix(theta, size_input, size_dst, size_target):
"""This code is based on
https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py
Calculate the transformation matrix under the constraint of unbiased.
Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased
Data Processing for Human Pose Estimation (CVPR 2020).
Args:
theta (float): Rotation angle in degrees.
size_input (np.ndarray): Size of input image [w, h].
size_dst (np.ndarray): Size of output image [w, h].
size_target (np.ndarray): Size of ROI in input plane [w, h].
Returns:
matrix (np.ndarray): A matrix for transformation.
"""
theta = np.deg2rad(theta)
matrix = np.zeros((2, 3), dtype=np.float32)
scale_x = size_dst[0] / size_target[0]
scale_y = size_dst[1] / size_target[1]
matrix[0, 0] = np.cos(theta) * scale_x
matrix[0, 1] = -np.sin(theta) * scale_x
matrix[0, 2] = scale_x * (
-0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] *
np.sin(theta) + 0.5 * size_target[0])
matrix[1, 0] = np.sin(theta) * scale_y
matrix[1, 1] = np.cos(theta) * scale_y
matrix[1, 2] = scale_y * (
-0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] *
np.cos(theta) + 0.5 * size_target[1])
return matrix
def _get_3rd_point(a, b):
"""To calculate the affine matrix, three pairs of points are required. This
function is used to get the 3rd point, given 2D points a & b.
The 3rd point is defined by rotating vector `a - b` by 90 degrees
anticlockwise, using b as the rotation center.
Args:
a (np.ndarray): point(x,y)
b (np.ndarray): point(x,y)
Returns:
np.ndarray: The 3rd point.
"""
assert len(
a) == 2, 'input of _get_3rd_point should be point with length of 2'
assert len(
b) == 2, 'input of _get_3rd_point should be point with length of 2'
direction = a - b
third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32)
return third_pt
def rotate_point(pt, angle_rad):
"""Rotate a point by an angle.
Args:
pt (list[float]): 2 dimensional point to be rotated
angle_rad (float): rotation angle by radian
Returns:
list[float]: Rotated point.
"""
assert len(pt) == 2
sn, cs = np.sin(angle_rad), np.cos(angle_rad)
new_x = pt[0] * cs - pt[1] * sn
new_y = pt[0] * sn + pt[1] * cs
rotated_pt = [new_x, new_y]
return rotated_pt
def transpred(kpts, h, w, s):
trans, _ = get_affine_mat_kernel(h, w, s, inv=True)
return warp_affine_joints(kpts[..., :2].copy(), trans)
def warp_affine_joints(joints, mat):
"""Apply affine transformation defined by the transform matrix on the
joints.
Args:
joints (np.ndarray[..., 2]): Origin coordinate of joints.
        mat (np.ndarray[2, 3]): The affine matrix.
Returns:
matrix (np.ndarray[..., 2]): Result coordinate of joints.
"""
joints = np.array(joints)
shape = joints.shape
joints = joints.reshape(-1, 2)
return np.dot(np.concatenate(
(joints, joints[:, 0:1] * 0 + 1), axis=1),
mat.T).reshape(shape)
def affine_transform(pt, t):
new_pt = np.array([pt[0], pt[1], 1.]).T
new_pt = np.dot(t, new_pt)
return new_pt[:2]
def transform_preds(coords, center, scale, output_size):
target_coords = np.zeros(coords.shape)
trans = get_affine_transform(center, scale * 200, 0, output_size, inv=1)
for p in range(coords.shape[0]):
target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans)
return target_coords
def oks_iou(g, d, a_g, a_d, sigmas=None, in_vis_thre=None):
if not isinstance(sigmas, np.ndarray):
sigmas = np.array([
.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07,
.87, .87, .89, .89
]) / 10.0
vars = (sigmas * 2)**2
xg = g[0::3]
yg = g[1::3]
vg = g[2::3]
ious = np.zeros((d.shape[0]))
for n_d in range(0, d.shape[0]):
xd = d[n_d, 0::3]
yd = d[n_d, 1::3]
vd = d[n_d, 2::3]
dx = xd - xg
dy = yd - yg
e = (dx**2 + dy**2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2
if in_vis_thre is not None:
            # keep only keypoints visible in both prediction and ground truth
            ind = np.logical_and(vg > in_vis_thre, vd > in_vis_thre)
e = e[ind]
ious[n_d] = np.sum(np.exp(-e)) / e.shape[0] if e.shape[0] != 0 else 0.0
return ious
def oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None):
"""greedily select boxes with high confidence and overlap with current maximum <= thresh
rule out overlap >= thresh
Args:
kpts_db (list): The predicted keypoints within the image
thresh (float): The threshold to select the boxes
sigmas (np.array): The variance to calculate the oks iou
Default: None
in_vis_thre (float): The threshold to select the high confidence boxes
Default: None
Return:
keep (list): indexes to keep
"""
if len(kpts_db) == 0:
return []
scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))])
kpts = np.array(
[kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))])
areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))])
order = scores.argsort()[::-1]
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]],
sigmas, in_vis_thre)
inds = np.where(oks_ovr <= thresh)[0]
order = order[inds + 1]
return keep
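# Example (illustrative sketch with synthetic data): two nearly identical
# 17-keypoint detections; oks_nms keeps only the higher-scoring one.
def _example_oks_nms():
    kpt = np.random.rand(17, 3) * 100
    kpts_db = [
        {'score': 0.9, 'keypoints': kpt, 'area': 100.0},
        {'score': 0.8, 'keypoints': kpt + 0.1, 'area': 100.0},
    ]
    keep = oks_nms(kpts_db, thresh=0.5)
    assert keep == [0]
    return keep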
def rescore(overlap, scores, thresh, type='gaussian'):
assert overlap.shape[0] == scores.shape[0]
if type == 'linear':
inds = np.where(overlap >= thresh)[0]
scores[inds] = scores[inds] * (1 - overlap[inds])
else:
scores = scores * np.exp(-overlap**2 / thresh)
return scores
def soft_oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None):
"""greedily select boxes with high confidence and overlap with current maximum <= thresh
rule out overlap >= thresh
Args:
kpts_db (list): The predicted keypoints within the image
thresh (float): The threshold to select the boxes
sigmas (np.array): The variance to calculate the oks iou
Default: None
in_vis_thre (float): The threshold to select the high confidence boxes
Default: None
Return:
keep (list): indexes to keep
"""
if len(kpts_db) == 0:
return []
scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))])
kpts = np.array(
[kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))])
areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))])
order = scores.argsort()[::-1]
scores = scores[order]
# max_dets = order.size
max_dets = 20
keep = np.zeros(max_dets, dtype=np.intp)
keep_cnt = 0
while order.size > 0 and keep_cnt < max_dets:
i = order[0]
oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]],
sigmas, in_vis_thre)
order = order[1:]
scores = rescore(oks_ovr, scores[1:], thresh)
tmp = scores.argsort()[::-1]
order = order[tmp]
scores = scores[tmp]
keep[keep_cnt] = i
keep_cnt += 1
keep = keep[:keep_cnt]
return keep
def resize(input,
size=None,
scale_factor=None,
mode='nearest',
align_corners=None,
warning=True):
if warning:
if size is not None and align_corners:
input_h, input_w = tuple(int(x) for x in input.shape[2:])
output_h, output_w = tuple(int(x) for x in size)
            if output_h > input_h or output_w > input_w:
if ((output_h > 1 and output_w > 1 and input_h > 1 and
input_w > 1) and (output_h - 1) % (input_h - 1) and
(output_w - 1) % (input_w - 1)):
                    warnings.warn(
                        f'When align_corners={align_corners}, '
                        'the output would be more aligned if '
                        f'input size {(input_h, input_w)} is `x+1` and '
                        f'out size {(output_h, output_w)} is `nx+1`')
return F.interpolate(input, size, scale_factor, mode, align_corners)
def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'):
"""Flip the flipped heatmaps back to the original form.
Note:
- batch_size: N
- num_keypoints: K
- heatmap height: H
- heatmap width: W
Args:
output_flipped (np.ndarray[N, K, H, W]): The output heatmaps obtained
from the flipped images.
        flip_pairs (list[tuple]): Pairs of keypoints which are mirrored
(for example, left ear -- right ear).
target_type (str): GaussianHeatmap or CombinedTarget
Returns:
np.ndarray: heatmaps that flipped back to the original image
"""
assert len(output_flipped.shape) == 4, \
'output_flipped should be [batch_size, num_keypoints, height, width]'
shape_ori = output_flipped.shape
channels = 1
if target_type.lower() == 'CombinedTarget'.lower():
channels = 3
output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]
output_flipped = output_flipped.reshape((shape_ori[0], -1, channels,
shape_ori[2], shape_ori[3]))
    output_flipped_back = output_flipped.copy()  # np.ndarray uses copy(), not clone()
# Swap left-right parts
for left, right in flip_pairs:
output_flipped_back[:, left, ...] = output_flipped[:, right, ...]
output_flipped_back[:, right, ...] = output_flipped[:, left, ...]
output_flipped_back = output_flipped_back.reshape(shape_ori)
# Flip horizontally
output_flipped_back = output_flipped_back[..., ::-1]
return output_flipped_back
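# Example (illustrative sketch with random heatmaps): after flip_back, channel
# `left` of the restored output equals the horizontally mirrored channel
# `right` of the flipped prediction.
def _example_flip_back():
    heatmaps = np.random.rand(1, 4, 8, 8).astype(np.float32)
    restored = flip_back(heatmaps, flip_pairs=[(0, 1), (2, 3)])
    assert np.allclose(restored[0, 0], heatmaps[0, 1, :, ::-1])
    return restored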
def _calc_distances(preds, targets, mask, normalize):
"""Calculate the normalized distances between preds and target.
Note:
batch_size: N
num_keypoints: K
dimension of keypoints: D (normally, D=2 or D=3)
Args:
preds (np.ndarray[N, K, D]): Predicted keypoint location.
targets (np.ndarray[N, K, D]): Groundtruth keypoint location.
mask (np.ndarray[N, K]): Visibility of the target. False for invisible
joints, and True for visible. Invisible joints will be ignored for
accuracy calculation.
normalize (np.ndarray[N, D]): Typical value is heatmap_size
Returns:
np.ndarray[K, N]: The normalized distances. \
If target keypoints are missing, the distance is -1.
"""
N, K, _ = preds.shape
# set mask=0 when normalize==0
_mask = mask.copy()
_mask[np.where((normalize == 0).sum(1))[0], :] = False
distances = np.full((N, K), -1, dtype=np.float32)
# handle invalid values
normalize[np.where(normalize <= 0)] = 1e6
distances[_mask] = np.linalg.norm(
((preds - targets) / normalize[:, None, :])[_mask], axis=-1)
return distances.T
def _distance_acc(distances, thr=0.5):
"""Return the percentage below the distance threshold, while ignoring
distances values with -1.
Note:
batch_size: N
Args:
distances (np.ndarray[N, ]): The normalized distances.
thr (float): Threshold of the distances.
Returns:
float: Percentage of distances below the threshold. \
If all target keypoints are missing, return -1.
"""
distance_valid = distances != -1
num_distance_valid = distance_valid.sum()
if num_distance_valid > 0:
return (distances[distance_valid] < thr).sum() / num_distance_valid
return -1
def keypoint_pck_accuracy(pred, gt, mask, thr, normalize):
"""Calculate the pose accuracy of PCK for each individual keypoint and the
averaged accuracy across all keypoints for coordinates.
Note:
PCK metric measures accuracy of the localization of the body joints.
The distances between predicted positions and the ground-truth ones
are typically normalized by the bounding box size.
The threshold (thr) of the normalized distance is commonly set
as 0.05, 0.1 or 0.2 etc.
- batch_size: N
- num_keypoints: K
Args:
pred (np.ndarray[N, K, 2]): Predicted keypoint location.
gt (np.ndarray[N, K, 2]): Groundtruth keypoint location.
mask (np.ndarray[N, K]): Visibility of the target. False for invisible
joints, and True for visible. Invisible joints will be ignored for
accuracy calculation.
thr (float): Threshold of PCK calculation.
normalize (np.ndarray[N, 2]): Normalization factor for H&W.
Returns:
tuple: A tuple containing keypoint accuracy.
- acc (np.ndarray[K]): Accuracy of each keypoint.
- avg_acc (float): Averaged accuracy across all keypoints.
- cnt (int): Number of valid keypoints.
"""
distances = _calc_distances(pred, gt, mask, normalize)
acc = np.array([_distance_acc(d, thr) for d in distances])
valid_acc = acc[acc >= 0]
cnt = len(valid_acc)
avg_acc = valid_acc.mean() if cnt > 0 else 0
return acc, avg_acc, cnt
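# Example (illustrative sketch with synthetic data): perfect predictions give
# a PCK accuracy of 1.0 for every visible keypoint (N=2 samples, K=5 joints).
def _example_keypoint_pck_accuracy():
    gt = np.random.rand(2, 5, 2) * 64
    mask = np.ones((2, 5), dtype=bool)
    normalize = np.full((2, 2), 64, dtype=np.float32)
    acc, avg_acc, cnt = keypoint_pck_accuracy(gt.copy(), gt, mask, 0.05,
                                              normalize)
    assert avg_acc == 1.0 and cnt == 5
    return acc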
def keypoint_auc(pred, gt, mask, normalize, num_step=20):
"""Calculate the pose accuracy of PCK for each individual keypoint and the
averaged accuracy across all keypoints for coordinates.
Note:
- batch_size: N
- num_keypoints: K
Args:
pred (np.ndarray[N, K, 2]): Predicted keypoint location.
gt (np.ndarray[N, K, 2]): Groundtruth keypoint location.
mask (np.ndarray[N, K]): Visibility of the target. False for invisible
joints, and True for visible. Invisible joints will be ignored for
accuracy calculation.
        normalize (float): Normalization factor.
        num_step (int): Number of threshold steps used to sample the PCK curve. Default: 20.
Returns:
float: Area under curve.
"""
nor = np.tile(np.array([[normalize, normalize]]), (pred.shape[0], 1))
x = [1.0 * i / num_step for i in range(num_step)]
y = []
for thr in x:
_, avg_acc, _ = keypoint_pck_accuracy(pred, gt, mask, thr, nor)
y.append(avg_acc)
auc = 0
for i in range(num_step):
auc += 1.0 / num_step * y[i]
return auc
def keypoint_epe(pred, gt, mask):
"""Calculate the end-point error.
Note:
- batch_size: N
- num_keypoints: K
Args:
pred (np.ndarray[N, K, 2]): Predicted keypoint location.
gt (np.ndarray[N, K, 2]): Groundtruth keypoint location.
mask (np.ndarray[N, K]): Visibility of the target. False for invisible
joints, and True for visible. Invisible joints will be ignored for
accuracy calculation.
Returns:
float: Average end-point error.
"""
normalize = np.ones((pred.shape[0], pred.shape[2]), dtype=np.float32)
distances = _calc_distances(pred, gt, mask, normalize)
distance_valid = distances[distances != -1]
return distance_valid.sum() / max(1, len(distance_valid))
================================================
FILE: ppdet/modeling/lane_utils.py
================================================
import os
import cv2
import numpy as np
from scipy.interpolate import InterpolatedUnivariateSpline
class Lane:
def __init__(self, points=None, invalid_value=-2., metadata=None):
super(Lane, self).__init__()
self.curr_iter = 0
self.points = points
self.invalid_value = invalid_value
self.function = InterpolatedUnivariateSpline(
points[:, 1], points[:, 0], k=min(3, len(points) - 1))
self.min_y = points[:, 1].min() - 0.01
self.max_y = points[:, 1].max() + 0.01
self.metadata = metadata or {}
def __repr__(self):
return '[Lane]\n' + str(self.points) + '\n[/Lane]'
def __call__(self, lane_ys):
lane_xs = self.function(lane_ys)
lane_xs[(lane_ys < self.min_y) | (lane_ys > self.max_y
)] = self.invalid_value
return lane_xs
def to_array(self, sample_y_range, img_w, img_h):
self.sample_y = range(sample_y_range[0], sample_y_range[1],
sample_y_range[2])
        sample_y = self.sample_y
ys = np.array(sample_y) / float(img_h)
xs = self(ys)
valid_mask = (xs >= 0) & (xs < 1)
lane_xs = xs[valid_mask] * img_w
lane_ys = ys[valid_mask] * img_h
lane = np.concatenate(
(lane_xs.reshape(-1, 1), lane_ys.reshape(-1, 1)), axis=1)
return lane
def __iter__(self):
return self
def __next__(self):
if self.curr_iter < len(self.points):
self.curr_iter += 1
return self.points[self.curr_iter - 1]
self.curr_iter = 0
raise StopIteration
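# Example (illustrative sketch with made-up points): a Lane stores normalized
# (x, y) points, interpolates x for arbitrary y, and to_array rescales the
# valid part to pixel coordinates.
def _example_lane():
    points = np.array([[0.2, 0.4], [0.3, 0.6], [0.4, 0.8]])  # y ascending
    lane = Lane(points)
    xs = lane(np.array([0.5, 0.9]))  # 0.9 lies outside [min_y, max_y] -> -2.
    assert abs(xs[0] - 0.25) < 1e-6 and xs[1] == -2.
    return lane.to_array((360, 720, 10), img_w=1280, img_h=720)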
COLORS = [
(255, 0, 0),
(0, 255, 0),
(0, 0, 255),
(255, 255, 0),
(255, 0, 255),
(0, 255, 255),
(128, 255, 0),
(255, 128, 0),
(128, 0, 255),
(255, 0, 128),
(0, 128, 255),
(0, 255, 128),
(128, 255, 255),
(255, 128, 255),
(255, 255, 128),
(60, 180, 0),
(180, 60, 0),
(0, 60, 180),
(0, 180, 60),
(60, 0, 180),
(180, 0, 60),
(255, 0, 0),
(0, 255, 0),
(0, 0, 255),
(255, 255, 0),
(255, 0, 255),
(0, 255, 255),
(128, 255, 0),
(255, 128, 0),
(128, 0, 255),
]
def imshow_lanes(img, lanes, show=False, out_file=None, width=4):
lanes_xys = []
for _, lane in enumerate(lanes):
xys = []
for x, y in lane:
if x <= 0 or y <= 0:
continue
x, y = int(x), int(y)
xys.append((x, y))
lanes_xys.append(xys)
lanes_xys.sort(key=lambda xys: xys[0][0] if len(xys) > 0 else 0)
for idx, xys in enumerate(lanes_xys):
for i in range(1, len(xys)):
cv2.line(img, xys[i - 1], xys[i], COLORS[idx], thickness=width)
if show:
cv2.imshow('view', img)
cv2.waitKey(0)
if out_file:
        os.makedirs(os.path.dirname(out_file) or '.', exist_ok=True)
cv2.imwrite(out_file, img)
================================================
FILE: ppdet/modeling/layers.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import six
import numpy as np
from numbers import Integral
import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle import to_tensor
import paddle.nn.functional as F
from paddle.nn.initializer import Normal, Constant, XavierUniform
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, serializable
from ppdet.modeling.bbox_utils import delta2bbox
from . import ops
from .initializer import xavier_uniform_, constant_
from paddle.vision.ops import DeformConv2D
def _to_list(l):
if isinstance(l, (list, tuple)):
return list(l)
return [l]
class AlignConv(nn.Layer):
def __init__(self, in_channels, out_channels, kernel_size=3, groups=1):
super(AlignConv, self).__init__()
self.kernel_size = kernel_size
self.align_conv = paddle.vision.ops.DeformConv2D(
in_channels,
out_channels,
kernel_size=self.kernel_size,
padding=(self.kernel_size - 1) // 2,
groups=groups,
weight_attr=ParamAttr(initializer=Normal(0, 0.01)),
bias_attr=None)
@paddle.no_grad()
def get_offset(self, anchors, featmap_size, stride):
"""
Args:
anchors: [B, L, 5] xc,yc,w,h,angle
featmap_size: (feat_h, feat_w)
stride: 8
Returns:
"""
batch = anchors.shape[0]
dtype = anchors.dtype
feat_h, feat_w = featmap_size
pad = (self.kernel_size - 1) // 2
idx = paddle.arange(-pad, pad + 1, dtype=dtype)
yy, xx = paddle.meshgrid(idx, idx)
xx = paddle.reshape(xx, [-1])
yy = paddle.reshape(yy, [-1])
# get sampling locations of default conv
xc = paddle.arange(0, feat_w, dtype=dtype)
yc = paddle.arange(0, feat_h, dtype=dtype)
yc, xc = paddle.meshgrid(yc, xc)
xc = paddle.reshape(xc, [-1, 1])
yc = paddle.reshape(yc, [-1, 1])
x_conv = xc + xx
y_conv = yc + yy
# get sampling locations of anchors
x_ctr, y_ctr, w, h, a = paddle.split(anchors, 5, axis=-1)
x_ctr = x_ctr / stride
y_ctr = y_ctr / stride
w_s = w / stride
h_s = h / stride
cos, sin = paddle.cos(a), paddle.sin(a)
dw, dh = w_s / self.kernel_size, h_s / self.kernel_size
x, y = dw * xx, dh * yy
xr = cos * x - sin * y
yr = sin * x + cos * y
x_anchor, y_anchor = xr + x_ctr, yr + y_ctr
        # get offset field
offset_x = x_anchor - x_conv
offset_y = y_anchor - y_conv
offset = paddle.stack([offset_y, offset_x], axis=-1)
offset = offset.reshape(
[batch, feat_h, feat_w, self.kernel_size * self.kernel_size * 2])
offset = offset.transpose([0, 3, 1, 2])
return offset
def forward(self, x, refine_anchors, featmap_size, stride):
        offset = self.get_offset(refine_anchors, featmap_size, stride)
if self.training:
x = F.relu(self.align_conv(x, offset.detach()))
else:
x = F.relu(self.align_conv(x, offset))
return x
class DeformableConvV2(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
weight_attr=None,
bias_attr=None,
lr_scale=1,
regularizer=None,
skip_quant=False,
dcn_bias_regularizer=L2Decay(0.),
dcn_bias_lr_scale=2.):
super(DeformableConvV2, self).__init__()
self.offset_channel = 2 * kernel_size**2
self.mask_channel = kernel_size**2
if lr_scale == 1 and regularizer is None:
offset_bias_attr = ParamAttr(initializer=Constant(0.))
else:
offset_bias_attr = ParamAttr(
initializer=Constant(0.),
learning_rate=lr_scale,
regularizer=regularizer)
self.conv_offset = nn.Conv2D(
in_channels,
3 * kernel_size**2,
kernel_size,
stride=stride,
padding=(kernel_size - 1) // 2,
weight_attr=ParamAttr(initializer=Constant(0.0)),
bias_attr=offset_bias_attr)
if skip_quant:
self.conv_offset.skip_quant = True
if bias_attr:
# in FCOS-DCN head, specifically need learning_rate and regularizer
dcn_bias_attr = ParamAttr(
initializer=Constant(value=0),
regularizer=dcn_bias_regularizer,
learning_rate=dcn_bias_lr_scale)
else:
# in ResNet backbone, do not need bias
dcn_bias_attr = False
self.conv_dcn = DeformConv2D(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=(kernel_size - 1) // 2 * dilation,
dilation=dilation,
groups=groups,
weight_attr=weight_attr,
bias_attr=dcn_bias_attr)
def forward(self, x):
offset_mask = self.conv_offset(x)
offset, mask = paddle.split(
offset_mask,
num_or_sections=[self.offset_channel, self.mask_channel],
axis=1)
mask = F.sigmoid(mask)
y = self.conv_dcn(x, offset, mask=mask)
return y
class ConvNormLayer(nn.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size,
stride,
groups=1,
norm_type='bn',
norm_decay=0.,
norm_groups=32,
use_dcn=False,
bias_on=False,
lr_scale=1.,
freeze_norm=False,
initializer=Normal(
mean=0., std=0.01),
skip_quant=False,
dcn_lr_scale=2.,
dcn_regularizer=L2Decay(0.)):
super(ConvNormLayer, self).__init__()
assert norm_type in ['bn', 'sync_bn', 'gn', None]
if bias_on:
bias_attr = ParamAttr(
initializer=Constant(value=0.), learning_rate=lr_scale)
else:
bias_attr = False
if not use_dcn:
self.conv = nn.Conv2D(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
weight_attr=ParamAttr(
initializer=initializer, learning_rate=1.),
bias_attr=bias_attr)
if skip_quant:
self.conv.skip_quant = True
else:
# in FCOS-DCN head, specifically need learning_rate and regularizer
self.conv = DeformableConvV2(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
weight_attr=ParamAttr(
initializer=initializer, learning_rate=1.),
bias_attr=True,
lr_scale=dcn_lr_scale,
regularizer=dcn_regularizer,
dcn_bias_regularizer=dcn_regularizer,
dcn_bias_lr_scale=dcn_lr_scale,
skip_quant=skip_quant)
norm_lr = 0. if freeze_norm else 1.
param_attr = ParamAttr(
learning_rate=norm_lr,
regularizer=L2Decay(norm_decay) if norm_decay is not None else None)
bias_attr = ParamAttr(
learning_rate=norm_lr,
regularizer=L2Decay(norm_decay) if norm_decay is not None else None)
if norm_type in ['bn', 'sync_bn']:
self.norm = nn.BatchNorm2D(
ch_out, weight_attr=param_attr, bias_attr=bias_attr)
elif norm_type == 'gn':
self.norm = nn.GroupNorm(
num_groups=norm_groups,
num_channels=ch_out,
weight_attr=param_attr,
bias_attr=bias_attr)
else:
self.norm = None
def forward(self, inputs):
out = self.conv(inputs)
if self.norm is not None:
out = self.norm(out)
return out
class LiteConv(nn.Layer):
def __init__(self,
in_channels,
out_channels,
stride=1,
with_act=True,
norm_type='sync_bn',
name=None):
super(LiteConv, self).__init__()
self.lite_conv = nn.Sequential()
conv1 = ConvNormLayer(
in_channels,
in_channels,
filter_size=5,
stride=stride,
groups=in_channels,
norm_type=norm_type,
initializer=XavierUniform())
conv2 = ConvNormLayer(
in_channels,
out_channels,
filter_size=1,
stride=stride,
norm_type=norm_type,
initializer=XavierUniform())
conv3 = ConvNormLayer(
out_channels,
out_channels,
filter_size=1,
stride=stride,
norm_type=norm_type,
initializer=XavierUniform())
conv4 = ConvNormLayer(
out_channels,
out_channels,
filter_size=5,
stride=stride,
groups=out_channels,
norm_type=norm_type,
initializer=XavierUniform())
conv_list = [conv1, conv2, conv3, conv4]
self.lite_conv.add_sublayer('conv1', conv1)
self.lite_conv.add_sublayer('relu6_1', nn.ReLU6())
self.lite_conv.add_sublayer('conv2', conv2)
if with_act:
self.lite_conv.add_sublayer('relu6_2', nn.ReLU6())
self.lite_conv.add_sublayer('conv3', conv3)
self.lite_conv.add_sublayer('relu6_3', nn.ReLU6())
self.lite_conv.add_sublayer('conv4', conv4)
if with_act:
self.lite_conv.add_sublayer('relu6_4', nn.ReLU6())
def forward(self, inputs):
out = self.lite_conv(inputs)
return out
class DropBlock(nn.Layer):
def __init__(self, block_size, keep_prob, name=None, data_format='NCHW'):
"""
DropBlock layer, see https://arxiv.org/abs/1810.12890
Args:
block_size (int): block size
            keep_prob (float): keep probability
name (str): layer name
data_format (str): data format, NCHW or NHWC
"""
super(DropBlock, self).__init__()
self.block_size = block_size
self.keep_prob = keep_prob
self.name = name
self.data_format = data_format
def forward(self, x):
if not self.training or self.keep_prob == 1:
return x
else:
gamma = (1. - self.keep_prob) / (self.block_size**2)
if self.data_format == 'NCHW':
shape = x.shape[2:]
else:
shape = x.shape[1:3]
for s in shape:
gamma *= s / (s - self.block_size + 1)
matrix = paddle.cast(paddle.rand(x.shape) < gamma, x.dtype)
mask_inv = F.max_pool2d(
matrix,
self.block_size,
stride=1,
padding=self.block_size // 2,
data_format=self.data_format)
mask = 1. - mask_inv
mask = mask.astype('float32')
x = x.astype('float32')
y = x * mask * (mask.numel() / mask.sum())
return y
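# Example (illustrative sketch): DropBlock only takes effect in training mode;
# in eval mode the input passes through unchanged.
def _example_dropblock():
    db = DropBlock(block_size=3, keep_prob=0.9)
    x = paddle.rand([2, 8, 32, 32])
    db.eval()
    assert paddle.allclose(db(x), x).item()
    db.train()
    return db(x)  # zeroes random 3x3 regions and rescales the remainder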
@register
@serializable
class AnchorGeneratorSSD(object):
def __init__(self,
steps=[8, 16, 32, 64, 100, 300],
aspect_ratios=[[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]],
min_ratio=15,
max_ratio=90,
base_size=300,
min_sizes=[30.0, 60.0, 111.0, 162.0, 213.0, 264.0],
max_sizes=[60.0, 111.0, 162.0, 213.0, 264.0, 315.0],
offset=0.5,
flip=True,
clip=False,
min_max_aspect_ratios_order=False):
self.steps = steps
self.aspect_ratios = aspect_ratios
self.min_ratio = min_ratio
self.max_ratio = max_ratio
self.base_size = base_size
self.min_sizes = min_sizes
self.max_sizes = max_sizes
self.offset = offset
self.flip = flip
self.clip = clip
self.min_max_aspect_ratios_order = min_max_aspect_ratios_order
if self.min_sizes == [] and self.max_sizes == []:
num_layer = len(aspect_ratios)
            step = int(
                math.floor((self.max_ratio - self.min_ratio) / (num_layer - 2)))
for ratio in six.moves.range(self.min_ratio, self.max_ratio + 1,
step):
self.min_sizes.append(self.base_size * ratio / 100.)
self.max_sizes.append(self.base_size * (ratio + step) / 100.)
self.min_sizes = [self.base_size * .10] + self.min_sizes
self.max_sizes = [self.base_size * .20] + self.max_sizes
self.num_priors = []
for aspect_ratio, min_size, max_size in zip(
aspect_ratios, self.min_sizes, self.max_sizes):
if isinstance(min_size, (list, tuple)):
self.num_priors.append(
len(_to_list(min_size)) + len(_to_list(max_size)))
else:
self.num_priors.append((len(aspect_ratio) * 2 + 1) * len(
_to_list(min_size)) + len(_to_list(max_size)))
def __call__(self, inputs, image):
boxes = []
for input, min_size, max_size, aspect_ratio, step in zip(
inputs, self.min_sizes, self.max_sizes, self.aspect_ratios,
self.steps):
box, _ = ops.prior_box(
input=input,
image=image,
min_sizes=_to_list(min_size),
max_sizes=_to_list(max_size),
aspect_ratios=aspect_ratio,
flip=self.flip,
clip=self.clip,
steps=[step, step],
offset=self.offset,
min_max_aspect_ratios_order=self.min_max_aspect_ratios_order)
boxes.append(paddle.reshape(box, [-1, 4]))
return boxes
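# Example (illustrative sketch): with the default SSD300-style settings the
# prior count per location is (2 * len(aspect_ratio) + 1) boxes from min_size
# plus one extra box from max_size.
def _example_anchor_generator_ssd():
    gen = AnchorGeneratorSSD()
    assert gen.num_priors == [4, 6, 6, 6, 4, 4]
    return gen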
@register
@serializable
class RCNNBox(object):
__shared__ = ['num_classes', 'export_onnx']
def __init__(self,
prior_box_var=[10., 10., 5., 5.],
code_type="decode_center_size",
box_normalized=False,
num_classes=80,
export_onnx=False):
super(RCNNBox, self).__init__()
self.prior_box_var = prior_box_var
self.code_type = code_type
self.box_normalized = box_normalized
self.num_classes = num_classes
self.export_onnx = export_onnx
def __call__(self, bbox_head_out, rois, im_shape, scale_factor):
bbox_pred = bbox_head_out[0]
cls_prob = bbox_head_out[1]
roi = rois[0]
rois_num = rois[1]
if self.export_onnx:
onnx_rois_num_per_im = rois_num[0]
origin_shape = paddle.expand(im_shape[0, :],
[onnx_rois_num_per_im, 2])
else:
origin_shape_list = []
if isinstance(roi, list):
batch_size = len(roi)
else:
batch_size = paddle.slice(paddle.shape(im_shape), [0], [0], [1])
# bbox_pred.shape: [N, C*4]
for idx in range(batch_size):
rois_num_per_im = rois_num[idx]
expand_im_shape = paddle.expand(im_shape[idx, :],
[rois_num_per_im, 2])
origin_shape_list.append(expand_im_shape)
origin_shape = paddle.concat(origin_shape_list)
# bbox_pred.shape: [N, C*4]
# C=num_classes in faster/mask rcnn(bbox_head), C=1 in cascade rcnn(cascade_head)
bbox = paddle.concat(roi)
bbox = delta2bbox(bbox_pred, bbox, self.prior_box_var)
scores = cls_prob[:, :-1]
# bbox.shape: [N, C, 4]
# bbox.shape[1] must be equal to scores.shape[1]
total_num = bbox.shape[0]
bbox_dim = bbox.shape[-1]
bbox = paddle.expand(bbox, [total_num, self.num_classes, bbox_dim])
origin_h = paddle.unsqueeze(origin_shape[:, 0], axis=1)
origin_w = paddle.unsqueeze(origin_shape[:, 1], axis=1)
zeros = paddle.zeros_like(origin_h)
x1 = paddle.maximum(paddle.minimum(bbox[:, :, 0], origin_w), zeros)
y1 = paddle.maximum(paddle.minimum(bbox[:, :, 1], origin_h), zeros)
x2 = paddle.maximum(paddle.minimum(bbox[:, :, 2], origin_w), zeros)
y2 = paddle.maximum(paddle.minimum(bbox[:, :, 3], origin_h), zeros)
bbox = paddle.stack([x1, y1, x2, y2], axis=-1)
bboxes = (bbox, rois_num)
return bboxes, scores
@register
@serializable
class MultiClassNMS(object):
def __init__(self,
score_threshold=.05,
nms_top_k=-1,
keep_top_k=100,
nms_threshold=.5,
normalized=True,
nms_eta=1.0,
return_index=False,
return_rois_num=True,
trt=False,
cpu=False):
super(MultiClassNMS, self).__init__()
self.score_threshold = score_threshold
self.nms_top_k = nms_top_k
self.keep_top_k = keep_top_k
self.nms_threshold = nms_threshold
self.normalized = normalized
self.nms_eta = nms_eta
self.return_index = return_index
self.return_rois_num = return_rois_num
self.trt = trt
self.cpu = cpu
def __call__(self, bboxes, score, background_label=-1):
"""
bboxes (Tensor|List[Tensor]): 1. (Tensor) Predicted bboxes with shape
[N, M, 4], N is the batch size and M
is the number of bboxes
2. (List[Tensor]) bboxes and bbox_num,
bboxes have shape of [M, C, 4], C
is the class number and bbox_num means
the number of bboxes of each batch with
shape [N,]
score (Tensor): Predicted scores with shape [N, C, M] or [M, C]
            background_label (int): Label index to ignore as background. For
                example, it is num_classes for RCNN and -1 for YOLO.
"""
kwargs = self.__dict__.copy()
if isinstance(bboxes, tuple):
bboxes, bbox_num = bboxes
kwargs.update({'rois_num': bbox_num})
if background_label > -1:
kwargs.update({'background_label': background_label})
kwargs.pop('trt')
kwargs.pop('cpu')
# TODO(wangxinxin08): paddle version should be develop or 2.3 and above to run nms on tensorrt
if self.trt and (int(paddle.version.major) == 0 or
(int(paddle.version.major) >= 2 and
int(paddle.version.minor) >= 3)):
# TODO(wangxinxin08): tricky switch to run nms on tensorrt
kwargs.update({'nms_eta': 1.1})
bbox, bbox_num, _ = ops.multiclass_nms(bboxes, score, **kwargs)
bbox = bbox.reshape([1, -1, 6])
idx = paddle.nonzero(bbox[..., 0] != -1)
bbox = paddle.gather_nd(bbox, idx)
return bbox, bbox_num, None
else:
if self.cpu:
device = paddle.device.get_device()
paddle.set_device('cpu')
outputs = ops.multiclass_nms(bboxes, score, **kwargs)
paddle.set_device(device)
return outputs
else:
return ops.multiclass_nms(bboxes, score, **kwargs)
@register
@serializable
class MatrixNMS(object):
__append_doc__ = True
def __init__(self,
score_threshold=.05,
post_threshold=.05,
nms_top_k=-1,
keep_top_k=100,
use_gaussian=False,
gaussian_sigma=2.,
normalized=False,
background_label=0):
super(MatrixNMS, self).__init__()
self.score_threshold = score_threshold
self.post_threshold = post_threshold
self.nms_top_k = nms_top_k
self.keep_top_k = keep_top_k
self.normalized = normalized
self.use_gaussian = use_gaussian
self.gaussian_sigma = gaussian_sigma
self.background_label = background_label
def __call__(self, bbox, score, *args):
return ops.matrix_nms(
bboxes=bbox,
scores=score,
score_threshold=self.score_threshold,
post_threshold=self.post_threshold,
nms_top_k=self.nms_top_k,
keep_top_k=self.keep_top_k,
use_gaussian=self.use_gaussian,
gaussian_sigma=self.gaussian_sigma,
background_label=self.background_label,
normalized=self.normalized)
@register
@serializable
class YOLOBox(object):
__shared__ = ['num_classes']
def __init__(self,
num_classes=80,
conf_thresh=0.005,
downsample_ratio=32,
clip_bbox=True,
scale_x_y=1.):
self.num_classes = num_classes
self.conf_thresh = conf_thresh
self.downsample_ratio = downsample_ratio
self.clip_bbox = clip_bbox
self.scale_x_y = scale_x_y
def __call__(self,
yolo_head_out,
anchors,
im_shape,
scale_factor,
var_weight=None):
boxes_list = []
scores_list = []
origin_shape = im_shape / scale_factor
origin_shape = paddle.cast(origin_shape, 'int32')
for i, head_out in enumerate(yolo_head_out):
boxes, scores = paddle.vision.ops.yolo_box(
head_out,
origin_shape,
anchors[i],
self.num_classes,
self.conf_thresh,
self.downsample_ratio // 2**i,
self.clip_bbox,
scale_x_y=self.scale_x_y)
boxes_list.append(boxes)
scores_list.append(paddle.transpose(scores, perm=[0, 2, 1]))
yolo_boxes = paddle.concat(boxes_list, axis=1)
yolo_scores = paddle.concat(scores_list, axis=2)
return yolo_boxes, yolo_scores
@register
@serializable
class SSDBox(object):
def __init__(self,
is_normalized=True,
prior_box_var=[0.1, 0.1, 0.2, 0.2],
use_fuse_decode=False):
self.is_normalized = is_normalized
self.norm_delta = float(not self.is_normalized)
self.prior_box_var = prior_box_var
self.use_fuse_decode = use_fuse_decode
def __call__(self,
preds,
prior_boxes,
im_shape,
scale_factor,
var_weight=None):
boxes, scores = preds
boxes = paddle.concat(boxes, axis=1)
prior_boxes = paddle.concat(prior_boxes)
if self.use_fuse_decode:
output_boxes = ops.box_coder(
prior_boxes,
self.prior_box_var,
boxes,
code_type="decode_center_size",
box_normalized=self.is_normalized)
else:
pb_w = prior_boxes[:, 2] - prior_boxes[:, 0] + self.norm_delta
pb_h = prior_boxes[:, 3] - prior_boxes[:, 1] + self.norm_delta
pb_x = prior_boxes[:, 0] + pb_w * 0.5
pb_y = prior_boxes[:, 1] + pb_h * 0.5
out_x = pb_x + boxes[:, :, 0] * pb_w * self.prior_box_var[0]
out_y = pb_y + boxes[:, :, 1] * pb_h * self.prior_box_var[1]
out_w = paddle.exp(boxes[:, :, 2] * self.prior_box_var[2]) * pb_w
out_h = paddle.exp(boxes[:, :, 3] * self.prior_box_var[3]) * pb_h
output_boxes = paddle.stack(
[
out_x - out_w / 2., out_y - out_h / 2., out_x + out_w / 2.,
out_y + out_h / 2.
],
axis=-1)
if self.is_normalized:
h = (im_shape[:, 0] / scale_factor[:, 0]).unsqueeze(-1)
w = (im_shape[:, 1] / scale_factor[:, 1]).unsqueeze(-1)
im_shape = paddle.stack([w, h, w, h], axis=-1)
output_boxes *= im_shape
else:
output_boxes[..., -2:] -= 1.0
output_scores = F.softmax(paddle.concat(
scores, axis=1)).transpose([0, 2, 1])
return output_boxes, output_scores
@register
class TTFBox(object):
__shared__ = ['down_ratio']
def __init__(self, max_per_img=100, score_thresh=0.01, down_ratio=4):
super(TTFBox, self).__init__()
self.max_per_img = max_per_img
self.score_thresh = score_thresh
self.down_ratio = down_ratio
def _simple_nms(self, heat, kernel=3):
"""
Use maxpool to filter the max score, get local peaks.
"""
pad = (kernel - 1) // 2
hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad)
keep = paddle.cast(hmax == heat, 'float32')
return heat * keep
def _topk(self, scores):
"""
Select top k scores and decode to get xy coordinates.
"""
k = self.max_per_img
shape_fm = paddle.shape(scores)
shape_fm.stop_gradient = True
cat, height, width = shape_fm[1], shape_fm[2], shape_fm[3]
# batch size is 1
scores_r = paddle.reshape(scores, [cat, -1])
topk_scores, topk_inds = paddle.topk(scores_r, k)
topk_ys = topk_inds // width
topk_xs = topk_inds % width
topk_score_r = paddle.reshape(topk_scores, [-1])
topk_score, topk_ind = paddle.topk(topk_score_r, k)
k_t = paddle.full(topk_ind.shape, k, dtype='int64')
topk_clses = paddle.cast(paddle.floor_divide(topk_ind, k_t), 'float32')
topk_inds = paddle.reshape(topk_inds, [-1])
topk_ys = paddle.reshape(topk_ys, [-1, 1])
topk_xs = paddle.reshape(topk_xs, [-1, 1])
topk_inds = paddle.gather(topk_inds, topk_ind)
topk_ys = paddle.gather(topk_ys, topk_ind)
topk_xs = paddle.gather(topk_xs, topk_ind)
return topk_score, topk_inds, topk_clses, topk_ys, topk_xs
def _decode(self, hm, wh, im_shape, scale_factor):
heatmap = F.sigmoid(hm)
heat = self._simple_nms(heatmap)
scores, inds, clses, ys, xs = self._topk(heat)
ys = paddle.cast(ys, 'float32') * self.down_ratio
xs = paddle.cast(xs, 'float32') * self.down_ratio
scores = paddle.tensor.unsqueeze(scores, [1])
clses = paddle.tensor.unsqueeze(clses, [1])
wh_t = paddle.transpose(wh, [0, 2, 3, 1])
wh = paddle.reshape(wh_t, [-1, wh_t.shape[-1]])
wh = paddle.gather(wh, inds)
x1 = xs - wh[:, 0:1]
y1 = ys - wh[:, 1:2]
x2 = xs + wh[:, 2:3]
y2 = ys + wh[:, 3:4]
bboxes = paddle.concat([x1, y1, x2, y2], axis=1)
scale_y = scale_factor[:, 0:1]
scale_x = scale_factor[:, 1:2]
scale_expand = paddle.concat(
[scale_x, scale_y, scale_x, scale_y], axis=1)
boxes_shape = paddle.shape(bboxes)
boxes_shape.stop_gradient = True
scale_expand = paddle.expand(scale_expand, shape=boxes_shape)
bboxes = paddle.divide(bboxes, scale_expand)
results = paddle.concat([clses, scores, bboxes], axis=1)
        # hack: append a fake result with cls=-1 and score=1. so that gather
        # does not fail when every score is below score_thresh.
fill_r = paddle.to_tensor(np.array([[-1, 1, 0, 0, 0, 0]]))
fill_r = paddle.cast(fill_r, results.dtype)
results = paddle.concat([results, fill_r])
scores = results[:, 1]
valid_ind = paddle.nonzero(scores > self.score_thresh)
results = paddle.gather(results, valid_ind)
return results, results.shape[0:1]
def __call__(self, hm, wh, im_shape, scale_factor):
results = []
results_num = []
for i in range(scale_factor.shape[0]):
result, num = self._decode(hm[i:i + 1, ], wh[i:i + 1, ],
im_shape[i:i + 1, ],
scale_factor[i:i + 1, ])
results.append(result)
results_num.append(num)
results = paddle.concat(results, axis=0)
results_num = paddle.concat(results_num, axis=0)
return results, results_num
@register
@serializable
class JDEBox(object):
__shared__ = ['num_classes']
def __init__(self, num_classes=1, conf_thresh=0.3, downsample_ratio=32):
self.num_classes = num_classes
self.conf_thresh = conf_thresh
self.downsample_ratio = downsample_ratio
def generate_anchor(self, nGh, nGw, anchor_wh):
nA = len(anchor_wh)
yv, xv = paddle.meshgrid([paddle.arange(nGh), paddle.arange(nGw)])
mesh = paddle.stack(
(xv, yv), axis=0).cast(dtype='float32') # 2 x nGh x nGw
meshs = paddle.tile(mesh, [nA, 1, 1, 1])
anchor_offset_mesh = anchor_wh[:, :, None][:, :, :, None].repeat(
int(nGh), axis=-2).repeat(
int(nGw), axis=-1)
anchor_offset_mesh = paddle.to_tensor(
anchor_offset_mesh.astype(np.float32))
# nA x 2 x nGh x nGw
anchor_mesh = paddle.concat([meshs, anchor_offset_mesh], axis=1)
anchor_mesh = paddle.transpose(anchor_mesh,
[0, 2, 3, 1]) # (nA x nGh x nGw) x 4
return anchor_mesh
def decode_delta(self, delta, fg_anchor_list):
px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \
fg_anchor_list[:, 2], fg_anchor_list[:,3]
dx, dy, dw, dh = delta[:, 0], delta[:, 1], delta[:, 2], delta[:, 3]
gx = pw * dx + px
gy = ph * dy + py
gw = pw * paddle.exp(dw)
gh = ph * paddle.exp(dh)
gx1 = gx - gw * 0.5
gy1 = gy - gh * 0.5
gx2 = gx + gw * 0.5
gy2 = gy + gh * 0.5
return paddle.stack([gx1, gy1, gx2, gy2], axis=1)
def decode_delta_map(self, nA, nGh, nGw, delta_map, anchor_vec):
anchor_mesh = self.generate_anchor(nGh, nGw, anchor_vec)
anchor_mesh = paddle.unsqueeze(anchor_mesh, 0)
pred_list = self.decode_delta(
paddle.reshape(
delta_map, shape=[-1, 4]),
paddle.reshape(
anchor_mesh, shape=[-1, 4]))
pred_map = paddle.reshape(pred_list, shape=[nA * nGh * nGw, 4])
return pred_map
def _postprocessing_by_level(self, nA, stride, head_out, anchor_vec):
boxes_shape = head_out.shape # [nB, nA*6, nGh, nGw]
nGh, nGw = boxes_shape[-2], boxes_shape[-1]
nB = 1 # TODO: only support bs=1 now
boxes_list, scores_list = [], []
for idx in range(nB):
p = paddle.reshape(
head_out[idx], shape=[nA, self.num_classes + 5, nGh, nGw])
p = paddle.transpose(p, perm=[0, 2, 3, 1]) # [nA, nGh, nGw, 6]
delta_map = p[:, :, :, :4]
boxes = self.decode_delta_map(nA, nGh, nGw, delta_map, anchor_vec)
# [nA * nGh * nGw, 4]
boxes_list.append(boxes * stride)
p_conf = paddle.transpose(
p[:, :, :, 4:6], perm=[3, 0, 1, 2]) # [2, nA, nGh, nGw]
p_conf = F.softmax(
p_conf, axis=0)[1, :, :, :].unsqueeze(-1) # [nA, nGh, nGw, 1]
scores = paddle.reshape(p_conf, shape=[nA * nGh * nGw, 1])
scores_list.append(scores)
boxes_results = paddle.stack(boxes_list)
scores_results = paddle.stack(scores_list)
return boxes_results, scores_results
def __call__(self, yolo_head_out, anchors):
bbox_pred_list = []
for i, head_out in enumerate(yolo_head_out):
stride = self.downsample_ratio // 2**i
anc_w, anc_h = anchors[i][0::2], anchors[i][1::2]
anchor_vec = np.stack((anc_w, anc_h), axis=1) / stride
nA = len(anc_w)
boxes, scores = self._postprocessing_by_level(nA, stride, head_out,
anchor_vec)
bbox_pred_list.append(paddle.concat([boxes, scores], axis=-1))
yolo_boxes_scores = paddle.concat(bbox_pred_list, axis=1)
boxes_idx_over_conf_thr = paddle.nonzero(
yolo_boxes_scores[:, :, -1] > self.conf_thresh)
boxes_idx_over_conf_thr.stop_gradient = True
return boxes_idx_over_conf_thr, yolo_boxes_scores
@register
@serializable
class MaskMatrixNMS(object):
"""
Matrix NMS for multi-class masks.
Args:
        update_threshold (float): Threshold of category scores after score decay.
        pre_nms_top_n (int): Number of total instances to be kept per image before NMS.
        post_nms_top_n (int): Number of total instances to be kept per image after NMS.
kernel (str): 'linear' or 'gaussian'.
sigma (float): std in gaussian method.
Input:
seg_preds (Variable): shape (n, h, w), segmentation feature maps
seg_masks (Variable): shape (n, h, w), segmentation feature maps
cate_labels (Variable): shape (n), mask labels in descending order
cate_scores (Variable): shape (n), mask scores in descending order
sum_masks (Variable): a float tensor of the sum of seg_masks
Returns:
Variable: cate_scores, tensors of shape (n)
"""
def __init__(self,
update_threshold=0.05,
pre_nms_top_n=500,
post_nms_top_n=100,
kernel='gaussian',
sigma=2.0):
super(MaskMatrixNMS, self).__init__()
self.update_threshold = update_threshold
self.pre_nms_top_n = pre_nms_top_n
self.post_nms_top_n = post_nms_top_n
self.kernel = kernel
self.sigma = sigma
def _sort_score(self, scores, top_num):
if scores.shape[0] > top_num:
return paddle.topk(scores, top_num)[1]
else:
return paddle.argsort(scores, descending=True)
def __call__(self,
seg_preds,
seg_masks,
cate_labels,
cate_scores,
sum_masks=None):
# sort and keep top nms_pre
sort_inds = self._sort_score(cate_scores, self.pre_nms_top_n)
seg_masks = paddle.gather(seg_masks, index=sort_inds)
seg_preds = paddle.gather(seg_preds, index=sort_inds)
sum_masks = paddle.gather(sum_masks, index=sort_inds)
cate_scores = paddle.gather(cate_scores, index=sort_inds)
cate_labels = paddle.gather(cate_labels, index=sort_inds)
seg_masks = paddle.flatten(seg_masks, start_axis=1, stop_axis=-1)
# inter.
inter_matrix = paddle.mm(seg_masks, paddle.transpose(seg_masks, [1, 0]))
n_samples = cate_labels.shape
n_samples = paddle.to_tensor(n_samples, dtype="int32")
# union.
sum_masks_x = paddle.expand(sum_masks, shape=[n_samples, n_samples])
# iou.
iou_matrix = (inter_matrix / (
sum_masks_x + paddle.transpose(sum_masks_x, [1, 0]) - inter_matrix))
iou_matrix = paddle.triu(iou_matrix, diagonal=1)
# label_specific matrix.
cate_labels_x = paddle.expand(cate_labels, shape=[n_samples, n_samples])
label_matrix = paddle.cast(
(cate_labels_x == paddle.transpose(cate_labels_x, [1, 0])),
'float32')
label_matrix = paddle.triu(label_matrix, diagonal=1)
# IoU compensation
compensate_iou = paddle.max((iou_matrix * label_matrix), axis=0)
compensate_iou = paddle.expand(
compensate_iou, shape=[n_samples, n_samples])
compensate_iou = paddle.transpose(compensate_iou, [1, 0])
# IoU decay
decay_iou = iou_matrix * label_matrix
# matrix nms
if self.kernel == 'gaussian':
decay_matrix = paddle.exp(-1 * self.sigma * (decay_iou**2))
compensate_matrix = paddle.exp(-1 * self.sigma *
(compensate_iou**2))
decay_coefficient = paddle.min(decay_matrix / compensate_matrix,
axis=0)
elif self.kernel == 'linear':
decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
decay_coefficient = paddle.min(decay_matrix, axis=0)
else:
raise NotImplementedError
# update the score.
cate_scores = cate_scores * decay_coefficient
y = paddle.zeros(shape=cate_scores.shape, dtype='float32')
keep = paddle.where(cate_scores >= self.update_threshold, cate_scores,
y)
keep = paddle.nonzero(keep)
keep = paddle.squeeze(keep, axis=[1])
        # prevent an empty result: always append one fake index (the last one)
keep = paddle.concat(
[keep, paddle.cast(paddle.shape(cate_scores)[0:1] - 1, 'int64')])
seg_preds = paddle.gather(seg_preds, index=keep)
cate_scores = paddle.gather(cate_scores, index=keep)
cate_labels = paddle.gather(cate_labels, index=keep)
# sort and keep top_k
sort_inds = self._sort_score(cate_scores, self.post_nms_top_n)
seg_preds = paddle.gather(seg_preds, index=sort_inds)
cate_scores = paddle.gather(cate_scores, index=sort_inds)
cate_labels = paddle.gather(cate_labels, index=sort_inds)
return seg_preds, cate_scores, cate_labels
def Conv2d(in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
weight_init=Normal(std=0.001),
bias_init=Constant(0.)):
weight_attr = paddle.framework.ParamAttr(initializer=weight_init)
if bias:
bias_attr = paddle.framework.ParamAttr(initializer=bias_init)
else:
bias_attr = False
conv = nn.Conv2D(
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
weight_attr=weight_attr,
bias_attr=bias_attr)
return conv
def ConvTranspose2d(in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
output_padding=0,
groups=1,
bias=True,
dilation=1,
weight_init=Normal(std=0.001),
bias_init=Constant(0.)):
weight_attr = paddle.framework.ParamAttr(initializer=weight_init)
if bias:
bias_attr = paddle.framework.ParamAttr(initializer=bias_init)
else:
bias_attr = False
conv = nn.Conv2DTranspose(
in_channels,
out_channels,
kernel_size,
stride,
padding,
output_padding,
dilation,
groups,
weight_attr=weight_attr,
bias_attr=bias_attr)
return conv
def BatchNorm2d(num_features, eps=1e-05, momentum=0.9, affine=True):
if not affine:
weight_attr = False
bias_attr = False
else:
weight_attr = None
bias_attr = None
batchnorm = nn.BatchNorm2D(
num_features,
momentum,
eps,
weight_attr=weight_attr,
bias_attr=bias_attr)
return batchnorm
def ReLU():
return nn.ReLU()
def Upsample(scale_factor=None, mode='nearest', align_corners=False):
return nn.Upsample(None, scale_factor, mode, align_corners)
def MaxPool(kernel_size, stride, padding, ceil_mode=False):
return nn.MaxPool2D(kernel_size, stride, padding, ceil_mode=ceil_mode)
class Concat(nn.Layer):
def __init__(self, dim=0):
super(Concat, self).__init__()
self.dim = dim
def forward(self, inputs):
return paddle.concat(inputs, axis=self.dim)
def extra_repr(self):
return 'dim={}'.format(self.dim)
def _convert_attention_mask(attn_mask, dtype):
"""
Convert the attention mask to the target dtype we expect.
Parameters:
        attn_mask (Tensor, optional): A tensor used in multi-head attention
            to prevent attention to some unwanted positions, usually the
            paddings or the subsequent positions. It is a tensor with shape
            broadcastable to `[batch_size, n_head, sequence_length, sequence_length]`.
            When the data type is bool, the unwanted positions have `False`
            values and the others have `True` values. When the data type is
            int, the unwanted positions have 0 values and the others have 1
            values. When the data type is float, the unwanted positions have
            `-INF` values and the others have 0 values. It can be None when
            no position needs to be masked. Default None.
dtype (VarType): The target type of `attn_mask` we expect.
Returns:
Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`.
"""
return nn.layer.transformer._convert_attention_mask(attn_mask, dtype)
@register
class MultiHeadAttention(nn.Layer):
"""
    Attention maps queries and a set of key-value pairs to outputs, and
    Multi-Head Attention performs multiple parallel attentions to jointly attend
    to information from different representation subspaces.
    Please refer to `Attention Is All You Need
    <https://arxiv.org/abs/1706.03762>`_ for more details.
Parameters:
embed_dim (int): The expected feature size in the input and output.
num_heads (int): The number of heads in multi-head attention.
dropout (float, optional): The dropout probability used on attention
weights to drop some attention targets. 0 for no dropout. Default 0
kdim (int, optional): The feature size in key. If None, assumed equal to
`embed_dim`. Default None.
vdim (int, optional): The feature size in value. If None, assumed equal to
`embed_dim`. Default None.
need_weights (bool, optional): Indicate whether to return the attention
weights. Default False.
Examples:
.. code-block:: python
import paddle
# encoder input: [batch_size, sequence_length, d_model]
query = paddle.rand((2, 4, 128))
# self attention mask: [batch_size, num_heads, query_len, query_len]
attn_mask = paddle.rand((2, 2, 4, 4))
multi_head_attn = paddle.nn.MultiHeadAttention(128, 2)
output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128]
"""
def __init__(self,
embed_dim,
num_heads,
dropout=0.,
kdim=None,
vdim=None,
need_weights=False):
super(MultiHeadAttention, self).__init__()
self.embed_dim = embed_dim
self.kdim = kdim if kdim is not None else embed_dim
self.vdim = vdim if vdim is not None else embed_dim
self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.need_weights = need_weights
self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
if self._qkv_same_embed_dim:
self.in_proj_weight = self.create_parameter(
shape=[embed_dim, 3 * embed_dim],
attr=None,
dtype=self._dtype,
is_bias=False)
self.in_proj_bias = self.create_parameter(
shape=[3 * embed_dim],
attr=None,
dtype=self._dtype,
is_bias=True)
else:
self.q_proj = nn.Linear(embed_dim, embed_dim)
self.k_proj = nn.Linear(self.kdim, embed_dim)
self.v_proj = nn.Linear(self.vdim, embed_dim)
self.out_proj = nn.Linear(embed_dim, embed_dim)
self._type_list = ('q_proj', 'k_proj', 'v_proj')
self._reset_parameters()
def _reset_parameters(self):
for p in self.parameters():
if p.dim() > 1:
xavier_uniform_(p)
else:
constant_(p)
def compute_qkv(self, tensor, index):
if self._qkv_same_embed_dim:
tensor = F.linear(
x=tensor,
weight=self.in_proj_weight[:, index * self.embed_dim:(index + 1)
* self.embed_dim],
bias=self.in_proj_bias[index * self.embed_dim:(index + 1) *
self.embed_dim]
if self.in_proj_bias is not None else None)
else:
tensor = getattr(self, self._type_list[index])(tensor)
tensor = tensor.reshape(
[0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3])
return tensor
def forward(self, query, key=None, value=None, attn_mask=None):
r"""
Applies multi-head attention to map queries and a set of key-value pairs
to outputs.
Parameters:
query (Tensor): The queries for multi-head attention. It is a
tensor with shape `[batch_size, query_length, embed_dim]`. The
data type should be float32 or float64.
key (Tensor, optional): The keys for multi-head attention. It is
a tensor with shape `[batch_size, key_length, kdim]`. The
data type should be float32 or float64. If None, use `query` as
`key`. Default None.
value (Tensor, optional): The values for multi-head attention. It
is a tensor with shape `[batch_size, value_length, vdim]`.
The data type should be float32 or float64. If None, use `query` as
`value`. Default None.
            attn_mask (Tensor, optional): A tensor used in multi-head attention
                to prevent attention to some unwanted positions, usually the
                paddings or the subsequent positions. It is a tensor with shape
                broadcastable to `[batch_size, n_head, sequence_length, sequence_length]`.
                When the data type is bool, the unwanted positions have `False`
                values and the others have `True` values. When the data type is
                int, the unwanted positions have 0 values and the others have 1
                values. When the data type is float, the unwanted positions have
                `-INF` values and the others have 0 values. It can be None when
                no position needs to be masked. Default None.
Returns:
Tensor|tuple: It is a tensor that has the same shape and data type \
as `query`, representing attention output. Or a tuple if \
`need_weights` is True or `cache` is not None. If `need_weights` \
is True, except for attention output, the tuple also includes \
the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \
If `cache` is not None, the tuple then includes the new cache \
having the same type as `cache`, and if it is `StaticCache`, it \
is same as the input `cache`, if it is `Cache`, the new cache \
            reserves tensors concatenating raw tensors with intermediate \
results of current query.
"""
key = query if key is None else key
value = query if value is None else value
        # compute q, k, v
        q, k, v = (self.compute_qkv(t, i)
                   for i, t in enumerate([query, key, value]))
        # scaled dot-product attention
product = paddle.matmul(x=q, y=k, transpose_y=True)
scaling = float(self.head_dim)**-0.5
product = product * scaling
if attn_mask is not None:
# Support bool or int mask
attn_mask = _convert_attention_mask(attn_mask, product.dtype)
product = product + attn_mask
weights = F.softmax(product)
if self.dropout:
weights = F.dropout(
weights,
self.dropout,
training=self.training,
mode="upscale_in_train")
out = paddle.matmul(weights, v)
# combine heads
out = paddle.transpose(out, perm=[0, 2, 1, 3])
out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
# project to output
out = self.out_proj(out)
outs = [out]
if self.need_weights:
outs.append(weights)
return out if len(outs) == 1 else tuple(outs)
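# Note: the forward above is standard scaled dot-product attention,
#     Attention(Q, K, V) = softmax(Q @ K^T / sqrt(head_dim) + attn_mask) @ V,
# evaluated per head, with the heads then re-combined and projected by
# out_proj.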
@register
class ConvMixer(nn.Layer):
def __init__(
self,
dim,
depth,
kernel_size=3, ):
super().__init__()
self.dim = dim
self.depth = depth
self.kernel_size = kernel_size
self.mixer = self.conv_mixer(dim, depth, kernel_size)
def forward(self, x):
return self.mixer(x)
@staticmethod
def conv_mixer(
dim,
depth,
kernel_size, ):
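        # ActBn wraps a conv with GELU activation and BatchNorm; the
        # dynamically created Residual class is an nn.Sequential whose forward
        # adds a skip connection. Each of the `depth` blocks is a depthwise
        # conv (with residual) followed by a pointwise 1x1 conv, as in the
        # ConvMixer reference implementation.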
Seq, ActBn = nn.Sequential, lambda x: Seq(x, nn.GELU(), nn.BatchNorm2D(dim))
Residual = type('Residual', (Seq, ),
{'forward': lambda self, x: self[0](x) + x})
return Seq(*[
Seq(Residual(
ActBn(
nn.Conv2D(
dim, dim, kernel_size, groups=dim, padding="same"))),
ActBn(nn.Conv2D(dim, dim, 1))) for i in range(depth)
])
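if __name__ == "__main__":
    # Illustrative smoke test (not part of the original file): ConvMixer is
    # shape-preserving, since the depthwise conv uses padding="same" and the
    # pointwise conv is 1x1.
    m = ConvMixer(dim=64, depth=2)
    x = paddle.rand([1, 64, 32, 32])
    print(m(x).shape)  # [1, 64, 32, 32]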
================================================
FILE: ppdet/modeling/losses/__init__.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import yolo_loss
from . import iou_aware_loss
from . import iou_loss
from . import ssd_loss
from . import fcos_loss
from . import solov2_loss
from . import ctfocal_loss
from . import keypoint_loss
from . import jde_loss
from . import fairmot_loss
from . import gfocal_loss
from . import detr_loss
from . import sparsercnn_loss
from . import focal_loss
from . import smooth_l1_loss
from . import probiou_loss
from . import cot_loss
from . import supcontrast
from . import queryinst_loss
from . import clrnet_loss
from . import clrnet_line_iou_loss
from .yolo_loss import *
from .iou_aware_loss import *
from .iou_loss import *
from .ssd_loss import *
from .fcos_loss import *
from .solov2_loss import *
from .ctfocal_loss import *
from .keypoint_loss import *
from .jde_loss import *
from .fairmot_loss import *
from .gfocal_loss import *
from .detr_loss import *
from .sparsercnn_loss import *
from .focal_loss import *
from .smooth_l1_loss import *
from .pose3d_loss import *
from .probiou_loss import *
from .cot_loss import *
from .supcontrast import *
from .queryinst_loss import *
from .clrnet_loss import *
from .clrnet_line_iou_loss import *
================================================
FILE: ppdet/modeling/losses/clrnet_line_iou_loss.py
================================================
import paddle
def line_iou(pred, target, img_w, length=15, aligned=True):
'''
Calculate the line iou value between predictions and targets
Args:
pred: lane predictions, shape: (num_pred, 72)
target: ground truth, shape: (num_target, 72)
img_w: image width
length: extended radius
aligned: True for iou loss calculation, False for pair-wise ious in assign
'''
px1 = pred - length
px2 = pred + length
tx1 = target - length
tx2 = target + length
if aligned:
invalid_mask = target
ovr = paddle.minimum(px2, tx2) - paddle.maximum(px1, tx1)
union = paddle.maximum(px2, tx2) - paddle.minimum(px1, tx1)
else:
num_pred = pred.shape[0]
invalid_mask = target.tile([num_pred, 1, 1])
ovr = (paddle.minimum(px2[:, None, :], tx2[None, ...]) - paddle.maximum(
px1[:, None, :], tx1[None, ...]))
union = (paddle.maximum(px2[:, None, :], tx2[None, ...]) -
paddle.minimum(px1[:, None, :], tx1[None, ...]))
invalid_masks = (invalid_mask < 0) | (invalid_mask >= img_w)
ovr[invalid_masks] = 0.
union[invalid_masks] = 0.
iou = ovr.sum(axis=-1) / (union.sum(axis=-1) + 1e-9)
return iou
def liou_loss(pred, target, img_w, length=15):
return (1 - line_iou(pred, target, img_w, length)).mean()
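if __name__ == "__main__":
    # Illustrative smoke test (not part of the original file): two identical
    # lanes overlap perfectly on every extended segment, so line_iou is ~1
    # and liou_loss is ~0.
    pred = paddle.full([2, 72], 100.0)
    target = paddle.full([2, 72], 100.0)
    print(line_iou(pred, target, img_w=800))   # ~[1., 1.]
    print(liou_loss(pred, target, img_w=800))  # ~0.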
================================================
FILE: ppdet/modeling/losses/clrnet_loss.py
================================================
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ppdet.modeling.clrnet_utils import accuracy
from ppdet.modeling.assigners.clrnet_assigner import assign
from ppdet.modeling.losses.clrnet_line_iou_loss import liou_loss
__all__ = ['CLRNetLoss']
class SoftmaxFocalLoss(nn.Layer):
def __init__(self, gamma, ignore_lb=255, *args, **kwargs):
super(SoftmaxFocalLoss, self).__init__()
self.gamma = gamma
self.nll = nn.NLLLoss(ignore_index=ignore_lb)
def forward(self, logits, labels):
        scores = F.softmax(logits, axis=1)
        factor = paddle.pow(1. - scores, self.gamma)
        log_score = F.log_softmax(logits, axis=1)
log_score = factor * log_score
loss = self.nll(log_score, labels)
return loss
def focal_loss(input: paddle.Tensor,
target: paddle.Tensor,
alpha: float,
gamma: float=2.0,
reduction: str='none',
eps: float=1e-8) -> paddle.Tensor:
r"""Function that computes Focal loss.
See :class:`~kornia.losses.FocalLoss` for details.
"""
    if not paddle.is_tensor(input):
        raise TypeError("Input type is not a paddle.Tensor. Got {}".format(
            type(input)))
if not len(input.shape) >= 2:
raise ValueError("Invalid input shape, we expect BxCx*. Got: {}".format(
input.shape))
if input.shape[0] != target.shape[0]:
raise ValueError(
'Expected input batch_size ({}) to match target batch_size ({}).'.
format(input.shape[0], target.shape[0]))
n = input.shape[0]
out_size = (n, ) + tuple(input.shape[2:])
if target.shape[1:] != input.shape[2:]:
raise ValueError('Expected target size {}, got {}'.format(out_size,
target.shape))
if (isinstance(input.place, paddle.CUDAPlace) and
isinstance(target.place, paddle.CPUPlace)) | (isinstance(
input.place, paddle.CPUPlace) and isinstance(target.place,
paddle.CUDAPlace)):
raise ValueError(
"input and target must be in the same device. Got: {} and {}".
format(input.place, target.place))
# compute softmax over the classes axis
input_soft: paddle.Tensor = F.softmax(input, axis=1) + eps
# create the labels one hot tensor
target_one_hot: paddle.Tensor = paddle.to_tensor(
F.one_hot(
target, num_classes=input.shape[1]).cast(input.dtype),
place=input.place)
# compute the actual focal loss
weight = paddle.pow(-input_soft + 1., gamma)
focal = -alpha * weight * paddle.log(input_soft)
loss_tmp = paddle.sum(target_one_hot * focal, axis=1)
if reduction == 'none':
loss = loss_tmp
elif reduction == 'mean':
loss = paddle.mean(loss_tmp)
elif reduction == 'sum':
loss = paddle.sum(loss_tmp)
else:
raise NotImplementedError("Invalid reduction mode: {}".format(
reduction))
return loss
class FocalLoss(nn.Layer):
r"""Criterion that computes Focal loss.
According to [1], the Focal loss is computed as follows:
.. math::
\text{FL}(p_t) = -\alpha_t (1 - p_t)^{\gamma} \, \text{log}(p_t)
where:
- :math:`p_t` is the model's estimated probability for each class.
Arguments:
alpha (float): Weighting factor :math:`\alpha \in [0, 1]`.
gamma (float): Focusing parameter :math:`\gamma >= 0`.
        reduction (str, optional): Specifies the reduction to apply to the
            output: 'none' | 'mean' | 'sum'. 'none': no reduction will be applied,
            'mean': the sum of the output will be divided by the number of elements
            in the output, 'sum': the output will be summed. Default: 'none'.
    Shape:
        - Input: :math:`(N, C, *)` where C = number of classes.
        - Target: :math:`(N, *)` where each value is
          :math:`0 \leq targets[i] \leq C-1`.
    Examples:
        >>> N = 5  # num_classes
        >>> kwargs = {"alpha": 0.5, "gamma": 2.0, "reduction": 'mean'}
        >>> loss = FocalLoss(**kwargs)
        >>> input = paddle.randn([8, N])
        >>> input.stop_gradient = False
        >>> target = paddle.randint(0, N, [8])
        >>> output = loss(input, target)
        >>> output.backward()
References:
[1] https://arxiv.org/abs/1708.02002
"""
def __init__(self, alpha: float, gamma: float=2.0,
reduction: str='none') -> None:
super(FocalLoss, self).__init__()
self.alpha: float = alpha
self.gamma: float = gamma
self.reduction: str = reduction
self.eps: float = 1e-6
def forward( # type: ignore
self, input: paddle.Tensor, target: paddle.Tensor) -> paddle.Tensor:
return focal_loss(input, target, self.alpha, self.gamma, self.reduction,
self.eps)
@register
class CLRNetLoss(nn.Layer):
__shared__ = ['img_w', 'img_h', 'num_classes', 'num_points']
def __init__(self,
cls_loss_weight=2.0,
xyt_loss_weight=0.2,
iou_loss_weight=2.0,
seg_loss_weight=1.0,
refine_layers=3,
num_points=72,
img_w=800,
img_h=320,
num_classes=5,
ignore_label=255,
bg_weight=0.4):
super(CLRNetLoss, self).__init__()
self.cls_loss_weight = cls_loss_weight
self.xyt_loss_weight = xyt_loss_weight
self.iou_loss_weight = iou_loss_weight
self.seg_loss_weight = seg_loss_weight
self.refine_layers = refine_layers
self.img_w = img_w
self.img_h = img_h
self.n_strips = num_points - 1
self.num_classes = num_classes
self.ignore_label = ignore_label
weights = paddle.ones(shape=[self.num_classes])
weights[0] = bg_weight
self.criterion = nn.NLLLoss(
ignore_index=self.ignore_label, weight=weights)
def forward(self, output, batch):
predictions_lists = output['predictions_lists']
targets = batch['lane_line'].clone()
cls_criterion = FocalLoss(alpha=0.25, gamma=2.0)
cls_loss = paddle.to_tensor(0.0)
reg_xytl_loss = paddle.to_tensor(0.0)
iou_loss = paddle.to_tensor(0.0)
cls_acc = []
cls_acc_stage = []
for stage in range(self.refine_layers):
predictions_list = predictions_lists[stage]
for predictions, target in zip(predictions_list, targets):
target = target[target[:, 1] == 1]
if len(target) == 0:
# If there are no targets, all predictions have to be negatives (i.e., 0 confidence)
cls_target = paddle.zeros(
[predictions.shape[0]], dtype='int64')
cls_pred = predictions[:, :2]
cls_loss = cls_loss + cls_criterion(cls_pred,
cls_target).sum()
continue
with paddle.no_grad():
matched_row_inds, matched_col_inds = assign(
predictions, target, self.img_w, self.img_h)
# classification targets
cls_target = paddle.zeros([predictions.shape[0]], dtype='int64')
cls_target[matched_row_inds] = 1
cls_pred = predictions[:, :2]
# regression targets -> [start_y, start_x, theta] (all transformed to absolute values), only on matched pairs
reg_yxtl = predictions.index_select(matched_row_inds)[..., 2:6]
reg_yxtl[:, 0] *= self.n_strips
reg_yxtl[:, 1] *= (self.img_w - 1)
reg_yxtl[:, 2] *= 180
reg_yxtl[:, 3] *= self.n_strips
target_yxtl = target.index_select(matched_col_inds)[..., 2:
6].clone()
# regression targets -> S coordinates (all transformed to absolute values)
reg_pred = predictions.index_select(matched_row_inds)[..., 6:]
reg_pred *= (self.img_w - 1)
reg_targets = target.index_select(matched_col_inds)[...,
6:].clone()
with paddle.no_grad():
                    predictions_starts = paddle.clip(
                        (predictions.index_select(matched_row_inds)[..., 2] *
                         self.n_strips).round().cast("int64"),
                        min=0,
                        max=self.n_strips)  # ensure the prediction start indices are valid
target_starts = (
target.index_select(matched_col_inds)[..., 2] *
self.n_strips).round().cast("int64")
target_yxtl[:, -1] -= (
predictions_starts - target_starts) # reg length
# Loss calculation
cls_loss = cls_loss + cls_criterion(
cls_pred, cls_target).sum() / target.shape[0]
target_yxtl[:, 0] *= self.n_strips
target_yxtl[:, 2] *= 180
reg_xytl_loss = reg_xytl_loss + F.smooth_l1_loss(
input=reg_yxtl, label=target_yxtl, reduction='none').mean()
iou_loss = iou_loss + liou_loss(
reg_pred, reg_targets, self.img_w, length=15)
cls_accuracy = accuracy(cls_pred, cls_target)
cls_acc_stage.append(cls_accuracy)
cls_acc.append(sum(cls_acc_stage) / (len(cls_acc_stage) + 1e-5))
# extra segmentation loss
seg_loss = self.criterion(
F.log_softmax(
output['seg'], axis=1), batch['seg'].cast('int64'))
cls_loss /= (len(targets) * self.refine_layers)
reg_xytl_loss /= (len(targets) * self.refine_layers)
iou_loss /= (len(targets) * self.refine_layers)
loss = cls_loss * self.cls_loss_weight \
+ reg_xytl_loss * self.xyt_loss_weight \
+ seg_loss * self.seg_loss_weight \
+ iou_loss * self.iou_loss_weight
return_value = {
'loss': loss,
'cls_loss': cls_loss * self.cls_loss_weight,
'reg_xytl_loss': reg_xytl_loss * self.xyt_loss_weight,
'seg_loss': seg_loss * self.seg_loss_weight,
'iou_loss': iou_loss * self.iou_loss_weight
}
for i in range(self.refine_layers):
if not isinstance(cls_acc[i], paddle.Tensor):
cls_acc[i] = paddle.to_tensor(cls_acc[i])
return_value['stage_{}_acc'.format(i)] = cls_acc[i]
return return_value
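# Note: with the default coefficients above, the total loss is
#     loss = 2.0 * cls_loss + 0.2 * reg_xytl_loss + 1.0 * seg_loss + 2.0 * iou_loss,
# where the cls / reg / iou terms are first averaged over images and refine
# stages.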
================================================
FILE: ppdet/modeling/losses/cot_loss.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import numpy as np
from ppdet.core.workspace import register
__all__ = ['COTLoss']
@register
class COTLoss(nn.Layer):
__shared__ = ['num_classes']
def __init__(self,
num_classes=80,
cot_scale=1,
cot_lambda=1):
super(COTLoss, self).__init__()
self.cot_scale = cot_scale
self.cot_lambda = cot_lambda
self.num_classes = num_classes
def forward(self, scores, targets, cot_relation):
cls_name = 'loss_bbox_cls_cot'
loss_bbox = {}
tgt_labels, tgt_bboxes, tgt_gt_inds = targets
tgt_labels = paddle.concat(tgt_labels) if len(
tgt_labels) > 1 else tgt_labels[0]
mask = (tgt_labels < self.num_classes)
valid_inds = paddle.nonzero(tgt_labels >= 0).flatten()
if valid_inds.shape[0] == 0:
loss_bbox[cls_name] = paddle.zeros([1], dtype='float32')
else:
tgt_labels = tgt_labels.cast('int64')
valid_cot_targets = []
for i in range(tgt_labels.shape[0]):
train_label = tgt_labels[i]
if train_label < self.num_classes:
valid_cot_targets.append(cot_relation[train_label])
coco_targets = paddle.to_tensor(valid_cot_targets)
coco_targets.stop_gradient = True
coco_loss = - coco_targets * F.log_softmax(scores[mask][:, :-1] * self.cot_scale)
loss_bbox[cls_name] = self.cot_lambda * paddle.mean(paddle.sum(coco_loss, axis=-1))
return loss_bbox
================================================
FILE: ppdet/modeling/losses/ctfocal_loss.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, serializable
__all__ = ['CTFocalLoss']
@register
@serializable
class CTFocalLoss(object):
"""
CTFocalLoss: CornerNet & CenterNet Focal Loss
Args:
loss_weight (float): loss weight
gamma (float): gamma parameter for Focal Loss
"""
def __init__(self, loss_weight=1., gamma=2.0):
self.loss_weight = loss_weight
self.gamma = gamma
def __call__(self, pred, target):
"""
Calculate the loss
Args:
pred (Tensor): heatmap prediction
target (Tensor): target for positive samples
Return:
ct_focal_loss (Tensor): Focal Loss used in CornerNet & CenterNet.
                Note that the values in target are in [0, 1], since a Gaussian
                is used to soften the penalty around positives, and positions
                with values in [0, 1) are treated as negative examples.
"""
fg_map = paddle.cast(target == 1, 'float32')
fg_map.stop_gradient = True
bg_map = paddle.cast(target < 1, 'float32')
bg_map.stop_gradient = True
neg_weights = paddle.pow(1 - target, 4)
pos_loss = 0 - paddle.log(pred) * paddle.pow(1 - pred,
self.gamma) * fg_map
neg_loss = 0 - paddle.log(1 - pred) * paddle.pow(
pred, self.gamma) * neg_weights * bg_map
pos_loss = paddle.sum(pos_loss)
neg_loss = paddle.sum(neg_loss)
fg_num = paddle.sum(fg_map)
ct_focal_loss = (pos_loss + neg_loss) / (
fg_num + paddle.cast(fg_num == 0, 'float32'))
return ct_focal_loss * self.loss_weight
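# Note: this is the penalty-reduced pixelwise focal loss from CornerNet /
# CenterNet: -log(p) * (1 - p)^gamma at positives (target == 1), and
# -log(1 - p) * p^gamma * (1 - target)^4 elsewhere, normalized by the number
# of positives.
if __name__ == "__main__":
    # Illustrative smoke test (not part of the original file): a confident,
    # nearly correct heatmap yields a small loss.
    pred = paddle.to_tensor([[0.9, 0.1], [0.1, 0.1]])
    target = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]])
    print(CTFocalLoss()(pred, target))  # small positive value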
================================================
FILE: ppdet/modeling/losses/detr_loss.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from .iou_loss import GIoULoss
from ..transformers import bbox_cxcywh_to_xyxy, sigmoid_focal_loss, varifocal_loss_with_logits
from ..bbox_utils import bbox_iou
__all__ = ['DETRLoss', 'DINOLoss', 'DINOv3Loss']
@register
class DETRLoss(nn.Layer):
__shared__ = ['num_classes', 'use_focal_loss']
__inject__ = ['matcher']
def __init__(self,
num_classes=80,
matcher='HungarianMatcher',
loss_coeff={
'class': 1,
'bbox': 5,
'giou': 2,
'no_object': 0.1,
'mask': 1,
'dice': 1
},
aux_loss=True,
use_focal_loss=False,
use_vfl=False,
vfl_iou_type='bbox',
use_uni_match=False,
uni_match_ind=0):
r"""
Args:
num_classes (int): The number of classes.
matcher (HungarianMatcher): It computes an assignment between the targets
and the predictions of the network.
loss_coeff (dict): The coefficient of loss.
            aux_loss (bool): If True, the losses at each decoder layer are also computed.
use_focal_loss (bool): Use focal loss or not.
"""
super(DETRLoss, self).__init__()
self.num_classes = num_classes
self.matcher = matcher
self.loss_coeff = loss_coeff
self.aux_loss = aux_loss
self.use_focal_loss = use_focal_loss
self.use_vfl = use_vfl
self.vfl_iou_type = vfl_iou_type
self.use_uni_match = use_uni_match
self.uni_match_ind = uni_match_ind
if not self.use_focal_loss:
self.loss_coeff['class'] = paddle.full([num_classes + 1],
loss_coeff['class'])
self.loss_coeff['class'][-1] = loss_coeff['no_object']
self.giou_loss = GIoULoss()
def _get_loss_class(self,
logits,
gt_class,
match_indices,
bg_index,
num_gts,
postfix="",
iou_score=None,
gt_score=None):
# logits: [b, query, num_classes], gt_class: list[[n, 1]]
name_class = "loss_class" + postfix
target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64')
bs, num_query_objects = target_label.shape
num_gt = sum(len(a) for a in gt_class)
if num_gt > 0:
index, updates = self._get_index_updates(num_query_objects,
gt_class, match_indices)
target_label = paddle.scatter(
target_label.reshape([-1, 1]), index, updates.astype('int64'))
target_label = target_label.reshape([bs, num_query_objects])
if self.use_focal_loss:
target_label = F.one_hot(target_label,
self.num_classes + 1)[..., :-1]
if iou_score is not None and self.use_vfl:
if gt_score is not None:
target_score = paddle.zeros([bs, num_query_objects])
target_score = paddle.scatter(
target_score.reshape([-1, 1]), index, gt_score)
target_score = target_score.reshape(
[bs, num_query_objects, 1]) * target_label
target_score_iou = paddle.zeros([bs, num_query_objects])
target_score_iou = paddle.scatter(
target_score_iou.reshape([-1, 1]), index, iou_score)
target_score_iou = target_score_iou.reshape(
[bs, num_query_objects, 1]) * target_label
target_score = paddle.multiply(target_score,
target_score_iou)
loss_ = self.loss_coeff[
'class'] * varifocal_loss_with_logits(
logits, target_score, target_label,
num_gts / num_query_objects)
else:
target_score = paddle.zeros([bs, num_query_objects])
if num_gt > 0:
target_score = paddle.scatter(
target_score.reshape([-1, 1]), index, iou_score)
target_score = target_score.reshape(
[bs, num_query_objects, 1]) * target_label
loss_ = self.loss_coeff[
'class'] * varifocal_loss_with_logits(
logits, target_score, target_label,
num_gts / num_query_objects)
else:
loss_ = self.loss_coeff['class'] * sigmoid_focal_loss(
logits, target_label, num_gts / num_query_objects)
else:
loss_ = F.cross_entropy(
logits, target_label, weight=self.loss_coeff['class'])
return {name_class: loss_}
def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts,
postfix=""):
# boxes: [b, query, 4], gt_bbox: list[[n, 4]]
name_bbox = "loss_bbox" + postfix
name_giou = "loss_giou" + postfix
loss = dict()
if sum(len(a) for a in gt_bbox) == 0:
loss[name_bbox] = paddle.to_tensor([0.])
loss[name_giou] = paddle.to_tensor([0.])
return loss
src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox,
match_indices)
loss[name_bbox] = self.loss_coeff['bbox'] * F.l1_loss(
src_bbox, target_bbox, reduction='sum') / num_gts
loss[name_giou] = self.giou_loss(
bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox))
loss[name_giou] = loss[name_giou].sum() / num_gts
loss[name_giou] = self.loss_coeff['giou'] * loss[name_giou]
return loss
def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,
postfix=""):
# masks: [b, query, h, w], gt_mask: list[[n, H, W]]
name_mask = "loss_mask" + postfix
name_dice = "loss_dice" + postfix
loss = dict()
if sum(len(a) for a in gt_mask) == 0:
loss[name_mask] = paddle.to_tensor([0.])
loss[name_dice] = paddle.to_tensor([0.])
return loss
src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
match_indices)
src_masks = F.interpolate(
src_masks.unsqueeze(0),
size=target_masks.shape[-2:],
mode="bilinear")[0]
loss[name_mask] = self.loss_coeff['mask'] * F.sigmoid_focal_loss(
src_masks,
target_masks,
paddle.to_tensor(
[num_gts], dtype='float32'))
loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(
src_masks, target_masks, num_gts)
return loss
def _dice_loss(self, inputs, targets, num_gts):
inputs = F.sigmoid(inputs)
inputs = inputs.flatten(1)
targets = targets.flatten(1)
numerator = 2 * (inputs * targets).sum(1)
denominator = inputs.sum(-1) + targets.sum(-1)
loss = 1 - (numerator + 1) / (denominator + 1)
return loss.sum() / num_gts
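    # Note: _dice_loss computes, per matched mask,
    #     1 - (2 * sum(x * y) + 1) / (sum(x) + sum(y) + 1)
    # on flattened sigmoid probabilities, then sums over masks and normalizes
    # by num_gts.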
def _get_loss_aux(self,
boxes,
logits,
gt_bbox,
gt_class,
bg_index,
num_gts,
dn_match_indices=None,
postfix="",
masks=None,
gt_mask=None,
gt_score=None):
loss_class = []
loss_bbox, loss_giou = [], []
loss_mask, loss_dice = [], []
if dn_match_indices is not None:
match_indices = dn_match_indices
elif self.use_uni_match:
match_indices = self.matcher(
boxes[self.uni_match_ind],
logits[self.uni_match_ind],
gt_bbox,
gt_class,
masks=masks[self.uni_match_ind] if masks is not None else None,
gt_mask=gt_mask)
for i, (aux_boxes, aux_logits) in enumerate(zip(boxes, logits)):
aux_masks = masks[i] if masks is not None else None
if not self.use_uni_match and dn_match_indices is None:
match_indices = self.matcher(
aux_boxes,
aux_logits,
gt_bbox,
gt_class,
masks=aux_masks,
gt_mask=gt_mask)
if self.use_vfl:
if sum(len(a) for a in gt_bbox) > 0:
src_bbox, target_bbox = self._get_src_target_assign(
aux_boxes.detach(), gt_bbox, match_indices)
iou_score = bbox_iou(
bbox_cxcywh_to_xyxy(src_bbox).split(4, -1),
bbox_cxcywh_to_xyxy(target_bbox).split(4, -1))
else:
iou_score = None
if gt_score is not None:
_, target_score = self._get_src_target_assign(
logits[-1].detach(), gt_score, match_indices)
else:
iou_score = None
loss_class.append(
self._get_loss_class(
aux_logits,
gt_class,
match_indices,
bg_index,
num_gts,
postfix,
iou_score,
gt_score=target_score
if gt_score is not None else None)['loss_class' + postfix])
loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices,
num_gts, postfix)
loss_bbox.append(loss_['loss_bbox' + postfix])
loss_giou.append(loss_['loss_giou' + postfix])
if masks is not None and gt_mask is not None:
loss_ = self._get_loss_mask(aux_masks, gt_mask, match_indices,
num_gts, postfix)
loss_mask.append(loss_['loss_mask' + postfix])
loss_dice.append(loss_['loss_dice' + postfix])
loss = {
"loss_class_aux" + postfix: paddle.add_n(loss_class),
"loss_bbox_aux" + postfix: paddle.add_n(loss_bbox),
"loss_giou_aux" + postfix: paddle.add_n(loss_giou)
}
if masks is not None and gt_mask is not None:
loss["loss_mask_aux" + postfix] = paddle.add_n(loss_mask)
loss["loss_dice_aux" + postfix] = paddle.add_n(loss_dice)
return loss
def _get_index_updates(self, num_query_objects, target, match_indices):
batch_idx = paddle.concat([
paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices)
])
src_idx = paddle.concat([src for (src, _) in match_indices])
src_idx += (batch_idx * num_query_objects)
target_assign = paddle.concat([
paddle.gather(
t, dst, axis=0) for t, (_, dst) in zip(target, match_indices)
])
return src_idx, target_assign
def _get_src_target_assign(self, src, target, match_indices):
src_assign = paddle.concat([
paddle.gather(
t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]])
for t, (I, _) in zip(src, match_indices)
])
target_assign = paddle.concat([
paddle.gather(
t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]])
for t, (_, J) in zip(target, match_indices)
])
return src_assign, target_assign
def _get_num_gts(self, targets, dtype="float32"):
num_gts = sum(len(a) for a in targets)
num_gts = paddle.to_tensor([num_gts], dtype=dtype)
if paddle.distributed.get_world_size() > 1:
paddle.distributed.all_reduce(num_gts)
num_gts /= paddle.distributed.get_world_size()
num_gts = paddle.clip(num_gts, min=1.)
return num_gts
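    # Note: in distributed training the GT count is all-reduced and averaged
    # across ranks, then clipped to >= 1 so it is always a safe loss
    # normalizer.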
def _get_prediction_loss(self,
boxes,
logits,
gt_bbox,
gt_class,
masks=None,
gt_mask=None,
postfix="",
dn_match_indices=None,
num_gts=1,
gt_score=None):
if dn_match_indices is None:
match_indices = self.matcher(
boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask)
else:
match_indices = dn_match_indices
if self.use_vfl:
if gt_score is not None: #ssod
_, target_score = self._get_src_target_assign(
logits[-1].detach(), gt_score, match_indices)
elif sum(len(a) for a in gt_bbox) > 0:
if self.vfl_iou_type == 'bbox':
src_bbox, target_bbox = self._get_src_target_assign(
boxes.detach(), gt_bbox, match_indices)
iou_score = bbox_iou(
bbox_cxcywh_to_xyxy(src_bbox).split(4, -1),
bbox_cxcywh_to_xyxy(target_bbox).split(4, -1))
elif self.vfl_iou_type == 'mask':
                    assert masks is not None and gt_mask is not None, \
                        'Make sure the input has `mask` and `gt_mask`'
assert sum(len(a) for a in gt_mask) > 0
src_mask, target_mask = self._get_src_target_assign(
masks.detach(), gt_mask, match_indices)
src_mask = F.interpolate(
src_mask.unsqueeze(0),
scale_factor=2,
mode='bilinear',
align_corners=False).squeeze(0)
target_mask = F.interpolate(
target_mask.unsqueeze(0),
size=src_mask.shape[-2:],
mode='bilinear',
align_corners=False).squeeze(0)
src_mask = src_mask.flatten(1)
src_mask = F.sigmoid(src_mask)
src_mask = paddle.where(
src_mask > 0.5, 1., 0.).astype(masks.dtype)
target_mask = target_mask.flatten(1)
target_mask = paddle.where(
target_mask > 0.5, 1., 0.).astype(masks.dtype)
inter = (src_mask * target_mask).sum(1)
union = src_mask.sum(1) + target_mask.sum(1) - inter
iou_score = (inter + 1e-2) / (union + 1e-2)
iou_score = iou_score.unsqueeze(-1)
else:
iou_score = None
else:
iou_score = None
else:
iou_score = None
loss = dict()
loss.update(
self._get_loss_class(
logits,
gt_class,
match_indices,
self.num_classes,
num_gts,
postfix,
iou_score,
gt_score=target_score if gt_score is not None else None))
loss.update(
self._get_loss_bbox(boxes, gt_bbox, match_indices, num_gts,
postfix))
if masks is not None and gt_mask is not None:
loss.update(
self._get_loss_mask(masks, gt_mask, match_indices, num_gts,
postfix))
return loss
def forward(self,
boxes,
logits,
gt_bbox,
gt_class,
masks=None,
gt_mask=None,
postfix="",
gt_score=None,
o2m=1,
**kwargs):
r"""
Args:
boxes (Tensor): [l, b, query, 4]
logits (Tensor): [l, b, query, num_classes]
gt_bbox (List(Tensor)): list[[n, 4]]
gt_class (List(Tensor)): list[[n, 1]]
masks (Tensor, optional): [l, b, query, h, w]
gt_mask (List(Tensor), optional): list[[n, H, W]]
postfix (str): postfix of loss name
"""
dn_match_indices = kwargs.get("dn_match_indices", None)
num_gts = kwargs.get("num_gts", None)
if num_gts is None:
num_gts = self._get_num_gts(gt_class)
total_loss = self._get_prediction_loss(
boxes[-1],
logits[-1],
gt_bbox,
gt_class,
masks=masks[-1] if masks is not None else None,
gt_mask=gt_mask,
postfix=postfix,
dn_match_indices=dn_match_indices,
num_gts=num_gts,
gt_score=gt_score if gt_score is not None else None)
if self.aux_loss:
total_loss.update(
self._get_loss_aux(
boxes[:-1],
logits[:-1],
gt_bbox,
gt_class,
self.num_classes,
num_gts,
dn_match_indices,
postfix,
masks=masks[:-1] if masks is not None else None,
gt_mask=gt_mask,
gt_score=gt_score if gt_score is not None else None))
return total_loss
@register
class DINOLoss(DETRLoss):
def forward(self,
boxes,
logits,
gt_bbox,
gt_class,
masks=None,
gt_mask=None,
postfix="",
dn_out_bboxes=None,
dn_out_logits=None,
dn_meta=None,
gt_score=None,
**kwargs):
num_gts = self._get_num_gts(gt_class)
total_loss = super(DINOLoss, self).forward(
boxes,
logits,
gt_bbox,
gt_class,
num_gts=num_gts,
gt_score=gt_score)
if dn_meta is not None:
dn_positive_idx, dn_num_group = \
dn_meta["dn_positive_idx"], dn_meta["dn_num_group"]
assert len(gt_class) == len(dn_positive_idx)
# denoising match indices
dn_match_indices = self.get_dn_match_indices(
gt_class, dn_positive_idx, dn_num_group)
# compute denoising training loss
num_gts *= dn_num_group
dn_loss = super(DINOLoss, self).forward(
dn_out_bboxes,
dn_out_logits,
gt_bbox,
gt_class,
postfix="_dn",
dn_match_indices=dn_match_indices,
num_gts=num_gts,
gt_score=gt_score)
total_loss.update(dn_loss)
else:
total_loss.update(
{k + '_dn': paddle.to_tensor([0.])
for k in total_loss.keys()})
return total_loss
@staticmethod
def get_dn_match_indices(labels, dn_positive_idx, dn_num_group):
dn_match_indices = []
for i in range(len(labels)):
num_gt = len(labels[i])
if num_gt > 0:
gt_idx = paddle.arange(end=num_gt, dtype="int64")
gt_idx = gt_idx.tile([dn_num_group])
assert len(dn_positive_idx[i]) == len(gt_idx)
dn_match_indices.append((dn_positive_idx[i], gt_idx))
else:
dn_match_indices.append((paddle.zeros(
[0], dtype="int64"), paddle.zeros(
[0], dtype="int64")))
return dn_match_indices
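# Example (illustrative): for an image with num_gt = 2 and dn_num_group = 3,
# gt_idx above becomes [0, 1, 0, 1, 0, 1], pairing every denoising group's
# positive queries with the same ground-truth objects.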
@register
class DINOv3Loss(DETRLoss):
def forward(self,
boxes,
logits,
gt_bbox,
gt_class,
masks=None,
gt_mask=None,
postfix="",
dn_out_bboxes=None,
dn_out_logits=None,
dn_meta=None,
gt_score=None,
o2m=1,
**kwargs):
if o2m != 1:
gt_boxes_copy = [box.tile([o2m, 1]) for box in gt_bbox]
gt_class_copy = [label.tile([o2m, 1]) for label in gt_class]
else:
gt_boxes_copy = gt_bbox
gt_class_copy = gt_class
num_gts_copy = self._get_num_gts(gt_class_copy)
total_loss = self._get_prediction_loss(
boxes[-1],
logits[-1],
gt_boxes_copy,
gt_class_copy,
masks=masks[-1] if masks is not None else None,
gt_mask=gt_mask,
postfix=postfix,
dn_match_indices=None,
num_gts=num_gts_copy,
gt_score=gt_score if gt_score is not None else None)
if self.aux_loss:
total_loss.update(
self._get_loss_aux(
boxes[:-1],
logits[:-1],
gt_boxes_copy,
gt_class_copy,
self.num_classes,
num_gts_copy,
dn_match_indices=None,
postfix=postfix,
masks=masks[:-1] if masks is not None else None,
gt_mask=gt_mask,
gt_score=gt_score if gt_score is not None else None))
if dn_meta is not None:
num_gts = self._get_num_gts(gt_class)
dn_positive_idx, dn_num_group = \
dn_meta["dn_positive_idx"], dn_meta["dn_num_group"]
assert len(gt_class) == len(dn_positive_idx)
# denoising match indices
dn_match_indices = self.get_dn_match_indices(
gt_class, dn_positive_idx, dn_num_group)
# compute denoising training loss
num_gts *= dn_num_group
dn_loss = super(DINOv3Loss, self).forward(
dn_out_bboxes,
dn_out_logits,
gt_bbox,
gt_class,
postfix="_dn",
dn_match_indices=dn_match_indices,
num_gts=num_gts,
gt_score=gt_score)
total_loss.update(dn_loss)
else:
total_loss.update(
{k + '_dn': paddle.to_tensor([0.])
for k in total_loss.keys()})
return total_loss
@staticmethod
def get_dn_match_indices(labels, dn_positive_idx, dn_num_group):
dn_match_indices = []
for i in range(len(labels)):
num_gt = len(labels[i])
if num_gt > 0:
gt_idx = paddle.arange(end=num_gt, dtype="int64")
gt_idx = gt_idx.tile([dn_num_group])
assert len(dn_positive_idx[i]) == len(gt_idx)
dn_match_indices.append((dn_positive_idx[i], gt_idx))
else:
dn_match_indices.append((paddle.zeros(
[0], dtype="int64"), paddle.zeros(
[0], dtype="int64")))
return dn_match_indices
@register
class MaskDINOLoss(DETRLoss):
__shared__ = ['num_classes', 'use_focal_loss', 'num_sample_points']
__inject__ = ['matcher']
def __init__(self,
num_classes=80,
matcher='HungarianMatcher',
loss_coeff={
'class': 4,
'bbox': 5,
'giou': 2,
'mask': 5,
'dice': 5
},
aux_loss=True,
use_focal_loss=False,
use_vfl=False,
vfl_iou_type='bbox',
num_sample_points=12544,
oversample_ratio=3.0,
important_sample_ratio=0.75):
super(MaskDINOLoss, self).__init__(num_classes, matcher, loss_coeff,
aux_loss, use_focal_loss, use_vfl, vfl_iou_type)
assert oversample_ratio >= 1
assert important_sample_ratio <= 1 and important_sample_ratio >= 0
self.num_sample_points = num_sample_points
self.oversample_ratio = oversample_ratio
self.important_sample_ratio = important_sample_ratio
self.num_oversample_points = int(num_sample_points * oversample_ratio)
self.num_important_points = int(num_sample_points *
important_sample_ratio)
self.num_random_points = num_sample_points - self.num_important_points
def forward(self,
boxes,
logits,
gt_bbox,
gt_class,
masks=None,
gt_mask=None,
postfix="",
dn_out_bboxes=None,
dn_out_logits=None,
dn_out_masks=None,
dn_meta=None,
**kwargs):
num_gts = self._get_num_gts(gt_class)
total_loss = super(MaskDINOLoss, self).forward(
boxes,
logits,
gt_bbox,
gt_class,
masks=masks,
gt_mask=gt_mask,
num_gts=num_gts)
if dn_meta is not None:
dn_positive_idx, dn_num_group = \
dn_meta["dn_positive_idx"], dn_meta["dn_num_group"]
assert len(gt_class) == len(dn_positive_idx)
# denoising match indices
dn_match_indices = DINOLoss.get_dn_match_indices(
gt_class, dn_positive_idx, dn_num_group)
# compute denoising training loss
num_gts *= dn_num_group
dn_loss = super(MaskDINOLoss, self).forward(
dn_out_bboxes,
dn_out_logits,
gt_bbox,
gt_class,
masks=dn_out_masks,
gt_mask=gt_mask,
postfix="_dn",
dn_match_indices=dn_match_indices,
num_gts=num_gts)
total_loss.update(dn_loss)
else:
total_loss.update(
{k + '_dn': paddle.to_tensor([0.])
for k in total_loss.keys()})
return total_loss
def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,
postfix=""):
# masks: [b, query, h, w], gt_mask: list[[n, H, W]]
name_mask = "loss_mask" + postfix
name_dice = "loss_dice" + postfix
loss = dict()
if sum(len(a) for a in gt_mask) == 0:
loss[name_mask] = paddle.to_tensor([0.])
loss[name_dice] = paddle.to_tensor([0.])
return loss
src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
match_indices)
# sample points
sample_points = self._get_point_coords_by_uncertainty(src_masks)
sample_points = 2.0 * sample_points.unsqueeze(1) - 1.0
src_masks = F.grid_sample(
src_masks.unsqueeze(1), sample_points,
align_corners=False).squeeze([1, 2])
target_masks = F.grid_sample(
target_masks.unsqueeze(1), sample_points,
align_corners=False).squeeze([1, 2]).detach()
loss[name_mask] = self.loss_coeff[
'mask'] * F.binary_cross_entropy_with_logits(
src_masks, target_masks,
reduction='none').mean(1).sum() / num_gts
loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(
src_masks, target_masks, num_gts)
return loss
def _get_point_coords_by_uncertainty(self, masks):
# Sample points based on their uncertainty.
masks = masks.detach()
num_masks = masks.shape[0]
sample_points = paddle.rand(
[num_masks, 1, self.num_oversample_points, 2])
out_mask = F.grid_sample(
masks.unsqueeze(1), 2.0 * sample_points - 1.0,
align_corners=False).squeeze([1, 2])
out_mask = -paddle.abs(out_mask)
_, topk_ind = paddle.topk(out_mask, self.num_important_points, axis=1)
batch_ind = paddle.arange(end=num_masks, dtype=topk_ind.dtype)
batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_important_points])
topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)
sample_points = paddle.gather_nd(sample_points.squeeze(1), topk_ind)
if self.num_random_points > 0:
sample_points = paddle.concat(
[
sample_points,
paddle.rand([num_masks, self.num_random_points, 2])
],
axis=1)
return sample_points
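# Note: _get_point_coords_by_uncertainty follows PointRend-style importance
# sampling: draw num_sample_points * oversample_ratio random points, keep the
# num_important_points whose mask logits are closest to 0 (most uncertain),
# and top up with num_random_points fresh random points.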
================================================
FILE: ppdet/modeling/losses/fairmot_loss.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
from paddle.nn.initializer import Constant
from ppdet.core.workspace import register
__all__ = ['FairMOTLoss']
@register
class FairMOTLoss(nn.Layer):
def __init__(self):
super(FairMOTLoss, self).__init__()
self.det_weight = self.create_parameter(
shape=[1], default_initializer=Constant(-1.85))
self.reid_weight = self.create_parameter(
shape=[1], default_initializer=Constant(-1.05))
def forward(self, det_loss, reid_loss):
loss = paddle.exp(-self.det_weight) * det_loss + paddle.exp(
-self.reid_weight) * reid_loss + (self.det_weight + self.reid_weight
)
loss *= 0.5
return {'loss': loss}
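# Note: the forward above is the learnable uncertainty weighting used by
# FairMOT: loss = 0.5 * (exp(-w_det) * det_loss + exp(-w_id) * reid_loss
# + w_det + w_id), where w_det and w_id are trainable parameters.
if __name__ == "__main__":
    # Illustrative smoke test (not part of the original file): with the
    # default initial weights (-1.85 and -1.05), det_loss = reid_loss = 1
    # gives 0.5 * (e^1.85 + e^1.05 - 2.9), roughly 3.16.
    m = FairMOTLoss()
    print(m(paddle.to_tensor(1.0), paddle.to_tensor(1.0))['loss'])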
================================================
FILE: ppdet/modeling/losses/fcos_loss.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ppdet.modeling import ops
from functools import partial
__all__ = ['FCOSLoss', 'FCOSLossMILC', 'FCOSLossCR']
def flatten_tensor(inputs, channel_first=False):
"""
Flatten a Tensor
Args:
inputs (Tensor): 4-D Tensor with shape [N, C, H, W] or [N, H, W, C]
channel_first (bool): If true the dimension order of Tensor is
[N, C, H, W], otherwise is [N, H, W, C]
Return:
output_channel_last (Tensor): The flattened Tensor in channel_last style
"""
if channel_first:
input_channel_last = paddle.transpose(inputs, perm=[0, 2, 3, 1])
else:
input_channel_last = inputs
output_channel_last = paddle.flatten(
input_channel_last, start_axis=0, stop_axis=2)
return output_channel_last
@register
class FCOSLoss(nn.Layer):
"""
FCOSLoss
Args:
loss_alpha (float): alpha in focal loss
loss_gamma (float): gamma in focal loss
iou_loss_type (str): location loss type, IoU/GIoU/LINEAR_IoU
reg_weights (float): weight for location loss
quality (str): quality branch, centerness/iou
"""
def __init__(self,
loss_alpha=0.25,
loss_gamma=2.0,
iou_loss_type="giou",
reg_weights=1.0,
quality='centerness'):
super(FCOSLoss, self).__init__()
self.loss_alpha = loss_alpha
self.loss_gamma = loss_gamma
self.iou_loss_type = iou_loss_type
self.reg_weights = reg_weights
self.quality = quality
def _iou_loss(self,
pred,
targets,
positive_mask,
weights=None,
return_iou=False):
"""
Calculate the loss for location prediction
Args:
pred (Tensor): bounding boxes prediction
targets (Tensor): targets for positive samples
positive_mask (Tensor): mask of positive samples
weights (Tensor): weights for each positive samples
Return:
loss (Tensor): location loss
"""
plw = pred[:, 0] * positive_mask
pth = pred[:, 1] * positive_mask
prw = pred[:, 2] * positive_mask
pbh = pred[:, 3] * positive_mask
tlw = targets[:, 0] * positive_mask
tth = targets[:, 1] * positive_mask
trw = targets[:, 2] * positive_mask
tbh = targets[:, 3] * positive_mask
tlw.stop_gradient = True
trw.stop_gradient = True
tth.stop_gradient = True
tbh.stop_gradient = True
ilw = paddle.minimum(plw, tlw)
irw = paddle.minimum(prw, trw)
ith = paddle.minimum(pth, tth)
ibh = paddle.minimum(pbh, tbh)
clw = paddle.maximum(plw, tlw)
crw = paddle.maximum(prw, trw)
cth = paddle.maximum(pth, tth)
cbh = paddle.maximum(pbh, tbh)
area_predict = (plw + prw) * (pth + pbh)
area_target = (tlw + trw) * (tth + tbh)
area_inter = (ilw + irw) * (ith + ibh)
ious = (area_inter + 1.0) / (
area_predict + area_target - area_inter + 1.0)
ious = ious * positive_mask
if return_iou:
return ious
if self.iou_loss_type.lower() == "linear_iou":
loss = 1.0 - ious
elif self.iou_loss_type.lower() == "giou":
area_uniou = area_predict + area_target - area_inter
area_circum = (clw + crw) * (cth + cbh) + 1e-7
giou = ious - (area_circum - area_uniou) / area_circum
loss = 1.0 - giou
elif self.iou_loss_type.lower() == "iou":
loss = 0.0 - paddle.log(ious)
else:
raise KeyError
if weights is not None:
loss = loss * weights
return loss
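    # Note: for "giou" the loss above is 1 - (IoU - (C - U) / C), where U is
    # the union area and C the area of the smallest enclosing box; the +1.0
    # terms smooth the IoU and 1e-7 stabilizes the division by C.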
def forward(self, cls_logits, bboxes_reg, centerness, tag_labels,
tag_bboxes, tag_center):
"""
Calculate the loss for classification, location and centerness
Args:
cls_logits (list): list of Tensor, which is predicted
score for all anchor points with shape [N, M, C]
bboxes_reg (list): list of Tensor, which is predicted
offsets for all anchor points with shape [N, M, 4]
centerness (list): list of Tensor, which is predicted
centerness for all anchor points with shape [N, M, 1]
tag_labels (list): list of Tensor, which is category
targets for each anchor point
tag_bboxes (list): list of Tensor, which is bounding
boxes targets for positive samples
tag_center (list): list of Tensor, which is centerness
targets for positive samples
Return:
loss (dict): loss composed by classification loss, bounding box
"""
cls_logits_flatten_list = []
bboxes_reg_flatten_list = []
centerness_flatten_list = []
tag_labels_flatten_list = []
tag_bboxes_flatten_list = []
tag_center_flatten_list = []
num_lvl = len(cls_logits)
for lvl in range(num_lvl):
cls_logits_flatten_list.append(
flatten_tensor(cls_logits[lvl], True))
bboxes_reg_flatten_list.append(
flatten_tensor(bboxes_reg[lvl], True))
centerness_flatten_list.append(
flatten_tensor(centerness[lvl], True))
tag_labels_flatten_list.append(
flatten_tensor(tag_labels[lvl], False))
tag_bboxes_flatten_list.append(
flatten_tensor(tag_bboxes[lvl], False))
tag_center_flatten_list.append(
flatten_tensor(tag_center[lvl], False))
cls_logits_flatten = paddle.concat(cls_logits_flatten_list, axis=0)
bboxes_reg_flatten = paddle.concat(bboxes_reg_flatten_list, axis=0)
centerness_flatten = paddle.concat(centerness_flatten_list, axis=0)
tag_labels_flatten = paddle.concat(tag_labels_flatten_list, axis=0)
tag_bboxes_flatten = paddle.concat(tag_bboxes_flatten_list, axis=0)
tag_center_flatten = paddle.concat(tag_center_flatten_list, axis=0)
tag_labels_flatten.stop_gradient = True
tag_bboxes_flatten.stop_gradient = True
tag_center_flatten.stop_gradient = True
mask_positive_bool = tag_labels_flatten > 0
mask_positive_bool.stop_gradient = True
mask_positive_float = paddle.cast(mask_positive_bool, dtype="float32")
mask_positive_float.stop_gradient = True
num_positive_fp32 = paddle.sum(mask_positive_float)
num_positive_fp32.stop_gradient = True
num_positive_int32 = paddle.cast(num_positive_fp32, dtype="int32")
num_positive_int32 = num_positive_int32 * 0 + 1
num_positive_int32.stop_gradient = True
normalize_sum = paddle.sum(tag_center_flatten * mask_positive_float)
normalize_sum.stop_gradient = True
# 1. cls_logits: sigmoid_focal_loss
# expand onehot labels
num_classes = cls_logits_flatten.shape[-1]
tag_labels_flatten = paddle.squeeze(tag_labels_flatten, axis=-1)
tag_labels_flatten_bin = F.one_hot(
tag_labels_flatten, num_classes=1 + num_classes)
tag_labels_flatten_bin = tag_labels_flatten_bin[:, 1:]
# sigmoid_focal_loss
cls_loss = F.sigmoid_focal_loss(
cls_logits_flatten, tag_labels_flatten_bin) / num_positive_fp32
if self.quality == 'centerness':
# 2. bboxes_reg: giou_loss
mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1)
tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1)
reg_loss = self._iou_loss(
bboxes_reg_flatten,
tag_bboxes_flatten,
mask_positive_float,
weights=tag_center_flatten)
reg_loss = reg_loss * mask_positive_float / normalize_sum
# 3. centerness: sigmoid_cross_entropy_with_logits_loss
centerness_flatten = paddle.squeeze(centerness_flatten, axis=-1)
quality_loss = ops.sigmoid_cross_entropy_with_logits(
centerness_flatten, tag_center_flatten)
quality_loss = quality_loss * mask_positive_float / num_positive_fp32
elif self.quality == 'iou':
# 2. bboxes_reg: giou_loss
mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1)
tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1)
reg_loss = self._iou_loss(
bboxes_reg_flatten,
tag_bboxes_flatten,
mask_positive_float,
weights=None)
reg_loss = reg_loss * mask_positive_float / num_positive_fp32
# num_positive_fp32 is num_foreground
# 3. centerness: sigmoid_cross_entropy_with_logits_loss
centerness_flatten = paddle.squeeze(centerness_flatten, axis=-1)
gt_ious = self._iou_loss(
bboxes_reg_flatten,
tag_bboxes_flatten,
mask_positive_float,
weights=None,
return_iou=True)
quality_loss = ops.sigmoid_cross_entropy_with_logits(
centerness_flatten, gt_ious)
quality_loss = quality_loss * mask_positive_float / num_positive_fp32
else:
raise Exception(f'Unknown quality type: {self.quality}')
loss_all = {
"loss_cls": paddle.sum(cls_loss),
"loss_box": paddle.sum(reg_loss),
"loss_quality": paddle.sum(quality_loss),
}
return loss_all
@register
class FCOSLossMILC(FCOSLoss):
"""
FCOSLossMILC for ARSL in semi-det(ssod)
Args:
loss_alpha (float): alpha in focal loss
loss_gamma (float): gamma in focal loss
iou_loss_type (str): location loss type, IoU/GIoU/LINEAR_IoU
reg_weights (float): weight for location loss
"""
def __init__(self,
loss_alpha=0.25,
loss_gamma=2.0,
iou_loss_type="giou",
reg_weights=1.0):
super(FCOSLossMILC, self).__init__()
self.loss_alpha = loss_alpha
self.loss_gamma = loss_gamma
self.iou_loss_type = iou_loss_type
self.reg_weights = reg_weights
def iou_loss(self, pred, targets, weights=None, avg_factor=None):
"""
Calculate the loss for location prediction
Args:
pred (Tensor): bounding boxes prediction
targets (Tensor): targets for positive samples
weights (Tensor): weights for each positive samples
Return:
loss (Tensor): location loss
"""
plw = pred[:, 0]
pth = pred[:, 1]
prw = pred[:, 2]
pbh = pred[:, 3]
tlw = targets[:, 0]
tth = targets[:, 1]
trw = targets[:, 2]
tbh = targets[:, 3]
tlw.stop_gradient = True
trw.stop_gradient = True
tth.stop_gradient = True
tbh.stop_gradient = True
ilw = paddle.minimum(plw, tlw)
irw = paddle.minimum(prw, trw)
ith = paddle.minimum(pth, tth)
ibh = paddle.minimum(pbh, tbh)
clw = paddle.maximum(plw, tlw)
crw = paddle.maximum(prw, trw)
cth = paddle.maximum(pth, tth)
cbh = paddle.maximum(pbh, tbh)
area_predict = (plw + prw) * (pth + pbh)
area_target = (tlw + trw) * (tth + tbh)
area_inter = (ilw + irw) * (ith + ibh)
ious = (area_inter + 1.0) / (
area_predict + area_target - area_inter + 1.0)
if self.iou_loss_type.lower() == "linear_iou":
loss = 1.0 - ious
elif self.iou_loss_type.lower() == "giou":
area_uniou = area_predict + area_target - area_inter
area_circum = (clw + crw) * (cth + cbh) + 1e-7
giou = ious - (area_circum - area_uniou) / area_circum
loss = 1.0 - giou
elif self.iou_loss_type.lower() == "iou":
loss = 0.0 - paddle.log(ious)
else:
raise KeyError
if weights is not None:
loss = loss * weights
loss = paddle.sum(loss)
if avg_factor is not None:
loss = loss / avg_factor
return loss
    # temp function: calculate IoU between bbox and target
def _bbox_overlap_align(self, pred, targets):
assert pred.shape[0] == targets.shape[0], \
'the pred should be aligned with target.'
plw = pred[:, 0]
pth = pred[:, 1]
prw = pred[:, 2]
pbh = pred[:, 3]
tlw = targets[:, 0]
tth = targets[:, 1]
trw = targets[:, 2]
tbh = targets[:, 3]
ilw = paddle.minimum(plw, tlw)
irw = paddle.minimum(prw, trw)
ith = paddle.minimum(pth, tth)
ibh = paddle.minimum(pbh, tbh)
area_predict = (plw + prw) * (pth + pbh)
area_target = (tlw + trw) * (tth + tbh)
area_inter = (ilw + irw) * (ith + ibh)
ious = (area_inter + 1.0) / (
area_predict + area_target - area_inter + 1.0)
return ious
def iou_based_soft_label_loss(self,
pred,
target,
alpha=0.75,
gamma=2.0,
iou_weighted=False,
implicit_iou=None,
avg_factor=None):
assert pred.shape == target.shape
pred = F.sigmoid(pred)
target = target.cast(pred.dtype)
if implicit_iou is not None:
pred = pred * implicit_iou
if iou_weighted:
focal_weight = (pred - target).abs().pow(gamma) * target * (target > 0.0).cast('float32') + \
alpha * (pred - target).abs().pow(gamma) * \
(target <= 0.0).cast('float32')
else:
focal_weight = (pred - target).abs().pow(gamma) * (target > 0.0).cast('float32') + \
alpha * (pred - target).abs().pow(gamma) * \
(target <= 0.0).cast('float32')
# focal loss
loss = F.binary_cross_entropy(
pred, target, reduction='none') * focal_weight
if avg_factor is not None:
loss = loss / avg_factor
return loss
def forward(self, cls_logits, bboxes_reg, centerness, tag_labels,
tag_bboxes, tag_center):
"""
Calculate the loss for classification, location and centerness
Args:
cls_logits (list): list of Tensor, which is predicted
score for all anchor points with shape [N, M, C]
bboxes_reg (list): list of Tensor, which is predicted
offsets for all anchor points with shape [N, M, 4]
centerness (list): list of Tensor, which is predicted
centerness for all anchor points with shape [N, M, 1]
tag_labels (list): list of Tensor, which is category
targets for each anchor point
tag_bboxes (list): list of Tensor, which is bounding
boxes targets for positive samples
tag_center (list): list of Tensor, which is centerness
targets for positive samples
Return:
loss (dict): loss composed by classification loss, bounding box
"""
cls_logits_flatten_list = []
bboxes_reg_flatten_list = []
centerness_flatten_list = []
tag_labels_flatten_list = []
tag_bboxes_flatten_list = []
tag_center_flatten_list = []
num_lvl = len(cls_logits)
for lvl in range(num_lvl):
cls_logits_flatten_list.append(
flatten_tensor(cls_logits[lvl], True))
bboxes_reg_flatten_list.append(
flatten_tensor(bboxes_reg[lvl], True))
centerness_flatten_list.append(
flatten_tensor(centerness[lvl], True))
tag_labels_flatten_list.append(
flatten_tensor(tag_labels[lvl], False))
tag_bboxes_flatten_list.append(
flatten_tensor(tag_bboxes[lvl], False))
tag_center_flatten_list.append(
flatten_tensor(tag_center[lvl], False))
cls_logits_flatten = paddle.concat(cls_logits_flatten_list, axis=0)
bboxes_reg_flatten = paddle.concat(bboxes_reg_flatten_list, axis=0)
centerness_flatten = paddle.concat(centerness_flatten_list, axis=0)
tag_labels_flatten = paddle.concat(tag_labels_flatten_list, axis=0)
tag_bboxes_flatten = paddle.concat(tag_bboxes_flatten_list, axis=0)
tag_center_flatten = paddle.concat(tag_center_flatten_list, axis=0)
tag_labels_flatten.stop_gradient = True
tag_bboxes_flatten.stop_gradient = True
tag_center_flatten.stop_gradient = True
# find positive index
mask_positive_bool = tag_labels_flatten > 0
mask_positive_bool.stop_gradient = True
mask_positive_float = paddle.cast(mask_positive_bool, dtype="float32")
mask_positive_float.stop_gradient = True
num_positive_fp32 = paddle.sum(mask_positive_float)
num_positive_fp32.stop_gradient = True
num_positive_int32 = paddle.cast(num_positive_fp32, dtype="int32")
num_positive_int32 = num_positive_int32 * 0 + 1
num_positive_int32.stop_gradient = True
# centerness target is used as reg weight
normalize_sum = paddle.sum(tag_center_flatten * mask_positive_float)
normalize_sum.stop_gradient = True
# 1. IoU-Based soft label loss
# calculate iou
with paddle.no_grad():
pos_ind = paddle.nonzero(
tag_labels_flatten.reshape([-1]) > 0).reshape([-1])
pos_pred = bboxes_reg_flatten[pos_ind]
pos_target = tag_bboxes_flatten[pos_ind]
bbox_iou = self._bbox_overlap_align(pos_pred, pos_target)
# pos labels
pos_labels = tag_labels_flatten[pos_ind].squeeze(1)
cls_target = paddle.zeros(cls_logits_flatten.shape)
cls_target[pos_ind, pos_labels - 1] = bbox_iou
cls_loss = self.iou_based_soft_label_loss(
cls_logits_flatten,
cls_target,
implicit_iou=F.sigmoid(centerness_flatten),
avg_factor=num_positive_fp32)
# 2. bboxes_reg: giou_loss
mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1)
tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1)
reg_loss = self._iou_loss(
bboxes_reg_flatten,
tag_bboxes_flatten,
mask_positive_float,
weights=tag_center_flatten)
reg_loss = reg_loss * mask_positive_float / normalize_sum
# 3. iou loss
pos_iou_pred = paddle.squeeze(centerness_flatten, axis=-1)[pos_ind]
loss_iou = ops.sigmoid_cross_entropy_with_logits(pos_iou_pred, bbox_iou)
loss_iou = loss_iou / num_positive_fp32 * 0.5
loss_all = {
"loss_cls": paddle.sum(cls_loss),
"loss_box": paddle.sum(reg_loss),
'loss_iou': paddle.sum(loss_iou),
}
return loss_all
# Concat multi-level feature maps by image
def levels_to_images(mlvl_tensor):
batch_size = mlvl_tensor[0].shape[0]
batch_list = [[] for _ in range(batch_size)]
channels = mlvl_tensor[0].shape[1]
for t in mlvl_tensor:
t = t.transpose([0, 2, 3, 1])
t = t.reshape([batch_size, -1, channels])
for img in range(batch_size):
batch_list[img].append(t[img])
return [paddle.concat(item, axis=0) for item in batch_list]
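# Example (illustrative): given two FPN levels shaped [N, C, H1, W1] and
# [N, C, H2, W2], levels_to_images returns N tensors, each of shape
# [H1 * W1 + H2 * W2, C].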
def multi_apply(func, *args, **kwargs):
"""Apply function to a list of arguments.
Note:
This function applies the ``func`` to multiple inputs and
map the multiple outputs of the ``func`` into different
list. Each list contains the same type of outputs corresponding
to different inputs.
Args:
func (Function): A function that will be applied to a list of
arguments
Returns:
tuple(list): A tuple containing multiple list, each list contains \
a kind of returned results by the function
"""
pfunc = partial(func, **kwargs) if kwargs else func
map_results = map(pfunc, *args)
return tuple(map(list, zip(*map_results)))
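# Example (illustrative):
#   def add_and_mul(a, b):
#       return a + b, a * b
#   sums, prods = multi_apply(add_and_mul, [1, 2], [3, 4])
#   # sums == [4, 6], prods == [3, 8]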
@register
class FCOSLossCR(FCOSLossMILC):
"""
FCOSLoss of Consistency Regularization
"""
def __init__(self,
iou_loss_type="giou",
cls_weight=2.0,
reg_weight=2.0,
iou_weight=0.5,
hard_neg_mining_flag=True):
super(FCOSLossCR, self).__init__()
self.iou_loss_type = iou_loss_type
self.cls_weight = cls_weight
self.reg_weight = reg_weight
self.iou_weight = iou_weight
self.hard_neg_mining_flag = hard_neg_mining_flag
def iou_loss(self, pred, targets, weights=None, avg_factor=None):
"""
Calculate the loss for location prediction
Args:
pred (Tensor): bounding boxes prediction
targets (Tensor): targets for positive samples
weights (Tensor): weights for each positive samples
Return:
loss (Tensor): location loss
"""
plw = pred[:, 0]
pth = pred[:, 1]
prw = pred[:, 2]
pbh = pred[:, 3]
tlw = targets[:, 0]
tth = targets[:, 1]
trw = targets[:, 2]
tbh = targets[:, 3]
tlw.stop_gradient = True
trw.stop_gradient = True
tth.stop_gradient = True
tbh.stop_gradient = True
ilw = paddle.minimum(plw, tlw)
irw = paddle.minimum(prw, trw)
ith = paddle.minimum(pth, tth)
ibh = paddle.minimum(pbh, tbh)
clw = paddle.maximum(plw, tlw)
crw = paddle.maximum(prw, trw)
cth = paddle.maximum(pth, tth)
cbh = paddle.maximum(pbh, tbh)
area_predict = (plw + prw) * (pth + pbh)
area_target = (tlw + trw) * (tth + tbh)
area_inter = (ilw + irw) * (ith + ibh)
ious = (area_inter + 1.0) / (
area_predict + area_target - area_inter + 1.0)
if self.iou_loss_type.lower() == "linear_iou":
loss = 1.0 - ious
elif self.iou_loss_type.lower() == "giou":
area_uniou = area_predict + area_target - area_inter
area_circum = (clw + crw) * (cth + cbh) + 1e-7
giou = ious - (area_circum - area_uniou) / area_circum
loss = 1.0 - giou
elif self.iou_loss_type.lower() == "iou":
loss = 0.0 - paddle.log(ious)
else:
            raise KeyError("Unknown iou_loss_type: {}".format(
                self.iou_loss_type))
if weights is not None:
loss = loss * weights
loss = paddle.sum(loss)
if avg_factor is not None:
loss = loss / avg_factor
return loss
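    # Worked example (editor's note): if a prediction matches its ltrb
    # target exactly, area_inter == area_predict == area_target, so
    # ious == 1 and giou == 1, giving zero loss; any mismatch grows the
    # enclosing box (clw + crw) * (cth + cbh), so giou < iou and the
    # "giou" loss exceeds the "linear_iou" loss.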
    # calculate IoU between aligned predictions and targets
def bbox_overlap_align(self, pred, targets):
assert pred.shape[0] == targets.shape[0], \
'the pred should be aligned with target.'
plw = pred[:, 0]
pth = pred[:, 1]
prw = pred[:, 2]
pbh = pred[:, 3]
tlw = targets[:, 0]
tth = targets[:, 1]
trw = targets[:, 2]
tbh = targets[:, 3]
ilw = paddle.minimum(plw, tlw)
irw = paddle.minimum(prw, trw)
ith = paddle.minimum(pth, tth)
ibh = paddle.minimum(pbh, tbh)
area_predict = (plw + prw) * (pth + pbh)
area_target = (tlw + trw) * (tth + tbh)
area_inter = (ilw + irw) * (ith + ibh)
ious = (area_inter + 1.0) / (
area_predict + area_target - area_inter + 1.0)
return ious
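    # Editor's note: the +1.0 in both numerator and denominator smooths the
    # aligned IoU (degenerate zero-area pairs give 1 instead of 0/0), at
    # the cost of slightly overestimating the true IoU for small boxes.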
    # cls loss: iou-based soft label with joint iou
def quality_focal_loss(self,
stu_cls,
targets,
quality=None,
weights=None,
alpha=0.75,
gamma=2.0,
avg_factor='sum'):
stu_cls = F.sigmoid(stu_cls)
if quality is not None:
stu_cls = stu_cls * F.sigmoid(quality)
focal_weight = (stu_cls - targets).abs().pow(gamma) * (targets > 0.0).cast('float32') + \
alpha * (stu_cls - targets).abs().pow(gamma) * \
(targets <= 0.0).cast('float32')
loss = F.binary_cross_entropy(
stu_cls, targets, reduction='none') * focal_weight
if weights is not None:
loss = loss * weights.reshape([-1, 1])
loss = paddle.sum(loss)
if avg_factor is not None:
loss = loss / avg_factor
return loss
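    # Editor's note: this is a soft-label (quality) focal loss. With joint
    # score s = sigmoid(cls) * sigmoid(quality) and soft target t, the
    # modulating weight is |s - t|^gamma where t > 0 and
    # alpha * |s - t|^gamma elsewhere, applied to the plain BCE(s, t).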
# generate points according to feature maps
def compute_locations_by_level(self, fpn_stride, h, w):
"""
        Compute the locations of the anchor points for one FPN level
Return:
Anchor points locations of current FPN feature map
"""
shift_x = paddle.arange(0, w * fpn_stride, fpn_stride)
shift_y = paddle.arange(0, h * fpn_stride, fpn_stride)
shift_x = paddle.unsqueeze(shift_x, axis=0)
shift_y = paddle.unsqueeze(shift_y, axis=1)
shift_x = paddle.expand(shift_x, shape=[h, w])
shift_y = paddle.expand(shift_y, shape=[h, w])
shift_x = paddle.reshape(shift_x, shape=[-1])
shift_y = paddle.reshape(shift_y, shape=[-1])
location = paddle.stack(
[shift_x, shift_y], axis=-1) + float(fpn_stride) / 2
return location
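    # Worked example (editor's note): fpn_stride=8 with a 2x2 map yields
    # the four cell centers [[4., 4.], [12., 4.], [4., 12.], [12., 12.]],
    # i.e. every anchor point sits at a stride/2 offset inside its cell.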
# decode bbox from ltrb to x1y1x2y2
def decode_bbox(self, ltrb, points):
assert ltrb.shape[0] == points.shape[0], \
"When decoding bbox in one image, the num of loc should be same with points."
bbox_decoding = paddle.stack(
[
points[:, 0] - ltrb[:, 0], points[:, 1] - ltrb[:, 1],
points[:, 0] + ltrb[:, 2], points[:, 1] + ltrb[:, 3]
],
axis=1)
return bbox_decoding
# encode bbox from x1y1x2y2 to ltrb
def encode_bbox(self, bbox, points):
assert bbox.shape[0] == points.shape[0], \
"When encoding bbox in one image, the num of bbox should be same with points."
bbox_encoding = paddle.stack(
[
points[:, 0] - bbox[:, 0], points[:, 1] - bbox[:, 1],
bbox[:, 2] - points[:, 0], bbox[:, 3] - points[:, 1]
],
axis=1)
return bbox_encoding
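    # Round-trip sketch (editor's note, hypothetical values): decode_bbox
    # and encode_bbox are inverses for points inside the box, e.g.
    #   pts  = paddle.to_tensor([[10., 10.]])
    #   ltrb = paddle.to_tensor([[3., 2., 5., 6.]])
    #   box  = self.decode_bbox(ltrb, pts)    # [[7., 8., 15., 16.]]
    #   back = self.encode_bbox(box, pts)     # recovers ltrb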
    def calculate_iou(self, gt_bbox, predict_bbox):
# bbox area
gt_area = (gt_bbox[:, 2] - gt_bbox[:, 0]) * \
(gt_bbox[:, 3] - gt_bbox[:, 1])
predict_area = (predict_bbox[:, 2] - predict_bbox[:, 0]) * \
(predict_bbox[:, 3] - predict_bbox[:, 1])
        # overlap area
lt = paddle.fmax(gt_bbox[:, None, :2], predict_bbox[None, :, :2])
rb = paddle.fmin(gt_bbox[:, None, 2:], predict_bbox[None, :, 2:])
wh = paddle.clip(rb - lt, min=0)
overlap = wh[..., 0] * wh[..., 1]
# iou
iou = overlap / (gt_area[:, None] + predict_area[None, :] - overlap)
return iou
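    # Shape sketch (editor's note, hypothetical boxes): gt_bbox [M, 4] and
    # predict_bbox [N, 4] give a pairwise [M, N] IoU matrix, e.g.
    #   gt = paddle.to_tensor([[0., 0., 10., 10.]])
    #   pr = paddle.to_tensor([[0., 0., 10., 10.], [0., 0., 5., 10.]])
    #   self.calculate_iou(gt, pr)            # [[1.0, 0.5]]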
# select potential positives from hard negatives
def hard_neg_mining(self,
cls_score,
loc_ltrb,
quality,
pos_ind,
hard_neg_ind,
loc_mask,
loc_targets,
iou_thresh=0.6):
# get points locations and strides
points_list = []
strides_list = []
scale_list = []
scale = [0, 1, 2, 3, 4]
for fpn_scale, fpn_stride, HW in zip(scale, self.fpn_stride,
self.lvl_hw):
h, w = HW
lvl_points = self.compute_locations_by_level(fpn_stride, h, w)
points_list.append(lvl_points)
lvl_strides = paddle.full([h * w, 1], fpn_stride)
strides_list.append(lvl_strides)
lvl_scales = paddle.full([h * w, 1], fpn_scale)
scale_list.append(lvl_scales)
points = paddle.concat(points_list, axis=0)
strides = paddle.concat(strides_list, axis=0)
scales = paddle.concat(scale_list, axis=0)
# cls scores
cls_vals = F.sigmoid(cls_score) * F.sigmoid(quality)
max_vals = paddle.max(cls_vals, axis=-1)
class_ind = paddle.argmax(cls_vals, axis=-1)
        ### calculate IoU between positives and hard negatives
# decode pos bbox
pos_cls = max_vals[pos_ind]
pos_loc = loc_ltrb[pos_ind].reshape([-1, 4])
pos_strides = strides[pos_ind]
pos_points = points[pos_ind].reshape([-1, 2])
pos_loc = pos_loc * pos_strides
pos_bbox = self.decode_bbox(pos_loc, pos_points)
pos_scales = scales[pos_ind]
# decode hard negative bbox
hard_neg_loc = loc_ltrb[hard_neg_ind].reshape([-1, 4])
hard_neg_strides = strides[hard_neg_ind]
hard_neg_points = points[hard_neg_ind].reshape([-1, 2])
hard_neg_loc = hard_neg_loc * hard_neg_strides
hard_neg_bbox = self.decode_bbox(hard_neg_loc, hard_neg_points)
hard_neg_scales = scales[hard_neg_ind]
# iou between pos bbox and hard negative bbox
        hard_neg_pos_iou = self.calculate_iou(hard_neg_bbox, pos_bbox)
### select potential positives from hard negatives
# scale flag
        scale_temp = paddle.abs(
            pos_scales.reshape([-1])[None, :] -
            hard_neg_scales.reshape([-1])[:, None])
        scale_flag = (scale_temp <= 1.)
# iou flag
iou_flag = (hard_neg_pos_iou >= iou_thresh)
# same class flag
pos_class = class_ind[pos_ind]
hard_neg_class = class_ind[hard_neg_ind]
class_flag = pos_class[None, :] - hard_neg_class[:, None]
class_flag = (class_flag == 0)
# hard negative point inside positive bbox flag
ltrb_temp = paddle.stack(
[
hard_neg_points[:, None, 0] - pos_bbox[None, :, 0],
hard_neg_points[:, None, 1] - pos_bbox[None, :, 1],
pos_bbox[None, :, 2] - hard_neg_points[:, None, 0],
pos_bbox[None, :, 3] - hard_neg_points[:, None, 1]
],
axis=-1)
inside_flag = ltrb_temp.min(axis=-1) > 0
# reset iou
valid_flag = (iou_flag & class_flag & inside_flag & scale_flag)
invalid_iou = paddle.zeros_like(hard_neg_pos_iou)
hard_neg_pos_iou = paddle.where(valid_flag, hard_neg_pos_iou,
invalid_iou)
pos_hard_neg_max_iou = hard_neg_pos_iou.max(axis=-1)
        # select potential positives
potential_pos_ind = (pos_hard_neg_max_iou > 0.)
num_potential_pos = paddle.nonzero(potential_pos_ind).shape[0]
if num_potential_pos == 0:
return None
        ### calculate loc targets: aggregate all matching bboxes as the bbox target of each potential positive
# prepare data
potential_points = hard_neg_points[potential_pos_ind].reshape([-1, 2])
potential_strides = hard_neg_strides[potential_pos_ind]
potential_valid_flag = valid_flag[potential_pos_ind]
potential_pos_ind = hard_neg_ind[potential_pos_ind]
# get cls and box of matching positives
pos_cls = max_vals[pos_ind]
expand_pos_bbox = paddle.expand(
pos_bbox,
shape=[num_potential_pos, pos_bbox.shape[0], pos_bbox.shape[1]])
expand_pos_cls = paddle.expand(
pos_cls, shape=[num_potential_pos, pos_cls.shape[0]])
invalid_cls = paddle.zeros_like(expand_pos_cls)
expand_pos_cls = paddle.where(potential_valid_flag, expand_pos_cls,
invalid_cls)
expand_pos_cls = paddle.unsqueeze(expand_pos_cls, axis=-1)
# aggregate box based on cls_score
agg_bbox = (expand_pos_bbox * expand_pos_cls).sum(axis=1) \
/ expand_pos_cls.sum(axis=1)
agg_ltrb = self.encode_bbox(agg_bbox, potential_points)
agg_ltrb = agg_ltrb / potential_strides
# loc target for all pos
loc_targets[potential_pos_ind] = agg_ltrb
loc_mask[potential_pos_ind] = 1.
return loc_mask, loc_targets
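    # Editor's note (summary of the gating above): a hard negative is
    # promoted to a loc-only positive when, w.r.t. at least one positive,
    # all four flags hold: same predicted class, IoU >= iou_thresh, its
    # point lies inside the positive's box, and the FPN levels differ by
    # at most 1; its box target is the cls-score-weighted average of the
    # matching positive boxes.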
# get training targets
def get_targets_per_img(self, tea_cls, tea_loc, tea_iou, stu_cls, stu_loc,
stu_iou):
### sample selection
        # prepare data
tea_cls_scores = F.sigmoid(tea_cls) * F.sigmoid(tea_iou)
class_ind = paddle.argmax(tea_cls_scores, axis=-1)
max_vals = paddle.max(tea_cls_scores, axis=-1)
        # cls valid mask: positives are 1, hard negatives and negatives are 0.
        cls_mask = paddle.zeros_like(max_vals)
num_pos, num_hard_neg = 0, 0
# mean-std selection
        # use nonzero to turn the bool mask into int indices, because the
        # indices are later used to build two-dim indices.
        # use squeeze rather than reshape to avoid errors when no score is
        # larger than the threshold.
candidate_ind = paddle.nonzero(max_vals >= 0.1).squeeze(axis=-1)
num_candidate = candidate_ind.shape[0]
if num_candidate > 0:
# pos thresh = mean + std to select pos samples
candidate_score = max_vals[candidate_ind]
candidate_score_mean = candidate_score.mean()
candidate_score_std = candidate_score.std()
pos_thresh = (candidate_score_mean + candidate_score_std).clip(
max=0.4)
# select pos
pos_ind = paddle.nonzero(max_vals >= pos_thresh).squeeze(axis=-1)
num_pos = pos_ind.shape[0]
# select hard negatives as potential pos
hard_neg_ind = (max_vals >= 0.1) & (max_vals < pos_thresh)
hard_neg_ind = paddle.nonzero(hard_neg_ind).squeeze(axis=-1)
num_hard_neg = hard_neg_ind.shape[0]
        # if there is no positive, directly select the top-10 scores as positives.
if (num_pos == 0):
num_pos = 10
_, pos_ind = paddle.topk(max_vals, k=num_pos)
cls_mask[pos_ind] = 1.
### Consistency Regularization Training targets
# cls targets
pos_class_ind = class_ind[pos_ind]
cls_targets = paddle.zeros_like(tea_cls)
cls_targets[pos_ind, pos_class_ind] = tea_cls_scores[pos_ind,
pos_class_ind]
# hard negative cls target
if num_hard_neg != 0:
cls_targets[hard_neg_ind] = tea_cls_scores[hard_neg_ind]
# loc targets
loc_targets = paddle.zeros_like(tea_loc)
loc_targets[pos_ind] = tea_loc[pos_ind]
# iou targets
iou_targets = paddle.zeros(
shape=[tea_iou.shape[0]], dtype=tea_iou.dtype)
iou_targets[pos_ind] = F.sigmoid(
paddle.squeeze(
tea_iou, axis=-1)[pos_ind])
loc_mask = cls_mask.clone()
# select potential positive from hard negatives for loc_task training
if (num_hard_neg > 0) and self.hard_neg_mining_flag:
results = self.hard_neg_mining(tea_cls, tea_loc, tea_iou, pos_ind,
hard_neg_ind, loc_mask, loc_targets)
if results is not None:
loc_mask, loc_targets = results
loc_pos_ind = paddle.nonzero(loc_mask > 0.).squeeze(axis=-1)
iou_targets[loc_pos_ind] = F.sigmoid(
paddle.squeeze(
tea_iou, axis=-1)[loc_pos_ind])
return cls_mask, loc_mask, \
cls_targets, loc_targets, iou_targets
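    # Editor's note on the selection rule above: candidates are points with
    # teacher score (sigmoid(cls) * sigmoid(iou)) >= 0.1; the positive
    # threshold is mean + std of the candidate scores, clipped to 0.4, so
    # the positive set adapts per image. Hard negatives keep only a soft
    # cls target unless hard-negative mining promotes them for the loc task.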
def forward(self, student_prediction, teacher_prediction):
stu_cls_lvl, stu_loc_lvl, stu_iou_lvl = student_prediction
tea_cls_lvl, tea_loc_lvl, tea_iou_lvl, self.fpn_stride = teacher_prediction
        # H and W of each level (used for aggregating targets)
self.lvl_hw = []
for t in tea_cls_lvl:
_, _, H, W = t.shape
self.lvl_hw.append([H, W])
# levels to images
stu_cls_img = levels_to_images(stu_cls_lvl)
stu_loc_img = levels_to_images(stu_loc_lvl)
stu_iou_img = levels_to_images(stu_iou_lvl)
tea_cls_img = levels_to_images(tea_cls_lvl)
tea_loc_img = levels_to_images(tea_loc_lvl)
tea_iou_img = levels_to_images(tea_iou_lvl)
with paddle.no_grad():
cls_mask, loc_mask, \
cls_targets, loc_targets, iou_targets = multi_apply(
self.get_targets_per_img,
tea_cls_img,
tea_loc_img,
tea_iou_img,
stu_cls_img,
stu_loc_img,
stu_iou_img
)
        # flatten predictions
stu_cls = paddle.concat(stu_cls_img, axis=0)
stu_loc = paddle.concat(stu_loc_img, axis=0)
stu_iou = paddle.concat(stu_iou_img, axis=0)
# flatten targets
cls_mask = paddle.concat(cls_mask, axis=0)
loc_mask = paddle.concat(loc_mask, axis=0)
cls_targets = paddle.concat(cls_targets, axis=0)
loc_targets = paddle.concat(loc_targets, axis=0)
iou_targets = paddle.concat(iou_targets, axis=0)
### Training Weights and avg factor
# find positives
cls_pos_ind = paddle.nonzero(cls_mask > 0.).squeeze(axis=-1)
loc_pos_ind = paddle.nonzero(loc_mask > 0.).squeeze(axis=-1)
# cls weight
cls_sample_weights = paddle.ones([cls_targets.shape[0]])
cls_avg_factor = paddle.max(cls_targets[cls_pos_ind],
axis=-1).sum().item()
# loc weight
loc_sample_weights = paddle.max(cls_targets[loc_pos_ind], axis=-1)
loc_avg_factor = loc_sample_weights.sum().item()
# iou weight
iou_sample_weights = paddle.ones([loc_pos_ind.shape[0]])
iou_avg_factor = loc_pos_ind.shape[0]
### unsupervised loss
# cls loss
loss_cls = self.quality_focal_loss(
stu_cls,
cls_targets,
quality=stu_iou,
weights=cls_sample_weights,
avg_factor=cls_avg_factor) * self.cls_weight
# iou loss
pos_stu_iou = paddle.squeeze(stu_iou, axis=-1)[loc_pos_ind]
pos_iou_targets = iou_targets[loc_pos_ind]
loss_iou = F.binary_cross_entropy(
F.sigmoid(pos_stu_iou), pos_iou_targets,
reduction='none') * iou_sample_weights
loss_iou = loss_iou.sum() / iou_avg_factor * self.iou_weight
# box loss
pos_stu_loc = stu_loc[loc_pos_ind]
pos_loc_targets = loc_targets[loc_pos_ind]
loss_box = self.iou_loss(
pos_stu_loc,
pos_loc_targets,
weights=loc_sample_weights,
avg_factor=loc_avg_factor)
loss_box = loss_box * self.reg_weight
loss_all = {
"loss_cls": loss_cls,
"loss_box": loss_box,
"loss_iou": loss_iou,
}
return loss_all
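# Usage sketch (editor's note; names and strides are illustrative): both
# predictions are per-FPN-level lists of NCHW tensors, and the teacher
# tuple additionally carries the FPN strides, e.g.
#   student = (cls_lvls, loc_lvls, iou_lvls)
#   teacher = (cls_lvls_t, loc_lvls_t, iou_lvls_t, [8, 16, 32, 64, 128])
#   losses = FCOSLossCR()(student, teacher)
#   # -> {'loss_cls': ..., 'loss_box': ..., 'loss_iou': ...}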
================================================
FILE: ppdet/modeling/losses/focal_loss.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn.functional as F
import paddle.nn as nn
from ppdet.core.workspace import register
__all__ = ['FocalLoss', 'Weighted_FocalLoss']
@register
class FocalLoss(nn.Layer):
"""A wrapper around paddle.nn.functional.sigmoid_focal_loss.
Args:
use_sigmoid (bool): currently only support use_sigmoid=True
alpha (float): parameter alpha in Focal Loss
gamma (float): parameter gamma in Focal Loss
loss_weight (float): final loss will be multiplied by this
"""
def __init__(self,
use_sigmoid=True,
alpha=0.25,
gamma=2.0,
loss_weight=1.0):
super(FocalLoss, self).__init__()
        assert use_sigmoid, \
            'Focal Loss only supports sigmoid at the moment'
self.use_sigmoid = use_sigmoid
self.alpha = alpha
self.gamma = gamma
self.loss_weight = loss_weight
def forward(self, pred, target, reduction='none'):
"""forward function.
Args:
pred (Tensor): logits of class prediction, of shape (N, num_classes)
target (Tensor): target class label, of shape (N, )
reduction (str): the way to reduce loss, one of (none, sum, mean)
"""
num_classes = pred.shape[1]
        target = F.one_hot(target, num_classes + 1).cast(pred.dtype)
target = target[:, :-1].detach()
loss = F.sigmoid_focal_loss(
pred, target, alpha=self.alpha, gamma=self.gamma,
reduction=reduction)
return loss * self.loss_weight
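# Usage sketch (editor's note, illustrative shapes): labels equal to
# num_classes act as the background bucket and are zeroed out by the
# one-hot slice above, e.g.
#   logits = paddle.rand([4, 80])
#   labels = paddle.to_tensor([3, 80, 15, 80])   # 80 == background here
#   loss = FocalLoss()(logits, labels, reduction='mean')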
@register
class Weighted_FocalLoss(FocalLoss):
"""A wrapper around paddle.nn.functional.sigmoid_focal_loss.
Args:
use_sigmoid (bool): currently only support use_sigmoid=True
alpha (float): parameter alpha in Focal Loss
gamma (float): parameter gamma in Focal Loss
loss_weight (float): final loss will be multiplied by this
"""
def __init__(self,
use_sigmoid=True,
alpha=0.25,
gamma=2.0,
loss_weight=1.0,
reduction="mean"):
        super(Weighted_FocalLoss, self).__init__()
        assert use_sigmoid, \
            'Focal Loss only supports sigmoid at the moment'
self.use_sigmoid = use_sigmoid
self.alpha = alpha
self.gamma = gamma
self.loss_weight = loss_weight
self.reduction = reduction
def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None):
"""forward function.
Args:
pred (Tensor): logits of class prediction, of shape (N, num_classes)
target (Tensor): target class label, of shape (N, )
reduction (str): the way to reduce loss, one of (none, sum, mean)
"""
assert reduction_override in (None, 'none', 'mean', 'sum')
reduction = (
reduction_override if reduction_override else self.reduction)
num_classes = pred.shape[1]
target = F.one_hot(target, num_classes + 1).astype(pred.dtype)
target = target[:, :-1].detach()
loss = F.sigmoid_focal_loss(
pred, target, alpha=self.alpha, gamma=self.gamma,
reduction='none')
if weight is not None:
if weight.shape != loss.shape:
if weight.shape[0] == loss.shape[0]:
# For most cases, weight is of shape (num_priors, ),
# which means it does not have the second axis num_class
weight = weight.reshape((-1, 1))
else:
# Sometimes, weight per anchor per class is also needed. e.g.
# in FSAF. But it may be flattened of shape
# (num_priors x num_class, ), while loss is still of shape
# (num_priors, num_class).
assert weight.numel() == loss.numel()
weight = weight.reshape((loss.shape[0], -1))
assert weight.ndim == loss.ndim
loss = loss * weight
# if avg_factor is not specified, just reduce the loss
if avg_factor is None:
if reduction == 'mean':
loss = loss.mean()
elif reduction == 'sum':
loss = loss.sum()
else:
# if reduction is mean, then average the loss by avg_factor
if reduction == 'mean':
# Avoid causing ZeroDivisionError when avg_factor is 0.0,
# i.e., all labels of an image belong to ignore index.
eps = 1e-10
loss = loss.sum() / (avg_factor + eps)
# if reduction is 'none', then do nothing, otherwise raise an error
elif reduction != 'none':
                raise ValueError('avg_factor cannot be used with reduction="sum"')
return loss * self.loss_weight
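# Usage sketch (editor's note, illustrative values): a per-sample weight
# broadcasts over classes, and avg_factor replaces the element count when
# reduction is 'mean', e.g.
#   logits = paddle.rand([4, 80])
#   labels = paddle.to_tensor([3, 80, 15, 2])
#   w = paddle.to_tensor([1., 0., 1., 1.])       # mask out the 2nd sample
#   loss = Weighted_FocalLoss()(logits, labels, weight=w, avg_factor=3.)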
================================================
FILE: ppdet/modeling/losses/gfocal_loss.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The code is based on:
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/gfocal_loss.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from ppdet.modeling import ops
__all__ = ['QualityFocalLoss', 'DistributionFocalLoss']
def quality_focal_loss(pred, target, beta=2.0, use_sigmoid=True):
"""
Quality Focal Loss (QFL) is from `Generalized Focal Loss: Learning
Qualified and Distributed Bounding Boxes for Dense Object Detection