Repository: clxia12/RT-DETRv3 Branch: main Commit: 349e7d99a506 Files: 393 Total size: 3.9 MB Directory structure: gitextract_swotw2om/ ├── .gitignore ├── LICENSE ├── README.md ├── configs/ │ ├── datasets/ │ │ ├── coco_detection.yml │ │ ├── coco_instance.yml │ │ ├── culane.yml │ │ ├── dota.yml │ │ ├── dota_ms.yml │ │ ├── lvis_detection.yml │ │ ├── mcmot.yml │ │ ├── mot.yml │ │ ├── objects365_detection.yml │ │ ├── roadsign_voc.yml │ │ ├── sniper_coco_detection.yml │ │ ├── sniper_visdrone_detection.yml │ │ ├── spine_coco.yml │ │ ├── visdrone_detection.yml │ │ ├── voc.yml │ │ └── wider_face.yml │ ├── rtdetrv3/ │ │ ├── _base_/ │ │ │ ├── optimizer_6x.yml │ │ │ ├── rtdetr_reader.yml │ │ │ └── rtdetrv3_r50vd.yml │ │ ├── rtdetrv3_r18vd_6x_coco.yml │ │ ├── rtdetrv3_r18vd_6x_lvis.yml │ │ ├── rtdetrv3_r34vd_6x_coco.yml │ │ ├── rtdetrv3_r50vd_6x_coco.yml │ │ └── rtdetrv3_r50vd_6x_lvis.yml │ └── runtime.yml ├── dataset/ │ ├── coco/ │ │ └── download_coco.py │ ├── dota/ │ │ └── .gitignore │ ├── mot/ │ │ └── gen_labels_MOT.py │ ├── roadsign_voc/ │ │ ├── download_roadsign_voc.py │ │ └── label_list.txt │ ├── spine_coco/ │ │ └── download_spine_coco.py │ ├── voc/ │ │ ├── create_list.py │ │ ├── download_voc.py │ │ └── label_list.txt │ └── wider_face/ │ └── download_wider_face.sh ├── ppdet/ │ ├── __init__.py │ ├── core/ │ │ ├── __init__.py │ │ ├── config/ │ │ │ ├── __init__.py │ │ │ ├── schema.py │ │ │ └── yaml_helpers.py │ │ └── workspace.py │ ├── data/ │ │ ├── __init__.py │ │ ├── crop_utils/ │ │ │ ├── __init__.py │ │ │ ├── annotation_cropper.py │ │ │ └── chip_box_utils.py │ │ ├── culane_utils.py │ │ ├── reader.py │ │ ├── shm_utils.py │ │ ├── source/ │ │ │ ├── __init__.py │ │ │ ├── category.py │ │ │ ├── coco.py │ │ │ ├── culane.py │ │ │ ├── dataset.py │ │ │ ├── keypoint_coco.py │ │ │ ├── lvis.py │ │ │ ├── mot.py │ │ │ ├── pose3d_cmb.py │ │ │ ├── sniper_coco.py │ │ │ ├── voc.py │ │ │ └── widerface.py │ │ ├── transform/ │ │ │ ├── __init__.py │ │ │ ├── atss_assigner.py │ │ │ ├── autoaugment_utils.py │ │ │ ├── batch_operators.py │ │ │ ├── culane_operators.py │ │ │ ├── gridmask_utils.py │ │ │ ├── keypoint_operators.py │ │ │ ├── keypoints_3d_operators.py │ │ │ ├── mot_operators.py │ │ │ ├── op_helper.py │ │ │ ├── operators.py │ │ │ └── rotated_operators.py │ │ └── utils.py │ ├── engine/ │ │ ├── __init__.py │ │ ├── callbacks.py │ │ ├── env.py │ │ ├── export_utils.py │ │ ├── naive_sync_bn.py │ │ ├── tracker.py │ │ ├── trainer.py │ │ ├── trainer_cot.py │ │ └── trainer_ssod.py │ ├── ext_op/ │ │ ├── README.md │ │ ├── csrc/ │ │ │ ├── matched_rbox_iou/ │ │ │ │ ├── matched_rbox_iou.cc │ │ │ │ └── matched_rbox_iou.cu │ │ │ ├── nms_rotated/ │ │ │ │ ├── nms_rotated.cc │ │ │ │ └── nms_rotated.cu │ │ │ └── rbox_iou/ │ │ │ ├── rbox_iou.cc │ │ │ ├── rbox_iou.cu │ │ │ └── rbox_iou_utils.h │ │ ├── setup.py │ │ └── unittest/ │ │ ├── test_matched_rbox_iou.py │ │ └── test_rbox_iou.py │ ├── metrics/ │ │ ├── __init__.py │ │ ├── coco_utils.py │ │ ├── culane_metrics.py │ │ ├── fast_cocoeval/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── ext/ │ │ │ │ ├── cocoeval.cc │ │ │ │ ├── cocoeval.h │ │ │ │ └── setup.py │ │ │ └── fast_cocoeval.py │ │ ├── json_results.py │ │ ├── keypoint_metrics.py │ │ ├── lvis_utils.py │ │ ├── map_utils.py │ │ ├── mcmot_metrics.py │ │ ├── metrics.py │ │ ├── mot_metrics.py │ │ ├── munkres.py │ │ ├── pose3d_metrics.py │ │ └── widerface_utils.py │ ├── model_zoo/ │ │ ├── .gitignore │ │ ├── __init__.py │ │ ├── model_zoo.py │ │ └── tests/ │ │ ├── __init__.py │ │ ├── test_get_model.py │ │ └── test_list_model.py │ ├── 
modeling/ │ │ ├── __init__.py │ │ ├── architectures/ │ │ │ ├── __init__.py │ │ │ ├── blazeface.py │ │ │ ├── bytetrack.py │ │ │ ├── cascade_rcnn.py │ │ │ ├── centernet.py │ │ │ ├── centertrack.py │ │ │ ├── clrnet.py │ │ │ ├── deepsort.py │ │ │ ├── detr.py │ │ │ ├── detr_ssod.py │ │ │ ├── fairmot.py │ │ │ ├── faster_rcnn.py │ │ │ ├── fcos.py │ │ │ ├── gfl.py │ │ │ ├── jde.py │ │ │ ├── keypoint_hrhrnet.py │ │ │ ├── keypoint_hrnet.py │ │ │ ├── keypoint_petr.py │ │ │ ├── keypoint_vitpose.py │ │ │ ├── mask_rcnn.py │ │ │ ├── meta_arch.py │ │ │ ├── multi_stream_detector.py │ │ │ ├── picodet.py │ │ │ ├── pose3d_metro.py │ │ │ ├── ppyoloe.py │ │ │ ├── queryinst.py │ │ │ ├── retinanet.py │ │ │ ├── rtdetrv3.py │ │ │ ├── s2anet.py │ │ │ ├── solov2.py │ │ │ ├── sparse_rcnn.py │ │ │ ├── ssd.py │ │ │ ├── tood.py │ │ │ ├── ttfnet.py │ │ │ ├── yolo.py │ │ │ ├── yolof.py │ │ │ └── yolox.py │ │ ├── assigners/ │ │ │ ├── __init__.py │ │ │ ├── atss_assigner.py │ │ │ ├── clrnet_assigner.py │ │ │ ├── fcosr_assigner.py │ │ │ ├── hungarian_assigner.py │ │ │ ├── max_iou_assigner.py │ │ │ ├── pose_utils.py │ │ │ ├── rotated_task_aligned_assigner.py │ │ │ ├── simota_assigner.py │ │ │ ├── task_aligned_assigner.py │ │ │ ├── task_aligned_assigner_cr.py │ │ │ ├── uniform_assigner.py │ │ │ └── utils.py │ │ ├── backbones/ │ │ │ ├── __init__.py │ │ │ ├── blazenet.py │ │ │ ├── clrnet_resnet.py │ │ │ ├── convnext.py │ │ │ ├── csp_darknet.py │ │ │ ├── cspresnet.py │ │ │ ├── darknet.py │ │ │ ├── dla.py │ │ │ ├── esnet.py │ │ │ ├── focalnet.py │ │ │ ├── ghostnet.py │ │ │ ├── hardnet.py │ │ │ ├── hgnet_v2.py │ │ │ ├── hrnet.py │ │ │ ├── lcnet.py │ │ │ ├── lite_hrnet.py │ │ │ ├── mobilenet_v1.py │ │ │ ├── mobilenet_v3.py │ │ │ ├── mobileone.py │ │ │ ├── name_adapter.py │ │ │ ├── res2net.py │ │ │ ├── resnet.py │ │ │ ├── senet.py │ │ │ ├── shufflenet_v2.py │ │ │ ├── swin_transformer.py │ │ │ ├── trans_encoder.py │ │ │ ├── transformer_utils.py │ │ │ ├── vgg.py │ │ │ ├── vision_transformer.py │ │ │ ├── vit_mae.py │ │ │ └── vitpose.py │ │ ├── bbox_utils.py │ │ ├── clrnet_utils.py │ │ ├── cls_utils.py │ │ ├── heads/ │ │ │ ├── __init__.py │ │ │ ├── bbox_head.py │ │ │ ├── cascade_head.py │ │ │ ├── centernet_head.py │ │ │ ├── centertrack_head.py │ │ │ ├── clrnet_head.py │ │ │ ├── detr_head.py │ │ │ ├── face_head.py │ │ │ ├── fcos_head.py │ │ │ ├── fcosr_head.py │ │ │ ├── gfl_head.py │ │ │ ├── keypoint_hrhrnet_head.py │ │ │ ├── mask_head.py │ │ │ ├── petr_head.py │ │ │ ├── pico_head.py │ │ │ ├── ppyoloe_contrast_head.py │ │ │ ├── ppyoloe_head.py │ │ │ ├── ppyoloe_ins_head.py │ │ │ ├── ppyoloe_r_head.py │ │ │ ├── retina_head.py │ │ │ ├── roi_extractor.py │ │ │ ├── s2anet_head.py │ │ │ ├── simota_head.py │ │ │ ├── solov2_head.py │ │ │ ├── sparse_roi_head.py │ │ │ ├── sparsercnn_head.py │ │ │ ├── ssd_head.py │ │ │ ├── tood_head.py │ │ │ ├── ttf_head.py │ │ │ ├── vitpose_head.py │ │ │ ├── yolo_head.py │ │ │ └── yolof_head.py │ │ ├── initializer.py │ │ ├── keypoint_utils.py │ │ ├── lane_utils.py │ │ ├── layers.py │ │ ├── losses/ │ │ │ ├── __init__.py │ │ │ ├── clrnet_line_iou_loss.py │ │ │ ├── clrnet_loss.py │ │ │ ├── cot_loss.py │ │ │ ├── ctfocal_loss.py │ │ │ ├── detr_loss.py │ │ │ ├── fairmot_loss.py │ │ │ ├── fcos_loss.py │ │ │ ├── focal_loss.py │ │ │ ├── gfocal_loss.py │ │ │ ├── iou_aware_loss.py │ │ │ ├── iou_loss.py │ │ │ ├── jde_loss.py │ │ │ ├── keypoint_loss.py │ │ │ ├── pose3d_loss.py │ │ │ ├── probiou_loss.py │ │ │ ├── queryinst_loss.py │ │ │ ├── smooth_l1_loss.py │ │ │ ├── solov2_loss.py │ │ │ ├── sparsercnn_loss.py │ │ │ ├── 
ssd_loss.py │ │ │ ├── supcontrast.py │ │ │ ├── varifocal_loss.py │ │ │ └── yolo_loss.py │ │ ├── mot/ │ │ │ ├── __init__.py │ │ │ ├── matching/ │ │ │ │ ├── __init__.py │ │ │ │ ├── deepsort_matching.py │ │ │ │ ├── jde_matching.py │ │ │ │ └── ocsort_matching.py │ │ │ ├── motion/ │ │ │ │ ├── __init__.py │ │ │ │ ├── gmc.py │ │ │ │ ├── kalman_filter.py │ │ │ │ └── ocsort_kalman_filter.py │ │ │ ├── tracker/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base_jde_tracker.py │ │ │ │ ├── base_sde_tracker.py │ │ │ │ ├── botsort_tracker.py │ │ │ │ ├── center_tracker.py │ │ │ │ ├── deepsort_tracker.py │ │ │ │ ├── jde_tracker.py │ │ │ │ └── ocsort_tracker.py │ │ │ ├── utils.py │ │ │ └── visualization.py │ │ ├── necks/ │ │ │ ├── __init__.py │ │ │ ├── bifpn.py │ │ │ ├── blazeface_fpn.py │ │ │ ├── centernet_fpn.py │ │ │ ├── channel_mapper.py │ │ │ ├── clrnet_fpn.py │ │ │ ├── csp_pan.py │ │ │ ├── custom_pan.py │ │ │ ├── dilated_encoder.py │ │ │ ├── es_pan.py │ │ │ ├── fpn.py │ │ │ ├── hrfpn.py │ │ │ ├── lc_pan.py │ │ │ ├── ttf_fpn.py │ │ │ └── yolo_fpn.py │ │ ├── ops.py │ │ ├── post_process.py │ │ ├── proposal_generator/ │ │ │ ├── __init__.py │ │ │ ├── anchor_generator.py │ │ │ ├── embedding_rpn_head.py │ │ │ ├── proposal_generator.py │ │ │ ├── rpn_head.py │ │ │ ├── target.py │ │ │ └── target_layer.py │ │ ├── rbox_utils.py │ │ ├── reid/ │ │ │ ├── __init__.py │ │ │ ├── fairmot_embedding_head.py │ │ │ ├── jde_embedding_head.py │ │ │ ├── pplcnet_embedding.py │ │ │ ├── pyramidal_embedding.py │ │ │ ├── resnet.py │ │ │ └── resnet_embedding.py │ │ ├── shape_spec.py │ │ ├── ssod/ │ │ │ ├── __init__.py │ │ │ ├── losses.py │ │ │ └── utils.py │ │ ├── tests/ │ │ │ ├── __init__.py │ │ │ ├── test_architectures.py │ │ │ ├── test_base.py │ │ │ ├── test_mstest.py │ │ │ ├── test_ops.py │ │ │ └── test_yolov3_loss.py │ │ └── transformers/ │ │ ├── __init__.py │ │ ├── deformable_transformer.py │ │ ├── detr_transformer.py │ │ ├── dino_transformer.py │ │ ├── ext_op/ │ │ │ ├── README.md │ │ │ ├── ms_deformable_attn_op.cc │ │ │ ├── ms_deformable_attn_op.cu │ │ │ ├── setup_ms_deformable_attn_op.py │ │ │ └── test_ms_deformable_attn_op.py │ │ ├── group_detr_transformer.py │ │ ├── hybrid_encoder.py │ │ ├── mask_dino_transformer.py │ │ ├── mask_rtdetr_transformer.py │ │ ├── matchers.py │ │ ├── petr_transformer.py │ │ ├── position_encoding.py │ │ ├── rtdetr_transformer.py │ │ ├── rtdetr_transformerv2.py │ │ ├── rtdetr_transformerv3.py │ │ └── utils.py │ ├── optimizer/ │ │ ├── __init__.py │ │ ├── adamw.py │ │ ├── ema.py │ │ ├── optimizer.py │ │ └── utils.py │ ├── slim/ │ │ ├── __init__.py │ │ ├── distill_loss.py │ │ ├── distill_model.py │ │ ├── ofa.py │ │ ├── prune.py │ │ ├── quant.py │ │ └── unstructured_prune.py │ └── utils/ │ ├── __init__.py │ ├── cam_utils.py │ ├── check.py │ ├── checkpoint.py │ ├── cli.py │ ├── colormap.py │ ├── compact.py │ ├── download.py │ ├── fuse_utils.py │ ├── logger.py │ ├── profiler.py │ ├── stats.py │ ├── visualizer.py │ └── voc_utils.py ├── requirements.txt ├── scripts/ │ ├── build_wheel.sh │ ├── eval.sh │ ├── kill.sh │ └── train.sh └── tools/ ├── anchor_cluster.py ├── box_distribution.py ├── cam_ppdet.py ├── eval.py ├── eval_mot.py ├── export_model.py ├── gen_semi_coco.py ├── infer.py ├── infer_culane.py ├── infer_mot.py ├── post_quant.py ├── slice_image.py ├── sniper_params_stats.py ├── train.py └── x2coco.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore 
================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/latest/usage/project/#working-with-version-control .pdm.toml .pdm-python .pdm-build/ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
================================================ FILE: README.md ================================================ English | [简体中文](README_cn.md) ## RT-DETRv3: Real-time End-to-End Object Detection with Hierarchical Dense Positive Supervision :fire::fire:**[WACV 2025 Oral]** The official implementation of the paper "[RT-DETRv3: Real-time End-to-End Object Detection with Hierarchical Dense Positive Supervision](https://arxiv.org/pdf/2409.08475)". \ [[`arXiv`](https://arxiv.org/pdf/2409.08475)] ![image](https://github.com/user-attachments/assets/5910d729-cc44-49f4-b404-b6631576930f) ## Model Zoo on COCO | Model | Epoch | Backbone | Input shape | $AP^{val}$ | $AP^{val}_{50}$| Params(M) | FLOPs(G) | T4 TensorRT FP16(FPS) | Weight | Config | Log |:--------------:|:-----:|:----------:| :-------:|:--------------------------:|:---------------------------:|:---------:|:--------:| :---------------------: |:------------------------------------------------------------------------------------:|:-------------------------------------------:|:---| | RT-DETRv3-R18 | 6x | ResNet-18 | 640 | 48.1 | 66.2 | 20 | 60 | 217 |[baidu 网盘](https://pan.baidu.com/s/1s7lyT6_fHmczoegQZXdX-w?pwd=54jp) [google drive](https://drive.google.com/file/d/1zIDOjn1qDccC3TBsDlGQHOjVrehd26bk/view?usp=drive_link)| [config](./configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml) | | RT-DETRv3-R34 | 6x | ResNet-34 | 640 | 49.9 | 67.7 | 31 | 92 | 161 | [baidu 网盘](https://pan.baidu.com/s/1VCg6oqNVF9_ZZdmlhUBgSA?pwd=pi32) [google drive](https://drive.google.com/file/d/12-wqAF8i67eqbocaWPK33d4tFkN2wGi2/view?usp=drive_link)| [config](./configs/rtdetrv3/rtdetrv3_r34vd_6x_coco.yml) | | RT-DETRv3-R50 | 6x | ResNet-50 | 640 | 53.4 | 71.7 | 42 | 136 | 108 | [baidu 网盘](https://pan.baidu.com/s/1DuvrpMIqbU5okoDp16C94g?pwd=wrxy) [google drive](https://drive.google.com/file/d/1wfJE-QgdgqKE0IkiTuoD5HEbZwwZg3sQ/view?usp=drive_link)| [config](./configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml) | | RT-DETRv3-R101 | 6x | ResNet-101 | 640 | 54.6 | 73.1 | 76 | 259 | 74 | | [config](./configs/rtdetrv3/rtdetrv3_r101vd_6x_coco.yml) | **Notes:** - RT-DETRv3 uses 4 GPUs for training. - RT-DETRv3 was trained on COCO train2017 and evaluated on val2017. ## Model Zoo on LVIS | Model | Epoch | Backbone | Input shape | AP | $AP_{r}$ | $AP_{c}$ | $AP_{f}$ | Weight | Config | Log |:--------------:|:-----:|:----------:| :-------:|:--------------------------:|:---------------------------:|:---------:| :---------------------: |:------------------------------------------------------------------------------------:|:-------------------------------------------:|:---| | RT-DETRv3-R18 | 6x | ResNet-18 | 640 | 26.5 | 12.5 | 24.3 | 35.2 | | [config](./configs/rtdetrv3/rtdetrv3_r18vd_6x_lvis.yml) | | RT-DETRv3-R50 | 6x | ResNet-50 | 640 | 33.9 | 20.2 | 32.5 | 41.5 | | [config](./configs/rtdetrv3/rtdetrv3_r50vd_6x_lvis.yml) | ## Quick start
Install requirements ```bash pip install -r requirements.txt ```
Compile (optional) ```bash cd ./ppdet/modeling/transformers/ext_op/ python setup_ms_deformable_attn_op.py install ``` See [details](./ppdet/modeling/transformers/ext_op/)
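As a quick post-build check, the compiled op can be imported directly. A minimal sketch; the module and function names (`deformable_detr_ops`, `ms_deformable_attn`) are assumptions taken from the setup script and unit test under `ppdet/modeling/transformers/ext_op/`, so verify them against your build:

```python
# Hedged post-build check: module/function names assumed from
# setup_ms_deformable_attn_op.py and test_ms_deformable_attn_op.py.
try:
    from deformable_detr_ops import ms_deformable_attn  # noqa: F401
    print("custom ms_deformable_attn op is available")
except ImportError:
    print("custom op not found; the pure-Paddle fallback will be used")
```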
Data preparation - Download and extract COCO 2017 train and val images. ``` path/to/coco/ annotations/ # annotation json files train2017/ # train images val2017/ # val images ``` - Set [`dataset_dir`](configs/datasets/coco_detection.yml) in the dataset config to your COCO path (a quick layout check is sketched below)
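Before training, a quick layout check can catch a missing annotation file early. A minimal sketch, not part of the repo; `coco_root` is a placeholder for your dataset path:

```python
# Hypothetical layout check for the COCO directory sketched above.
import os

coco_root = "path/to/coco"  # placeholder: point at your dataset root
expected = [
    "annotations/instances_train2017.json",
    "annotations/instances_val2017.json",
    "train2017",
    "val2017",
]
for rel in expected:
    path = os.path.join(coco_root, rel)
    print("OK     " if os.path.exists(path) else "MISSING", path)
```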
Training & Evaluation & Testing - Training on a Single GPU: ```shell # training on single-GPU export CUDA_VISIBLE_DEVICES=0 python tools/train.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml --eval ``` - Training on Multiple GPUs: ```shell # training on multi-GPU export CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml --fleet --eval ``` - Evaluation: ```shell python tools/eval.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml \ -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetrv3_r18vd_6x_coco.pdparams ``` - Inference: ```shell python tools/infer.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml \ -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetrv3_r18vd_6x_coco.pdparams \ --infer_img=./demo/000000570688.jpg ```
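The `-c` config file and `-o` overrides in the commands above are handled by ppdet's workspace helpers, so the same config can be loaded and inspected programmatically. A minimal sketch; the keys printed below come from the YAML files in this repo:

```python
# Load the training config the way tools/train.py does.
from ppdet.core.workspace import load_config, merge_config

cfg = load_config("configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml")
merge_config({"log_iter": 100})  # -o key=value overrides map onto merge_config

print(cfg["architecture"])     # RTDETRV3
print(cfg["epoch"])            # 72, i.e. the 6x schedule
print(cfg["num_queries_o2m"])  # 450, the auxiliary one-to-many branch
```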
## Deploy
1. Export model ```shell python tools/export_model.py -c configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml \ -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetrv3_r18vd_6x_coco.pdparams trt=True \ --output_dir=output_inference ```
2. Convert to ONNX - Install [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX) and ONNX ```shell pip install onnx==1.13.0 pip install paddle2onnx==1.0.5 ``` - Convert: ```shell paddle2onnx --model_dir=./output_inference/rtdetrv3_r18vd_6x_coco/ \ --model_filename model.pdmodel \ --params_filename model.pdiparams \ --opset_version 16 \ --save_file rtdetrv3_r18vd_6x_coco.onnx ```
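Before building a TensorRT engine, the exported ONNX graph can be smoke-tested with onnxruntime. A minimal sketch, assuming `onnxruntime` is installed and all inputs are float32; input names are introspected rather than hard-coded, since export settings determine whether extra inputs (e.g. `scale_factor`) are present:

```python
# Hedged smoke test: run the exported graph on all-zero inputs just to
# confirm it loads and executes (float32 inputs assumed).
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("rtdetrv3_r18vd_6x_coco.onnx",
                            providers=["CPUExecutionProvider"])
feeds = {}
for inp in sess.get_inputs():
    # replace symbolic/dynamic dims (e.g. batch) with 1
    shape = [d if isinstance(d, int) else 1 for d in inp.shape]
    feeds[inp.name] = np.zeros(shape, dtype=np.float32)

for meta, out in zip(sess.get_outputs(), sess.run(None, feeds)):
    print(meta.name, out.shape)
```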
3. Convert to TensorRT - TensorRT version >= 8.5.1 - For inference benchmarking, refer to [Benchmark](../benchmark) ```shell trtexec --onnx=./rtdetrv3_r18vd_6x_coco.onnx \ --workspace=4096 \ --shapes=image:1x3x640x640 \ --saveEngine=rtdetrv3_r18vd_6x_coco.trt \ --avgRuns=100 \ --fp16 ```
## Citation If you find RT-DETRv3 useful in your research, please consider giving a star ⭐ and citing: ``` @article{wang2024rt, title={RT-DETRv3: Real-time End-to-End Object Detection with Hierarchical Dense Positive Supervision}, author={Wang, Shuo and Xia, Chunlong and Lv, Feng and Shi, Yifeng}, journal={arXiv preprint arXiv:2409.08475}, year={2024} } ``` ================================================ FILE: configs/datasets/coco_detection.yml ================================================ metric: COCO num_classes: 80 TrainDataset: name: COCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] EvalDataset: name: COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO allow_empty: true TestDataset: name: ImageFolder anno_path: annotations/instances_val2017.json # also support txt (like VOC's label_list.txt) dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path' ================================================ FILE: configs/datasets/coco_instance.yml ================================================ metric: COCO num_classes: 80 TrainDataset: name: COCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json dataset_dir: dataset/coco data_fields: ['image', 'gt_bbox', 'gt_class', 'gt_poly', 'is_crowd'] EvalDataset: name: COCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json dataset_dir: dataset/coco TestDataset: name: ImageFolder anno_path: annotations/instances_val2017.json # also support txt (like VOC's label_list.txt) dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path' ================================================ FILE: configs/datasets/culane.yml ================================================ metric: CULaneMetric num_classes: 5 # 4 lanes + background cut_height: &cut_height 270 dataset_dir: &dataset_dir dataset/culane TrainDataset: name: CULaneDataSet dataset_dir: *dataset_dir list_path: 'list/train_gt.txt' split: train cut_height: *cut_height EvalDataset: name: CULaneDataSet dataset_dir: *dataset_dir list_path: 'list/test.txt' split: test cut_height: *cut_height TestDataset: name: CULaneDataSet dataset_dir: *dataset_dir list_path: 'list/test.txt' split: test cut_height: *cut_height ================================================ FILE: configs/datasets/dota.yml ================================================ metric: RBOX num_classes: 15 TrainDataset: !COCODataSet image_dir: trainval1024/images anno_path: trainval1024/DOTA_trainval1024.json dataset_dir: dataset/dota/ data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly'] EvalDataset: !COCODataSet image_dir: trainval1024/images anno_path: trainval1024/DOTA_trainval1024.json dataset_dir: dataset/dota/ data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly'] TestDataset: !ImageFolder anno_path: test1024/DOTA_test1024.json dataset_dir: dataset/dota/ ================================================ FILE: configs/datasets/dota_ms.yml ================================================ metric: RBOX num_classes: 15 TrainDataset: !COCODataSet image_dir: trainval1024/images anno_path: trainval1024/DOTA_trainval1024.json dataset_dir: dataset/dota_ms/ data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly'] EvalDataset: !COCODataSet image_dir: trainval1024/images anno_path: 
trainval1024/DOTA_trainval1024.json dataset_dir: dataset/dota_ms/ data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly'] TestDataset: !ImageFolder anno_path: test1024/DOTA_test1024.json dataset_dir: dataset/dota_ms/ ================================================ FILE: configs/datasets/lvis_detection.yml ================================================ metric: LVIS num_classes: 1203 TrainDataset: name: LVISDataSet image_dir: . anno_path: annotations/lvis_v1_train.json dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] EvalDataset: name: LVISDataSet image_dir: . anno_path: annotations/lvis_v1_val.json dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO allow_empty: true TestDataset: name: ImageFolder anno_path: annotations/lvis_v1_val.json # also support txt (like VOC's label_list.txt) dataset_dir: /root/paddlejob/workspace/env_run/ws/datasets/COCO # if set, anno_path will be 'dataset_dir/anno_path' ================================================ FILE: configs/datasets/mcmot.yml ================================================ metric: MCMOT num_classes: 10 # using VisDrone2019 MOT dataset with 10 classes as default, you can modify it for your needs. # for MCMOT training TrainDataset: !MCMOTDataSet dataset_dir: dataset/mot image_lists: ['visdrone_mcmot.train'] data_fields: ['image', 'gt_bbox', 'gt_class', 'gt_ide'] label_list: label_list.txt # for MCMOT evaluation # If you want to change the MCMOT evaluation dataset, please modify 'data_root' EvalMOTDataset: !MOTImageFolder dataset_dir: dataset/mot data_root: visdrone_mcmot/images/val keep_ori_im: False # set True if save visualization images or video, or used in DeepSORT # for MCMOT video inference TestMOTDataset: !MOTImageFolder dataset_dir: dataset/mot keep_ori_im: True # set True if save visualization images or video ================================================ FILE: configs/datasets/mot.yml ================================================ metric: MOT num_classes: 1 # for MOT training TrainDataset: !MOTDataSet dataset_dir: dataset/mot image_lists: ['mot17.train', 'caltech.all', 'cuhksysu.train', 'prw.train', 'citypersons.train', 'eth.train'] data_fields: ['image', 'gt_bbox', 'gt_class', 'gt_ide'] # for MOT evaluation # If you want to change the MOT evaluation dataset, please modify 'data_root' EvalMOTDataset: !MOTImageFolder dataset_dir: dataset/mot data_root: MOT16/images/train keep_ori_im: False # set True if save visualization images or video, or used in DeepSORT # for MOT video inference TestMOTDataset: !MOTImageFolder dataset_dir: dataset/mot keep_ori_im: True # set True if save visualization images or video ================================================ FILE: configs/datasets/objects365_detection.yml ================================================ metric: COCO num_classes: 365 TrainDataset: !COCODataSet image_dir: train anno_path: annotations/zhiyuan_objv2_train.json dataset_dir: dataset/objects365 data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] EvalDataset: !COCODataSet image_dir: val anno_path: annotations/zhiyuan_objv2_val.json dataset_dir: dataset/objects365 allow_empty: true TestDataset: !ImageFolder anno_path: annotations/zhiyuan_objv2_val.json dataset_dir: dataset/objects365/ ================================================ FILE: configs/datasets/roadsign_voc.yml ================================================ metric: VOC map_type: integral num_classes: 4 TrainDataset: name: VOCDataSet dataset_dir: 
dataset/roadsign_voc anno_path: train.txt label_list: label_list.txt data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] EvalDataset: name: VOCDataSet dataset_dir: dataset/roadsign_voc anno_path: valid.txt label_list: label_list.txt data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] TestDataset: name: ImageFolder anno_path: dataset/roadsign_voc/label_list.txt ================================================ FILE: configs/datasets/sniper_coco_detection.yml ================================================ metric: SNIPERCOCO num_classes: 80 TrainDataset: !SniperCOCODataSet image_dir: train2017 anno_path: annotations/instances_train2017.json dataset_dir: dataset/coco data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] allow_empty: true is_trainset: true image_target_sizes: [2000, 1000] valid_box_ratio_ranges: [[-1, 0.1],[0.08, -1]] chip_target_size: 512 chip_target_stride: 200 use_neg_chip: false max_neg_num_per_im: 8 EvalDataset: !SniperCOCODataSet image_dir: val2017 anno_path: annotations/instances_val2017.json dataset_dir: dataset/coco data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] allow_empty: true is_trainset: false image_target_sizes: [2000, 1000] valid_box_ratio_ranges: [[-1, 0.1], [0.08, -1]] chip_target_size: 512 chip_target_stride: 200 max_per_img: -1 nms_thresh: 0.5 TestDataset: !SniperCOCODataSet image_dir: val2017 dataset_dir: dataset/coco is_trainset: false image_target_sizes: [2000, 1000] valid_box_ratio_ranges: [[-1, 0.1],[0.08, -1]] chip_target_size: 500 chip_target_stride: 200 max_per_img: -1 nms_thresh: 0.5 ================================================ FILE: configs/datasets/sniper_visdrone_detection.yml ================================================ metric: SNIPERCOCO num_classes: 9 TrainDataset: !SniperCOCODataSet image_dir: train anno_path: annotations/train.json dataset_dir: dataset/VisDrone2019_coco data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] allow_empty: true is_trainset: true image_target_sizes: [8145, 2742] valid_box_ratio_ranges: [[-1, 0.03142857142857144], [0.02333211853008726, -1]] chip_target_size: 1536 chip_target_stride: 1184 use_neg_chip: false max_neg_num_per_im: 8 EvalDataset: !SniperCOCODataSet image_dir: val anno_path: annotations/val.json dataset_dir: dataset/VisDrone2019_coco data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] allow_empty: true is_trainset: false image_target_sizes: [8145, 2742] valid_box_ratio_ranges: [[-1, 0.03142857142857144], [0.02333211853008726, -1]] chip_target_size: 1536 chip_target_stride: 1184 max_per_img: -1 nms_thresh: 0.5 TestDataset: !SniperCOCODataSet image_dir: val dataset_dir: dataset/VisDrone2019_coco is_trainset: false image_target_sizes: [8145, 2742] valid_box_ratio_ranges: [[-1, 0.03142857142857144], [0.02333211853008726, -1]] chip_target_size: 1536 chip_target_stride: 1184 max_per_img: -1 nms_thresh: 0.5 ================================================ FILE: configs/datasets/spine_coco.yml ================================================ metric: RBOX num_classes: 9 TrainDataset: !COCODataSet image_dir: images anno_path: annotations/train.json dataset_dir: dataset/spine_coco data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly'] EvalDataset: !COCODataSet image_dir: images anno_path: annotations/valid.json dataset_dir: dataset/spine_coco data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd', 'gt_poly'] TestDataset: !ImageFolder anno_path: annotations/valid.json dataset_dir: dataset/spine_coco ================================================ 
FILE: configs/datasets/visdrone_detection.yml ================================================ metric: COCO num_classes: 10 TrainDataset: !COCODataSet image_dir: VisDrone2019-DET-train anno_path: train.json dataset_dir: dataset/visdrone data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] EvalDataset: !COCODataSet image_dir: VisDrone2019-DET-val anno_path: val.json # image_dir: test_dev # anno_path: test_dev.json dataset_dir: dataset/visdrone TestDataset: !ImageFolder anno_path: val.json dataset_dir: dataset/visdrone ================================================ FILE: configs/datasets/voc.yml ================================================ metric: VOC map_type: 11point num_classes: 20 TrainDataset: name: VOCDataSet dataset_dir: dataset/voc anno_path: trainval.txt label_list: label_list.txt data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] EvalDataset: name: VOCDataSet dataset_dir: dataset/voc anno_path: test.txt label_list: label_list.txt data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] TestDataset: name: ImageFolder anno_path: dataset/voc/label_list.txt ================================================ FILE: configs/datasets/wider_face.yml ================================================ metric: WiderFace num_classes: 1 TrainDataset: !WIDERFaceDataSet dataset_dir: dataset/wider_face anno_path: wider_face_split/wider_face_train_bbx_gt.txt image_dir: WIDER_train/images data_fields: ['image', 'gt_bbox', 'gt_class'] EvalDataset: !WIDERFaceValDataset dataset_dir: dataset/wider_face image_dir: WIDER_val/images anno_path: wider_face_split/wider_face_val_bbx_gt.txt gt_mat_path: WIDER_val/ground_truth data_fields: ['image', 'gt_bbox', 'gt_class', 'ori_gt_bbox'] TestDataset: !ImageFolder use_default_label: true ================================================ FILE: configs/rtdetrv3/_base_/optimizer_6x.yml ================================================ epoch: 72 LearningRate: base_lr: 0.0004 schedulers: - !PiecewiseDecay gamma: 1.0 milestones: [100] use_warmup: true - !LinearWarmup start_factor: 0.001 steps: 2000 OptimizerBuilder: clip_grad_by_norm: 0.1 regularizer: false optimizer: type: AdamW weight_decay: 0.0001 ================================================ FILE: configs/rtdetrv3/_base_/rtdetr_reader.yml ================================================ worker_num: 4 TrainReader: sample_transforms: - Decode: {} - RandomDistort: {prob: 0.8} - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} - RandomCrop: {prob: 0.8} - RandomFlip: {} batch_transforms: - BatchRandomResize: {target_size: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800], random_size: True, random_interp: True, keep_ratio: False} - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} - NormalizeBox: {retain_origin_box: true} - BboxXYXY2XYWH: {} - Permute: {} - PadGT: {only_origin_box: true} batch_size: 16 shuffle: true drop_last: true collate_batch: false use_shared_memory: true EvalReader: sample_transforms: - Decode: {} - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2} - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} - Permute: {} batch_size: 16 shuffle: false drop_last: false TestReader: inputs_def: image_shape: [3, 640, 640] sample_transforms: - Decode: {} - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2} - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} - Permute: {} batch_size: 1 shuffle: false drop_last: false ================================================ FILE: 
configs/rtdetrv3/_base_/rtdetrv3_r50vd.yml ================================================ architecture: RTDETRV3 pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams norm_type: sync_bn use_ema: True ema_decay: 0.9999 ema_decay_type: "exponential" ema_filter_no_grad: True hidden_dim: 256 use_focal_loss: True eval_size: [640, 640] RTDETRV3: backbone: ResNet neck: HybridEncoder transformer: RTDETRTransformerv3 detr_head: DINOv3Head aux_o2m_head: PPYOLOEHead post_process: DETRPostProcess ResNet: # index 0 stands for res2 depth: 50 variant: d norm_type: bn freeze_at: 0 return_idx: [1, 2, 3] lr_mult_list: [0.1, 0.1, 0.1, 0.1] num_stages: 4 freeze_stem_only: True HybridEncoder: hidden_dim: 256 use_encoder_idx: [2] num_encoder_layers: 1 encoder_layer: name: TransformerLayer d_model: 256 nhead: 8 dim_feedforward: 1024 dropout: 0. activation: 'gelu' expansion: 1.0 RTDETRTransformerv3: num_queries: 300 position_embed_type: sine feat_strides: [8, 16, 32] num_levels: 3 nhead: 8 num_decoder_layers: 6 dim_feedforward: 1024 dropout: 0.0 activation: relu num_denoising: 100 label_noise_ratio: 0.5 box_noise_scale: 1.0 learnt_init_query: False num_noises: 0 num_noise_queries: [] num_noise_denoising: 100 DINOv3Head: o2m: 4 loss: name: DINOv3Loss loss_coeff: {class: 1, bbox: 5, giou: 2} aux_loss: True use_vfl: True matcher: name: HungarianMatcher matcher_coeff: {class: 2, bbox: 5, giou: 2} PPYOLOEHead: fpn_strides: [8, 16, 32] grid_cell_scale: 5.0 grid_cell_offset: 0.5 static_assigner_epoch: 30 use_varifocal_loss: True loss_weight: {class: 1.0, iou: 2.5, dfl: 0.5} static_assigner: name: ATSSAssigner topk: 9 assigner: name: TaskAlignedAssigner topk: 13 alpha: 1.0 beta: 6.0 nms: name: MultiClassNMS nms_top_k: 1000 keep_top_k: 300 score_threshold: 0.01 nms_threshold: 0.7 DETRPostProcess: num_top_queries: 300 ================================================ FILE: configs/rtdetrv3/rtdetrv3_r18vd_6x_coco.yml ================================================ _BASE_: [ '../datasets/coco_detection.yml', '../runtime.yml', '_base_/optimizer_6x.yml', '_base_/rtdetrv3_r50vd.yml', '_base_/rtdetr_reader.yml', ] weights: output/rtdetrv3_r18vd_6x_coco/model_final find_unused_parameters: True log_iter: 200 o2m_branch: True num_queries_o2m: 450 pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet18_vd_pretrained.pdparams RTDETRV3: backbone: ResNet neck: HybridEncoder transformer: RTDETRTransformerv3 detr_head: DINOv3Head aux_o2m_head: PPYOLOEHead post_process: DETRPostProcess ResNet: depth: 18 variant: d return_idx: [1, 2, 3] freeze_at: -1 freeze_norm: false norm_decay: 0. HybridEncoder: hidden_dim: 256 use_encoder_idx: [2] num_encoder_layers: 1 encoder_layer: name: TransformerLayer d_model: 256 nhead: 8 dim_feedforward: 1024 dropout: 0.
activation: 'gelu' expansion: 0.5 depth_mult: 1.0 RTDETRTransformerv3: eval_idx: -1 num_decoder_layers: 3 num_noises: 3 num_noise_queries: [300, 300, 300] num_noise_denoising: 100 learnt_init_query: False ================================================ FILE: configs/rtdetrv3/rtdetrv3_r18vd_6x_lvis.yml ================================================ _BASE_: [ '../datasets/lvis_detection.yml', '../runtime.yml', '_base_/optimizer_6x.yml', '_base_/rtdetrv3_r50vd.yml', '_base_/rtdetr_reader.yml', ] weights: output/rtdetrv3vd_r18_6x_lvis/model_final find_unused_parameters: True log_iter: 200 o2m_branch: True num_queries_o2m: 450 pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet18_vd_pretrained.pdparams RTDETRV3: backbone: ResNet neck: HybridEncoder transformer: RTDETRTransformerv3 detr_head: DINOv3Head aux_o2m_head: PPYOLOEHead post_process: DETRPostProcess ResNet: depth: 18 variant: d return_idx: [1, 2, 3] freeze_at: -1 freeze_norm: false norm_decay: 0. HybridEncoder: hidden_dim: 256 use_encoder_idx: [2] num_encoder_layers: 1 encoder_layer: name: TransformerLayer d_model: 256 nhead: 8 dim_feedforward: 1024 dropout: 0. activation: 'gelu' expansion: 0.5 depth_mult: 1.0 RTDETRTransformerv3: eval_idx: -1 num_decoder_layers: 3 num_noises: 2 num_noise_queries: [300, 300] num_noise_denoising: 100 learnt_init_query: False ================================================ FILE: configs/rtdetrv3/rtdetrv3_r34vd_6x_coco.yml ================================================ _BASE_: [ '../datasets/coco_detection.yml', '../runtime.yml', '_base_/optimizer_6x.yml', '_base_/rtdetrv3_r50vd.yml', '_base_/rtdetr_reader.yml', ] weights: output/rtdetrv3_r34vd_6x_coco/model_final find_unused_parameters: True log_iter: 200 o2m_branch: True num_queries_o2m: 450 pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/ResNet34_vd_pretrained.pdparams RTDETRV3: backbone: ResNet neck: HybridEncoder transformer: RTDETRTransformerv3 detr_head: DINOv3Head aux_o2m_head: PPYOLOEHead post_process: DETRPostProcess ResNet: depth: 34 variant: d return_idx: [1, 2, 3] freeze_at: -1 freeze_norm: false norm_decay: 0. HybridEncoder: hidden_dim: 256 use_encoder_idx: [2] num_encoder_layers: 1 encoder_layer: name: TransformerLayer d_model: 256 nhead: 8 dim_feedforward: 1024 dropout: 0. 
activation: 'gelu' expansion: 0.5 depth_mult: 1.0 RTDETRTransformerv3: eval_idx: -1 num_decoder_layers: 4 num_noises: 3 num_noise_queries: [300, 300, 300] num_noise_denoising: 100 learnt_init_query: False ================================================ FILE: configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml ================================================ _BASE_: [ '../datasets/coco_detection.yml', '../runtime.yml', '_base_/optimizer_6x.yml', '_base_/rtdetrv3_r50vd.yml', '_base_/rtdetr_reader.yml', ] weights: output/rtdetrv3_r50vd_6x_coco/model_final find_unused_parameters: True log_iter: 200 o2m_branch: True num_queries_o2m: 450 pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams RTDETRTransformerv3: eval_idx: -1 num_decoder_layers: 6 num_noises: 2 num_noise_queries: [300, 300] num_noise_denoising: 100 learnt_init_query: False ================================================ FILE: configs/rtdetrv3/rtdetrv3_r50vd_6x_lvis.yml ================================================ _BASE_: [ '../datasets/lvis_detection.yml', '../runtime.yml', '_base_/optimizer_6x.yml', '_base_/rtdetrv3_r50vd.yml', '_base_/rtdetr_reader.yml', ] weights: output/rtdetrv3_r50vd_6x_lvis/model_final find_unused_parameters: True log_iter: 200 snapshot_epoch: 2 o2m_branch: True num_queries_o2m: 450 pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams RTDETRTransformerv3: eval_idx: -1 num_decoder_layers: 6 num_noises: 1 num_noise_queries: [300] num_noise_denoising: 100 learnt_init_query: False ================================================ FILE: configs/runtime.yml ================================================ use_gpu: true use_xpu: false use_mlu: false use_npu: false log_iter: 20 save_dir: output snapshot_epoch: 1 print_flops: false print_params: false # Exporting the model export: post_process: True # Whether post-processing is included in the network when exporting the model. nms: True # Whether NMS is included in the network when exporting the model. benchmark: False # Used to test model performance; if set `True`, post-process and NMS will not be exported. fuse_conv_bn: False ================================================ FILE: dataset/coco/download_coco.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
import sys import os.path as osp import logging # add python path of PaddleDetection to sys.path parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) if parent_path not in sys.path: sys.path.append(parent_path) from ppdet.utils.download import download_dataset logging.basicConfig(level=logging.INFO) download_path = osp.split(osp.realpath(sys.argv[0]))[0] download_dataset(download_path, 'coco') ================================================ FILE: dataset/dota/.gitignore ================================================ ================================================ FILE: dataset/mot/gen_labels_MOT.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os.path as osp import os import numpy as np MOT_data = 'MOT16' # choose a data in ['MOT15', 'MOT16', 'MOT17', 'MOT20'] # or your custom data (prepare it following the 'docs/tutorials/PrepareMOTDataSet.md') def mkdirs(d): if not osp.exists(d): os.makedirs(d) seq_root = './{}/images/train'.format(MOT_data) label_root = './{}/labels_with_ids/train'.format(MOT_data) mkdirs(label_root) seqs = [s for s in os.listdir(seq_root)] tid_curr = 0 tid_last = -1 for seq in seqs: seq_info = open(osp.join(seq_root, seq, 'seqinfo.ini')).read() seq_width = int(seq_info[seq_info.find('imWidth=') + 8:seq_info.find( '\nimHeight')]) seq_height = int(seq_info[seq_info.find('imHeight=') + 9:seq_info.find( '\nimExt')]) gt_txt = osp.join(seq_root, seq, 'gt', 'gt.txt') gt = np.loadtxt(gt_txt, dtype=np.float64, delimiter=',') seq_label_root = osp.join(label_root, seq, 'img1') mkdirs(seq_label_root) for fid, tid, x, y, w, h, mark, label, _ in gt: if mark == 0 or not label == 1: continue fid = int(fid) tid = int(tid) if not tid == tid_last: tid_curr += 1 tid_last = tid x += w / 2 y += h / 2 label_fpath = osp.join(seq_label_root, '{:06d}.txt'.format(fid)) label_str = '0 {:d} {:.6f} {:.6f} {:.6f} {:.6f}\n'.format( tid_curr, x / seq_width, y / seq_height, w / seq_width, h / seq_height) with open(label_fpath, 'a') as f: f.write(label_str) ================================================ FILE: dataset/roadsign_voc/download_roadsign_voc.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import sys import os.path as osp import logging # add python path of PaddleDetection to sys.path parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) if parent_path not in sys.path: sys.path.append(parent_path) from ppdet.utils.download import download_dataset logging.basicConfig(level=logging.INFO) download_path = osp.split(osp.realpath(sys.argv[0]))[0] download_dataset(download_path, 'roadsign_voc') ================================================ FILE: dataset/roadsign_voc/label_list.txt ================================================ speedlimit crosswalk trafficlight stop ================================================ FILE: dataset/spine_coco/download_spine_coco.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import sys import os.path as osp import logging # add python path of PaddleDetection to sys.path parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) if parent_path not in sys.path: sys.path.append(parent_path) from ppdet.utils.download import download_dataset logging.basicConfig(level=logging.INFO) download_path = osp.split(osp.realpath(sys.argv[0]))[0] download_dataset(download_path, 'spine_coco') ================================================ FILE: dataset/voc/create_list.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import sys import os.path as osp import logging # add python path of PaddleDetection to sys.path parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) if parent_path not in sys.path: sys.path.append(parent_path) from ppdet.utils.download import create_voc_list logging.basicConfig(level=logging.INFO) voc_path = osp.split(osp.realpath(sys.argv[0]))[0] create_voc_list(voc_path) ================================================ FILE: dataset/voc/download_voc.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. import sys import os.path as osp import logging # add python path of PaddleDetection to sys.path parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) if parent_path not in sys.path: sys.path.append(parent_path) from ppdet.utils.download import download_dataset logging.basicConfig(level=logging.INFO) download_path = osp.split(osp.realpath(sys.argv[0]))[0] download_dataset(download_path, 'voc') ================================================ FILE: dataset/voc/label_list.txt ================================================ aeroplane bicycle bird boat bottle bus car cat chair cow diningtable dog horse motorbike person pottedplant sheep sofa train tvmonitor ================================================ FILE: dataset/wider_face/download_wider_face.sh ================================================ # All rights `PaddleDetection` reserved # References: # @inproceedings{yang2016wider, # Author = {Yang, Shuo and Luo, Ping and Loy, Chen Change and Tang, Xiaoou}, # Booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, # Title = {WIDER FACE: A Face Detection Benchmark}, # Year = {2016}} DIR="$( cd "$(dirname "$0")" ; pwd -P )" cd "$DIR" # Download the data. echo "Downloading..." wget https://dataset.bj.bcebos.com/wider_face/WIDER_train.zip wget https://dataset.bj.bcebos.com/wider_face/WIDER_val.zip wget https://dataset.bj.bcebos.com/wider_face/wider_face_split.zip # Extract the data. echo "Extracting..." unzip -q WIDER_train.zip unzip -q WIDER_val.zip unzip -q wider_face_split.zip ================================================ FILE: ppdet/__init__.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import (core, data, engine, modeling, model_zoo, optimizer, metrics, utils, slim) try: from .version import full_version as __version__ from .version import commit as __git_commit__ except ImportError: import sys sys.stderr.write("Warning: import ppdet from source directory " \ "without installing, run 'python setup.py install' to " \ "install ppdet firstly\n") ================================================ FILE: ppdet/core/__init__.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . 
import config ================================================ FILE: ppdet/core/config/__init__.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppdet/core/config/schema.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import print_function from __future__ import division import inspect import importlib import re try: from docstring_parser import parse as doc_parse except Exception: def doc_parse(*args): pass try: from typeguard import check_type except Exception: def check_type(*args): pass __all__ = ['SchemaValue', 'SchemaDict', 'SharedConfig', 'extract_schema'] class SchemaValue(object): def __init__(self, name, doc='', type=None): super(SchemaValue, self).__init__() self.name = name self.doc = doc self.type = type def set_default(self, value): self.default = value def has_default(self): return hasattr(self, 'default') class SchemaDict(dict): def __init__(self, **kwargs): super(SchemaDict, self).__init__() self.schema = {} self.strict = False self.doc = "" self.update(kwargs) def __setitem__(self, key, value): # XXX also update regular dict to SchemaDict?? 
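        # if both the existing value and the incoming one are dicts, merge the
        # incoming dict into the existing SchemaDict instead of replacing it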
        if isinstance(value, dict) and key in self and isinstance(self[key],
                                                                   SchemaDict):
            self[key].update(value)
        else:
            super(SchemaDict, self).__setitem__(key, value)

    def __missing__(self, key):
        if self.has_default(key):
            return self.schema[key].default
        elif key in self.schema:
            return self.schema[key]
        else:
            raise KeyError(key)

    def copy(self):
        newone = SchemaDict()
        newone.__dict__.update(self.__dict__)
        newone.update(self)
        return newone

    def set_schema(self, key, value):
        assert isinstance(value, SchemaValue)
        self.schema[key] = value

    def set_strict(self, strict):
        self.strict = strict

    def has_default(self, key):
        return key in self.schema and self.schema[key].has_default()

    def is_default(self, key):
        if not self.has_default(key):
            return False
        if hasattr(self[key], '__dict__'):
            return True
        else:
            return key not in self or self[key] == self.schema[key].default

    def find_default_keys(self):
        return [
            k for k in list(self.keys()) + list(self.schema.keys())
            if self.is_default(k)
        ]

    def mandatory(self):
        return any([k for k in self.schema.keys() if not self.has_default(k)])

    def find_missing_keys(self):
        missing = [
            k for k in self.schema.keys()
            if k not in self and not self.has_default(k)
        ]
        placeholders = [k for k in self if self[k] in ('<missing>', '<value>')]
        return missing + placeholders

    def find_extra_keys(self):
        return list(set(self.keys()) - set(self.schema.keys()))

    def find_mismatch_keys(self):
        mismatch_keys = []
        for arg in self.schema.values():
            if arg.type is not None:
                try:
                    check_type("{}.{}".format(self.name, arg.name),
                               self[arg.name], arg.type)
                except Exception:
                    mismatch_keys.append(arg.name)
        return mismatch_keys

    def validate(self):
        missing_keys = self.find_missing_keys()
        if missing_keys:
            raise ValueError("Missing param for class<{}>: {}".format(
                self.name, ", ".join(missing_keys)))
        extra_keys = self.find_extra_keys()
        if extra_keys and self.strict:
            raise ValueError("Extraneous param for class<{}>: {}".format(
                self.name, ", ".join(extra_keys)))
        mismatch_keys = self.find_mismatch_keys()
        if mismatch_keys:
            raise TypeError("Wrong param type for class<{}>: {}".format(
                self.name, ", ".join(mismatch_keys)))


class SharedConfig(object):
    """
    Representation class for `__shared__` annotations, which work as follows:

    - if `key` is set for the module in config file, its value will take
      precedence
    - if `key` is not set for the module but present in the config file, its
      value will be used
    - otherwise, use the provided `default_value` as fallback

    Args:
        key: config[key] will be injected
        default_value: fallback value
    """

    def __init__(self, key, default_value=None):
        super(SharedConfig, self).__init__()
        self.key = key
        self.default_value = default_value


def extract_schema(cls):
    """
    Extract schema from a given class

    Args:
        cls (type): Class from which to extract.

    Returns:
        schema (SchemaDict): Extracted schema.
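
    Example (an illustrative sketch with a hypothetical class, not repo code):

        class MyHead(object):
            __shared__ = ['num_classes']

            def __init__(self, num_classes=80, in_channels=256):
                pass

        schema = extract_schema(MyHead)
        # the 'num_classes' entry defaults to SharedConfig('num_classes', 80),
        # the 'in_channels' entry defaults to 256, and schema.strict is True
        # because __init__ accepts no **kwargs.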
""" ctor = cls.__init__ # python 2 compatibility if hasattr(inspect, 'getfullargspec'): argspec = inspect.getfullargspec(ctor) annotations = argspec.annotations has_kwargs = argspec.varkw is not None else: argspec = inspect.getfullargspec(ctor) # python 2 type hinting workaround, see pep-3107 # however, since `typeguard` does not support python 2, type checking # is still python 3 only for now annotations = getattr(ctor, '__annotations__', {}) has_kwargs = argspec.varkw is not None names = [arg for arg in argspec.args if arg != 'self'] defaults = argspec.defaults num_defaults = argspec.defaults is not None and len(argspec.defaults) or 0 num_required = len(names) - num_defaults docs = cls.__doc__ if docs is None and getattr(cls, '__category__', None) == 'op': docs = cls.__call__.__doc__ try: docstring = doc_parse(docs) except Exception: docstring = None if docstring is None: comments = {} else: comments = {} for p in docstring.params: match_obj = re.match('^([a-zA-Z_]+[a-zA-Z_0-9]*).*', p.arg_name) if match_obj is not None: comments[match_obj.group(1)] = p.description schema = SchemaDict() schema.name = cls.__name__ schema.doc = "" if docs is not None: start_pos = docs[0] == '\n' and 1 or 0 schema.doc = docs[start_pos:].split("\n")[0].strip() # XXX handle paddle's weird doc convention if '**' == schema.doc[:2] and '**' == schema.doc[-2:]: schema.doc = schema.doc[2:-2].strip() schema.category = hasattr(cls, '__category__') and getattr( cls, '__category__') or 'module' schema.strict = not has_kwargs schema.pymodule = importlib.import_module(cls.__module__) schema.inject = getattr(cls, '__inject__', []) schema.shared = getattr(cls, '__shared__', []) for idx, name in enumerate(names): comment = name in comments and comments[name] or name if name in schema.inject: type_ = None else: type_ = name in annotations and annotations[name] or None value_schema = SchemaValue(name, comment, type_) if name in schema.shared: assert idx >= num_required, "shared config must have default value" default = defaults[idx - num_required] value_schema.set_default(SharedConfig(name, default)) elif idx >= num_required: default = defaults[idx - num_required] value_schema.set_default(default) schema.set_schema(name, value_schema) return schema ================================================ FILE: ppdet/core/config/yaml_helpers.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import importlib import inspect import yaml from .schema import SharedConfig __all__ = ['serializable', 'Callable'] def represent_dictionary_order(self, dict_data): return self.represent_mapping('tag:yaml.org,2002:map', dict_data.items()) def setup_orderdict(): from collections import OrderedDict yaml.add_representer(OrderedDict, represent_dictionary_order) def _make_python_constructor(cls): def python_constructor(loader, node): if isinstance(node, yaml.SequenceNode): args = loader.construct_sequence(node, deep=True) return cls(*args) else: kwargs = loader.construct_mapping(node, deep=True) try: return cls(**kwargs) except Exception as ex: print("Error when construct {} instance from yaml config". format(cls.__name__)) raise ex return python_constructor def _make_python_representer(cls): # python 2 compatibility if hasattr(inspect, 'getfullargspec'): argspec = inspect.getfullargspec(cls) else: argspec = inspect.getfullargspec(cls.__init__) argnames = [arg for arg in argspec.args if arg != 'self'] def python_representer(dumper, obj): if argnames: data = {name: getattr(obj, name) for name in argnames} else: data = obj.__dict__ if '_id' in data: del data['_id'] return dumper.represent_mapping(u'!{}'.format(cls.__name__), data) return python_representer def serializable(cls): """ Add loader and dumper for given class, which must be "trivially serializable" Args: cls: class to be serialized Returns: cls """ yaml.add_constructor(u'!{}'.format(cls.__name__), _make_python_constructor(cls)) yaml.add_representer(cls, _make_python_representer(cls)) return cls yaml.add_representer(SharedConfig, lambda d, o: d.represent_data(o.default_value)) @serializable class Callable(object): """ Helper to be used in Yaml for creating arbitrary class objects Args: full_type (str): the full module path to target function """ def __init__(self, full_type, args=[], kwargs={}): super(Callable, self).__init__() self.full_type = full_type self.args = args self.kwargs = kwargs def __call__(self): if '.' in self.full_type: idx = self.full_type.rfind('.') module = importlib.import_module(self.full_type[:idx]) func_name = self.full_type[idx + 1:] else: try: module = importlib.import_module('builtins') except Exception: module = importlib.import_module('__builtin__') func_name = self.full_type func = getattr(module, func_name) return func(*self.args, **self.kwargs) ================================================ FILE: ppdet/core/workspace.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
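# Illustrative usage sketch (hypothetical module, not repo code): classes
# decorated with @register (defined below) are entered into global_config by
# name and can then be instantiated with create(), with their kwargs resolved
# from the loaded YAML config:
#
#   from ppdet.core.workspace import register, create, load_config
#
#   @register
#   class MyNeck(object):
#       __shared__ = ['num_classes']
#
#       def __init__(self, num_classes=80, out_channels=256):
#           pass
#
#   load_config('configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml')
#   neck = create('MyNeck')  # out_channels from YAML if set, else 256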
from __future__ import absolute_import from __future__ import print_function from __future__ import division import importlib import os import sys import yaml import collections try: collectionsAbc = collections.abc except AttributeError: collectionsAbc = collections from .config.schema import SchemaDict, SharedConfig, extract_schema from .config.yaml_helpers import serializable __all__ = [ 'global_config', 'load_config', 'merge_config', 'get_registered_modules', 'create', 'register', 'serializable', 'dump_value', ] def dump_value(value): # XXX this is hackish, but collections.abc is not available in python 2 if hasattr(value, '__dict__') or isinstance(value, (dict, tuple, list)): value = yaml.dump(value, default_flow_style=True) value = value.replace('\n', '') value = value.replace('...', '') return "'{}'".format(value) else: # primitive types return str(value) class AttrDict(dict): """Single level attribute dict, NOT recursive""" def __init__(self, **kwargs): super(AttrDict, self).__init__() super(AttrDict, self).update(kwargs) def __getattr__(self, key): if key in self: return self[key] raise AttributeError("object has no attribute '{}'".format(key)) def __setattr__(self, key, value): self[key] = value def copy(self): new_dict = AttrDict() for k, v in self.items(): new_dict.update({k: v}) return new_dict global_config = AttrDict() BASE_KEY = '_BASE_' # parse and load _BASE_ recursively def _load_config_with_base(file_path): with open(file_path) as f: file_cfg = yaml.load(f, Loader=yaml.Loader) # NOTE: cfgs outside have higher priority than cfgs in _BASE_ if BASE_KEY in file_cfg: all_base_cfg = AttrDict() base_ymls = list(file_cfg[BASE_KEY]) for base_yml in base_ymls: if base_yml.startswith("~"): base_yml = os.path.expanduser(base_yml) if not base_yml.startswith('/'): base_yml = os.path.join(os.path.dirname(file_path), base_yml) with open(base_yml) as f: base_cfg = _load_config_with_base(base_yml) all_base_cfg = merge_config(base_cfg, all_base_cfg) del file_cfg[BASE_KEY] return merge_config(file_cfg, all_base_cfg) return file_cfg def load_config(file_path): """ Load config from file. Args: file_path (str): Path of the config file to be loaded. Returns: global config """ _, ext = os.path.splitext(file_path) assert ext in ['.yml', '.yaml'], "only support yaml files for now" # load config from file and merge into global config cfg = _load_config_with_base(file_path) cfg['filename'] = os.path.splitext(os.path.split(file_path)[-1])[0] merge_config(cfg) return global_config def dict_merge(dct, merge_dct): """ Recursive dict merge. Inspired by :meth:``dict.update()``, instead of updating only top-level keys, dict_merge recurses down into dicts nested to an arbitrary depth, updating keys. The ``merge_dct`` is merged into ``dct``. Args: dct: dict onto which the merge is executed merge_dct: dct merged into dct Returns: dct """ for k, v in merge_dct.items(): if (k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], collectionsAbc.Mapping)): dict_merge(dct[k], merge_dct[k]) else: dct[k] = merge_dct[k] return dct def merge_config(config, another_cfg=None): """ Merge config into global config or another_cfg. Args: config (dict): Config to be merged. 
    Returns: global config
    """
    global global_config
    dct = another_cfg or global_config
    return dict_merge(dct, config)


def get_registered_modules():
    return {k: v for k, v in global_config.items() if isinstance(v, SchemaDict)}


def make_partial(cls):
    op_module = importlib.import_module(cls.__op__.__module__)
    op = getattr(op_module, cls.__op__.__name__)
    cls.__category__ = getattr(cls, '__category__', None) or 'op'

    def partial_apply(self, *args, **kwargs):
        kwargs_ = self.__dict__.copy()
        kwargs_.update(kwargs)
        return op(*args, **kwargs_)

    if getattr(cls, '__append_doc__', True):  # XXX should default to True?
        if sys.version_info[0] > 2:
            cls.__doc__ = "Wrapper for `{}` OP".format(op.__name__)
            cls.__init__.__doc__ = op.__doc__
            cls.__call__ = partial_apply
            cls.__call__.__doc__ = op.__doc__
        else:
            # XXX work around for python 2
            partial_apply.__doc__ = op.__doc__
            cls.__call__ = partial_apply
    return cls


def register(cls):
    """
    Register a given module class.

    Args:
        cls (type): Module class to be registered.

    Returns: cls
    """
    if cls.__name__ in global_config:
        raise ValueError("Module class already registered: {}".format(
            cls.__name__))
    if hasattr(cls, '__op__'):
        cls = make_partial(cls)
    global_config[cls.__name__] = extract_schema(cls)
    return cls


def create(cls_or_name, **kwargs):
    """
    Create an instance of given module class.

    Args:
        cls_or_name (type or str): Class of which to create instance.

    Returns: instance of type `cls_or_name`
    """
    assert type(cls_or_name) in [type, str
                                 ], "should be a class or name of a class"
    name = type(cls_or_name) == str and cls_or_name or cls_or_name.__name__
    if name in global_config:
        if isinstance(global_config[name], SchemaDict):
            pass
        elif hasattr(global_config[name], "__dict__"):
            # support instance return directly
            return global_config[name]
        else:
            raise ValueError("The module {} is not registered".format(name))
    else:
        raise ValueError("The module {} is not registered".format(name))

    config = global_config[name]
    cls = getattr(config.pymodule, name)
    cls_kwargs = {}
    cls_kwargs.update(global_config[name])

    # parse `shared` annotation of registered modules
    if getattr(config, 'shared', None):
        for k in config.shared:
            target_key = config[k]
            shared_conf = config.schema[k].default
            assert isinstance(shared_conf, SharedConfig)
            if target_key is not None and not isinstance(target_key,
                                                         SharedConfig):
                continue  # value is given for the module
            elif shared_conf.key in global_config:
                # `key` is present in config
                cls_kwargs[k] = global_config[shared_conf.key]
            else:
                cls_kwargs[k] = shared_conf.default_value

    # parse `inject` annotation of registered modules
    if getattr(cls, 'from_config', None):
        cls_kwargs.update(cls.from_config(config, **kwargs))

    if getattr(config, 'inject', None):
        for k in config.inject:
            target_key = config[k]
            # optional dependency
            if target_key is None:
                continue

            if isinstance(target_key, dict) or hasattr(target_key, '__dict__'):
                if 'name' not in target_key.keys():
                    continue
                inject_name = str(target_key['name'])
                if inject_name not in global_config:
                    raise ValueError(
                        "Missing injection name {}, check its name in the cfg file".
format(k)) target = global_config[inject_name] for i, v in target_key.items(): if i == 'name': continue target[i] = v if isinstance(target, SchemaDict): cls_kwargs[k] = create(inject_name) elif isinstance(target_key, str): if target_key not in global_config: raise ValueError("Missing injection config:", target_key) target = global_config[target_key] if isinstance(target, SchemaDict): cls_kwargs[k] = create(target_key) elif hasattr(target, '__dict__'): # serialized object cls_kwargs[k] = target else: raise ValueError("Unsupported injection type:", target_key) # prevent modification of global config values of reference types # (e.g., list, dict) from within the created module instances #kwargs = copy.deepcopy(kwargs) return cls(**cls_kwargs) ================================================ FILE: ppdet/data/__init__.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import source from . import transform from . import reader from .source import * from .transform import * from .reader import * ================================================ FILE: ppdet/data/crop_utils/__init__.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppdet/data/crop_utils/annotation_cropper.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
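# Illustrative usage sketch (the parameter values below mirror the examples
# given in the __init__ docstring): AnnoCropper converts full-image records
# into chip-level records for SNIPER-style multi-scale training.
#
#   cropper = AnnoCropper(
#       image_target_sizes=[2000, 1000],
#       valid_box_ratio_ranges=[[-1, 0.1], [0.08, -1]],
#       chip_target_size=500,
#       chip_target_stride=200)
#   chip_records = cropper.crop_anno_records(coco_records)
#   infer_chips = cropper.crop_infer_anno_records(coco_records)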
import copy
import math
import random
import numpy as np
from copy import deepcopy
from typing import List, Tuple
from collections import defaultdict

from .chip_box_utils import nms, transform_chip_boxes2image_boxes
from .chip_box_utils import find_chips_to_cover_overlaped_boxes
from .chip_box_utils import transform_chip_box
from .chip_box_utils import intersection_over_box


class AnnoCropper(object):
    def __init__(self,
                 image_target_sizes: List[int],
                 valid_box_ratio_ranges: List[List[float]],
                 chip_target_size: int,
                 chip_target_stride: int,
                 use_neg_chip: bool=False,
                 max_neg_num_per_im: int=8,
                 max_per_img: int=-1,
                 nms_thresh: float=0.5):
        """
        Generate chips by chip_target_size and chip_target_stride.
        These two parameters work just like kernel_size and stride in a CNN.

        Each image has its raw size; after resizing it gets its target size.
        The resizing scale = target_size / raw_size, and the same scale
        applies to the chips of the image.
        box_ratio = box_raw_size / image_raw_size
                  = box_target_size / image_target_size
        The 'size' mentioned above is the length of the long side of an
        image, box or chip.

        :param image_target_sizes: e.g. [2000, 1000]
        :param valid_box_ratio_ranges: e.g. [[-1, 0.1], [0.08, -1]]
        :param chip_target_size: e.g. 500
        :param chip_target_stride: e.g. 200
        """
        self.target_sizes = image_target_sizes
        self.valid_box_ratio_ranges = valid_box_ratio_ranges
        assert len(self.target_sizes) == len(self.valid_box_ratio_ranges)
        self.scale_num = len(self.target_sizes)
        self.chip_target_size = chip_target_size  # is target size
        self.chip_target_stride = chip_target_stride  # is target stride
        self.use_neg_chip = use_neg_chip
        self.max_neg_num_per_im = max_neg_num_per_im
        self.max_per_img = max_per_img
        self.nms_thresh = nms_thresh

    def crop_anno_records(self, records: List[dict]):
        """
        The main logic:
        # foreach record(image):
        #   foreach scale:
        #     1. generate chips by chip size and stride for each scale
        #     2. get pos chips
        #        - validate boxes: current scale; h, w >= 1
        #        - find pos chips greedily by valid gt boxes in each scale
        #        - for every valid gt box, find its corresponding pos chips
        #          in each scale
        #     3. get neg chips
        #        - if given proposals, find neg boxes in them which are not
        #          in pos chips
        #        - if we got neg boxes in the last step, find neg chips and
        #          assign the neg boxes to the neg chips, as in step 2
        #     4. sample neg chips if there are too many for one image
        # transform this image-scale annotations to chip (pos chips & neg
        # chips) annotations

        :param records: standard coco_record but with extra key
            `proposals` (Px4), which are predicted by the stage1 model and
            may contain neg boxes.
        :return: new_records, list of dict like
        {
            'im_file': 'fake_image1.jpg',
            'im_id': np.array([1]),   # new _global_chip_id as im_id
            'h': h,                   # chip height
            'w': w,                   # chip width
            'is_crowd': is_crowd,     # Nx1 -> Mx1
            'gt_class': gt_class,     # Nx1 -> Mx1
            'gt_bbox': gt_bbox,       # Nx4 -> Mx4, 4 represents [x1,y1,x2,y2]
            'gt_poly': gt_poly,       # [None]xN -> [None]xM
            'chip': [x1, y1, x2, y2]  # added
        }

        Attention (coordinate diagram omitted; the x axis points right and
        the y axis points down): if we use [x1, y1, x2, y2] to represent
        boxes or chips, (x1, y1) is the top-left point, which is inside the
        box, but (x2, y2) is the bottom-right point, which is not in the box.
        So x1 is in [0, w-1], x2 in [1, w], y1 in [0, h-1], y2 in [1, h].
        You can use x2 - x1 to get the width, and image[y1:y2, x1:x2] to get
        the box area.
""" self.chip_records = [] self._global_chip_id = 1 for r in records: self._cur_im_pos_chips = [ ] # element: (chip, boxes_idx), chip is [x1, y1, x2, y2], boxes_ids is List[int] self._cur_im_neg_chips = [] # element: (chip, neg_box_num) for scale_i in range(self.scale_num): self._get_current_scale_parameters(scale_i, r) # Cx4 chips = self._create_chips(r['h'], r['w'], self._cur_scale) # # dict: chipid->[box_id, ...] pos_chip2boxes_idx = self._get_valid_boxes_and_pos_chips( r['gt_bbox'], chips) # dict: chipid->neg_box_num neg_chip2box_num = self._get_neg_boxes_and_chips( chips, list(pos_chip2boxes_idx.keys()), r.get('proposals', None)) self._add_to_cur_im_chips(chips, pos_chip2boxes_idx, neg_chip2box_num) cur_image_records = self._trans_all_chips2annotations(r) self.chip_records.extend(cur_image_records) return self.chip_records def _add_to_cur_im_chips(self, chips, pos_chip2boxes_idx, neg_chip2box_num): for pos_chipid, boxes_idx in pos_chip2boxes_idx.items(): chip = np.array(chips[pos_chipid]) # copy chips slice self._cur_im_pos_chips.append((chip, boxes_idx)) if neg_chip2box_num is None: return for neg_chipid, neg_box_num in neg_chip2box_num.items(): chip = np.array(chips[neg_chipid]) self._cur_im_neg_chips.append((chip, neg_box_num)) def _trans_all_chips2annotations(self, r): gt_bbox = r['gt_bbox'] im_file = r['im_file'] is_crowd = r['is_crowd'] gt_class = r['gt_class'] # gt_poly = r['gt_poly'] # [None]xN # remaining keys: im_id, h, w chip_records = self._trans_pos_chips2annotations(im_file, gt_bbox, is_crowd, gt_class) if not self.use_neg_chip: return chip_records sampled_neg_chips = self._sample_neg_chips() neg_chip_records = self._trans_neg_chips2annotations(im_file, sampled_neg_chips) chip_records.extend(neg_chip_records) return chip_records def _trans_pos_chips2annotations(self, im_file, gt_bbox, is_crowd, gt_class): chip_records = [] for chip, boxes_idx in self._cur_im_pos_chips: chip_bbox, final_boxes_idx = transform_chip_box(gt_bbox, boxes_idx, chip) x1, y1, x2, y2 = chip chip_h = y2 - y1 chip_w = x2 - x1 rec = { 'im_file': im_file, 'im_id': np.array([self._global_chip_id]), 'h': chip_h, 'w': chip_w, 'gt_bbox': chip_bbox, 'is_crowd': is_crowd[final_boxes_idx].copy(), 'gt_class': gt_class[final_boxes_idx].copy(), # 'gt_poly': [None] * len(final_boxes_idx), 'chip': chip } self._global_chip_id += 1 chip_records.append(rec) return chip_records def _sample_neg_chips(self): pos_num = len(self._cur_im_pos_chips) neg_num = len(self._cur_im_neg_chips) sample_num = min(pos_num + 2, self.max_neg_num_per_im) assert sample_num >= 1 if neg_num <= sample_num: return self._cur_im_neg_chips candidate_num = int(sample_num * 1.5) candidate_neg_chips = sorted( self._cur_im_neg_chips, key=lambda x: -x[1])[:candidate_num] random.shuffle(candidate_neg_chips) sampled_neg_chips = candidate_neg_chips[:sample_num] return sampled_neg_chips def _trans_neg_chips2annotations(self, im_file: str, sampled_neg_chips: List[Tuple]): chip_records = [] for chip, neg_box_num in sampled_neg_chips: x1, y1, x2, y2 = chip chip_h = y2 - y1 chip_w = x2 - x1 rec = { 'im_file': im_file, 'im_id': np.array([self._global_chip_id]), 'h': chip_h, 'w': chip_w, 'gt_bbox': np.zeros( (0, 4), dtype=np.float32), 'is_crowd': np.zeros( (0, 1), dtype=np.int32), 'gt_class': np.zeros( (0, 1), dtype=np.int32), # 'gt_poly': [], 'chip': chip } self._global_chip_id += 1 chip_records.append(rec) return chip_records def _get_current_scale_parameters(self, scale_i, r): im_size = max(r['h'], r['w']) im_target_size = self.target_sizes[scale_i] 
        self._cur_im_size, self._cur_im_target_size = im_size, im_target_size
        self._cur_scale = self._get_current_scale(im_target_size, im_size)
        self._cur_valid_ratio_range = self.valid_box_ratio_ranges[scale_i]

    def _get_current_scale(self, im_target_size, im_size):
        return im_target_size / im_size

    def _create_chips(self, h: int, w: int, scale: float):
        """
        Generate chips by chip_target_size and chip_target_stride.
        These two parameters work just like kernel_size and stride in a CNN.
        :return: chips, Cx4, xy in raw size dimension
        """
        chip_size = self.chip_target_size  # omit target for simplicity
        stride = self.chip_target_stride
        width = int(scale * w)
        height = int(scale * h)
        min_chip_location_diff = 20  # in target size

        assert chip_size >= stride
        chip_overlap = chip_size - stride
        if (width - chip_overlap) % stride > min_chip_location_diff:
            # the leftover that a full stride cannot cover is large, keep it
            w_steps = max(1, int(math.ceil((width - chip_overlap) / stride)))
        else:
            # the leftover that a full stride cannot cover is small, drop it
            w_steps = max(1, int(math.floor((width - chip_overlap) / stride)))
        if (height - chip_overlap) % stride > min_chip_location_diff:
            h_steps = max(1, int(math.ceil((height - chip_overlap) / stride)))
        else:
            h_steps = max(1, int(math.floor((height - chip_overlap) / stride)))

        chips = list()
        for j in range(h_steps):
            for i in range(w_steps):
                x1 = i * stride
                y1 = j * stride
                x2 = min(x1 + chip_size, width)
                y2 = min(y1 + chip_size, height)
                chips.append([x1, y1, x2, y2])

        # check chip size
        for item in chips:
            if item[2] - item[0] > chip_size * 1.1 or item[3] - item[
                    1] > chip_size * 1.1:
                raise ValueError(item)
        chips = np.array(chips, dtype=np.float32)

        raw_size_chips = chips / scale
        return raw_size_chips

    def _get_valid_boxes_and_pos_chips(self, gt_bbox, chips):
        valid_ratio_range = self._cur_valid_ratio_range
        im_size = self._cur_im_size
        scale = self._cur_scale
        # Nx4 N
        valid_boxes, valid_boxes_idx = self._validate_boxes(
            valid_ratio_range, im_size, gt_bbox, scale)
        # dict: chipid->[box_id, ...]
        pos_chip2boxes_idx = self._find_pos_chips(chips, valid_boxes,
                                                  valid_boxes_idx)
        return pos_chip2boxes_idx

    def _validate_boxes(self,
                        valid_ratio_range: List[float],
                        im_size: int,
                        gt_boxes: 'np.array of Nx4',
                        scale: float):
        """
        :return: valid_boxes: Nx4, valid_boxes_idx: N
        """
        ws = (gt_boxes[:, 2] - gt_boxes[:, 0]).astype(np.int32)
        hs = (gt_boxes[:, 3] - gt_boxes[:, 1]).astype(np.int32)
        maxs = np.maximum(ws, hs)
        box_ratio = maxs / im_size
        mins = np.minimum(ws, hs)
        target_mins = mins * scale

        low = valid_ratio_range[0] if valid_ratio_range[0] > 0 else 0
        high = valid_ratio_range[1] if valid_ratio_range[1] > 0 else np.finfo(
            np.float32).max
        valid_boxes_idx = np.nonzero((low <= box_ratio) & (box_ratio < high) &
                                     (target_mins >= 2))[0]
        valid_boxes = gt_boxes[valid_boxes_idx]
        return valid_boxes, valid_boxes_idx

    def _find_pos_chips(self, chips: 'Cx4', valid_boxes: 'Bx4',
                        valid_boxes_idx: 'B'):
        """
        :return: pos_chip2boxes_idx, dict: chipid->[box_id, ...]
        """
        iob = intersection_over_box(chips, valid_boxes)  # overlap, CxB

        iob_threshold_to_find_chips = 1.
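        # a threshold of 1.0 marks a chip as positive only if it fully
        # contains at least one valid gt box (intersection over box == 1)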
pos_chip_ids, _ = self._find_chips_to_cover_overlaped_boxes( iob, iob_threshold_to_find_chips) pos_chip_ids = set(pos_chip_ids) iob_threshold_to_assign_box = 0.5 pos_chip2boxes_idx = self._assign_boxes_to_pos_chips( iob, iob_threshold_to_assign_box, pos_chip_ids, valid_boxes_idx) return pos_chip2boxes_idx def _find_chips_to_cover_overlaped_boxes(self, iob, overlap_threshold): return find_chips_to_cover_overlaped_boxes(iob, overlap_threshold) def _assign_boxes_to_pos_chips(self, iob, overlap_threshold, pos_chip_ids, valid_boxes_idx): chip_ids, box_ids = np.nonzero(iob >= overlap_threshold) pos_chip2boxes_idx = defaultdict(list) for chip_id, box_id in zip(chip_ids, box_ids): if chip_id not in pos_chip_ids: continue raw_gt_box_idx = valid_boxes_idx[box_id] pos_chip2boxes_idx[chip_id].append(raw_gt_box_idx) return pos_chip2boxes_idx def _get_neg_boxes_and_chips(self, chips: 'Cx4', pos_chip_ids: 'D', proposals: 'Px4'): """ :param chips: :param pos_chip_ids: :param proposals: :return: neg_chip2box_num, None or dict: chipid->neg_box_num """ if not self.use_neg_chip: return None # train proposals maybe None if proposals is None or len(proposals) < 1: return None valid_ratio_range = self._cur_valid_ratio_range im_size = self._cur_im_size scale = self._cur_scale valid_props, _ = self._validate_boxes(valid_ratio_range, im_size, proposals, scale) neg_boxes = self._find_neg_boxes(chips, pos_chip_ids, valid_props) neg_chip2box_num = self._find_neg_chips(chips, pos_chip_ids, neg_boxes) return neg_chip2box_num def _find_neg_boxes(self, chips: 'Cx4', pos_chip_ids: 'D', valid_props: 'Px4'): """ :return: neg_boxes: Nx4 """ if len(pos_chip_ids) == 0: return valid_props pos_chips = chips[pos_chip_ids] iob = intersection_over_box(pos_chips, valid_props) overlap_per_prop = np.max(iob, axis=0) non_overlap_props_idx = overlap_per_prop < 0.5 neg_boxes = valid_props[non_overlap_props_idx] return neg_boxes def _find_neg_chips(self, chips: 'Cx4', pos_chip_ids: 'D', neg_boxes: 'Nx4'): """ :return: neg_chip2box_num, dict: chipid->neg_box_num """ neg_chip_ids = np.setdiff1d(np.arange(len(chips)), pos_chip_ids) neg_chips = chips[neg_chip_ids] iob = intersection_over_box(neg_chips, neg_boxes) iob_threshold_to_find_chips = 0.7 chosen_neg_chip_ids, chip_id2overlap_box_num = \ self._find_chips_to_cover_overlaped_boxes(iob, iob_threshold_to_find_chips) neg_chipid2box_num = {} for cid in chosen_neg_chip_ids: box_num = chip_id2overlap_box_num[cid] raw_chip_id = neg_chip_ids[cid] neg_chipid2box_num[raw_chip_id] = box_num return neg_chipid2box_num def crop_infer_anno_records(self, records: List[dict]): """ transform image record to chips record :param records: :return: new_records, list of dict like { 'im_file': 'fake_image1.jpg', 'im_id': np.array([1]), # new _global_chip_id as im_id 'h': h, # chip height 'w': w, # chip width 'chip': [x1, y1, x2, y2] # added 'ori_im_h': ori_im_h # added, origin image height 'ori_im_w': ori_im_w # added, origin image width 'scale_i': 0 # added, } """ self.chip_records = [] self._global_chip_id = 1 # im_id start from 1 self._global_chip_id2img_id = {} for r in records: for scale_i in range(self.scale_num): self._get_current_scale_parameters(scale_i, r) # Cx4 chips = self._create_chips(r['h'], r['w'], self._cur_scale) cur_img_chip_record = self._get_chips_records(r, chips, scale_i) self.chip_records.extend(cur_img_chip_record) return self.chip_records def _get_chips_records(self, rec, chips, scale_i): cur_img_chip_records = [] ori_im_h = rec["h"] ori_im_w = rec["w"] im_file = rec["im_file"] 
ori_im_id = rec["im_id"] for id, chip in enumerate(chips): chip_rec = {} x1, y1, x2, y2 = chip chip_h = y2 - y1 chip_w = x2 - x1 chip_rec["im_file"] = im_file chip_rec["im_id"] = self._global_chip_id chip_rec["h"] = chip_h chip_rec["w"] = chip_w chip_rec["chip"] = chip chip_rec["ori_im_h"] = ori_im_h chip_rec["ori_im_w"] = ori_im_w chip_rec["scale_i"] = scale_i self._global_chip_id2img_id[self._global_chip_id] = int(ori_im_id) self._global_chip_id += 1 cur_img_chip_records.append(chip_rec) return cur_img_chip_records def aggregate_chips_detections(self, results, records=None): """ # 1. transform chip dets to image dets # 2. nms boxes per image; # 3. format output results :param results: :param roidb: :return: """ results = deepcopy(results) records = records if records else self.chip_records img_id2bbox = self._transform_chip2image_bboxes(results, records) nms_img_id2bbox = self._nms_dets(img_id2bbox) aggregate_results = self._reformat_results(nms_img_id2bbox) return aggregate_results def _transform_chip2image_bboxes(self, results, records): # 1. Transform chip dets to image dets; # 2. Filter valid range; # 3. Reformat and Aggregate chip dets to Get scale_cls_dets img_id2bbox = defaultdict(list) for result in results: bbox_locs = result['bbox'] bbox_nums = result['bbox_num'] if len(bbox_locs) == 1 and bbox_locs[0][ 0] == -1: # current batch has no detections # bbox_locs = array([[-1.]], dtype=float32); bbox_nums = [[1]] # MultiClassNMS output: If there is no detected boxes for all images, lod will be set to {1} and Out only contains one value which is -1. continue im_ids = result['im_id'] # replace with range(len(bbox_nums)) last_bbox_num = 0 for idx, im_id in enumerate(im_ids): cur_bbox_len = bbox_nums[idx] bboxes = bbox_locs[last_bbox_num:last_bbox_num + cur_bbox_len] last_bbox_num += cur_bbox_len # box: [num_id, score, xmin, ymin, xmax, ymax] if len(bboxes) == 0: # current image has no detections continue chip_rec = records[int(im_id) - 1] # im_id starts from 1, type is np.int64 image_size = max(chip_rec["ori_im_h"], chip_rec["ori_im_w"]) bboxes = transform_chip_boxes2image_boxes( bboxes, chip_rec["chip"], chip_rec["ori_im_h"], chip_rec["ori_im_w"]) scale_i = chip_rec["scale_i"] cur_scale = self._get_current_scale(self.target_sizes[scale_i], image_size) _, valid_boxes_idx = self._validate_boxes( self.valid_box_ratio_ranges[scale_i], image_size, bboxes[:, 2:], cur_scale) ori_img_id = self._global_chip_id2img_id[int(im_id)] img_id2bbox[ori_img_id].append(bboxes[valid_boxes_idx]) return img_id2bbox def _nms_dets(self, img_id2bbox): # 1. NMS on each image-class # 2. 
Limit number of detections to MAX_PER_IMAGE if requested max_per_img = self.max_per_img nms_thresh = self.nms_thresh for img_id in img_id2bbox: box = img_id2bbox[ img_id] # list of np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2] box = np.concatenate(box, axis=0) nms_dets = nms(box, nms_thresh) if max_per_img > 0: if len(nms_dets) > max_per_img: keep = np.argsort(-nms_dets[:, 1])[:max_per_img] nms_dets = nms_dets[keep] img_id2bbox[img_id] = nms_dets return img_id2bbox def _reformat_results(self, img_id2bbox): """reformat results""" im_ids = img_id2bbox.keys() results = [] for img_id in im_ids: # output by original im_id order if len(img_id2bbox[img_id]) == 0: bbox = np.array( [[-1., 0., 0., 0., 0., 0.]]) # edge case: no detections bbox_num = np.array([0]) else: # np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2] bbox = img_id2bbox[img_id] bbox_num = np.array([len(bbox)]) res = dict(im_id=np.array([[img_id]]), bbox=bbox, bbox_num=bbox_num) results.append(res) return results ================================================ FILE: ppdet/data/crop_utils/chip_box_utils.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np def bbox_area(boxes): return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) def intersection_over_box(chips, boxes): """ intersection area over box area :param chips: C :param boxes: B :return: iob, CxB """ M = chips.shape[0] N = boxes.shape[0] if M * N == 0: return np.zeros([M, N], dtype='float32') box_area = bbox_area(boxes) # B inter_x2y2 = np.minimum(np.expand_dims(chips, 1)[:, :, 2:], boxes[:, 2:]) # CxBX2 inter_x1y1 = np.maximum(np.expand_dims(chips, 1)[:, :, :2], boxes[:, :2]) # CxBx2 inter_wh = inter_x2y2 - inter_x1y1 inter_wh = np.clip(inter_wh, a_min=0, a_max=None) inter_area = inter_wh[:, :, 0] * inter_wh[:, :, 1] # CxB iob = inter_area / np.expand_dims(box_area, 0) return iob def clip_boxes(boxes, im_shape): """ Clip boxes to image boundaries. 
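    Note: x1/y1 are clipped to [0, dim - 1] while x2/y2 are clipped to
    [1, dim], matching the box convention described in annotation_cropper.py
    where (x2, y2) lies just outside the box.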
:param boxes: [N, 4] :param im_shape: tuple of 2, [h, w] :return: [N, 4] """ # x1 >= 0 boxes[:, 0] = np.clip(boxes[:, 0], 0, im_shape[1] - 1) # y1 >= 0 boxes[:, 1] = np.clip(boxes[:, 1], 0, im_shape[0] - 1) # x2 < im_shape[1] boxes[:, 2] = np.clip(boxes[:, 2], 1, im_shape[1]) # y2 < im_shape[0] boxes[:, 3] = np.clip(boxes[:, 3], 1, im_shape[0]) return boxes def transform_chip_box(gt_bbox: 'Gx4', boxes_idx: 'B', chip: '4'): boxes_idx = np.array(boxes_idx) cur_gt_bbox = gt_bbox[boxes_idx].copy() # Bx4 x1, y1, x2, y2 = chip cur_gt_bbox[:, 0] -= x1 cur_gt_bbox[:, 1] -= y1 cur_gt_bbox[:, 2] -= x1 cur_gt_bbox[:, 3] -= y1 h = y2 - y1 w = x2 - x1 cur_gt_bbox = clip_boxes(cur_gt_bbox, (h, w)) ws = (cur_gt_bbox[:, 2] - cur_gt_bbox[:, 0]).astype(np.int32) hs = (cur_gt_bbox[:, 3] - cur_gt_bbox[:, 1]).astype(np.int32) valid_idx = (ws >= 2) & (hs >= 2) return cur_gt_bbox[valid_idx], boxes_idx[valid_idx] def find_chips_to_cover_overlaped_boxes(iob, overlap_threshold): chip_ids, box_ids = np.nonzero(iob >= overlap_threshold) chip_id2overlap_box_num = np.bincount(chip_ids) # 1d array chip_id2overlap_box_num = np.pad( chip_id2overlap_box_num, (0, len(iob) - len(chip_id2overlap_box_num)), constant_values=0) chosen_chip_ids = [] while len(box_ids) > 0: value_counts = np.bincount(chip_ids) # 1d array max_count_chip_id = np.argmax(value_counts) assert max_count_chip_id not in chosen_chip_ids chosen_chip_ids.append(max_count_chip_id) box_ids_in_cur_chip = box_ids[chip_ids == max_count_chip_id] ids_not_in_cur_boxes_mask = np.logical_not( np.isin(box_ids, box_ids_in_cur_chip)) chip_ids = chip_ids[ids_not_in_cur_boxes_mask] box_ids = box_ids[ids_not_in_cur_boxes_mask] return chosen_chip_ids, chip_id2overlap_box_num def transform_chip_boxes2image_boxes(chip_boxes, chip, img_h, img_w): chip_boxes = np.array(sorted(chip_boxes, key=lambda item: -item[1])) xmin, ymin, _, _ = chip # Transform to origin image loc chip_boxes[:, 2] += xmin chip_boxes[:, 4] += xmin chip_boxes[:, 3] += ymin chip_boxes[:, 5] += ymin chip_boxes = clip_boxes(chip_boxes, (img_h, img_w)) return chip_boxes def nms(dets, thresh): """Apply classic DPM-style greedy NMS.""" if dets.shape[0] == 0: return dets[[], :] scores = dets[:, 1] x1 = dets[:, 2] y1 = dets[:, 3] x2 = dets[:, 4] y2 = dets[:, 5] areas = (x2 - x1 + 1) * (y2 - y1 + 1) order = scores.argsort()[::-1] ndets = dets.shape[0] suppressed = np.zeros((ndets), dtype=np.int32) # nominal indices # _i, _j # sorted indices # i, j # temp variables for box i's (the box currently under consideration) # ix1, iy1, ix2, iy2, iarea # variables for computing overlap with box j (lower scoring box) # xx1, yy1, xx2, yy2 # w, h # inter, ovr for _i in range(ndets): i = order[_i] if suppressed[i] == 1: continue ix1 = x1[i] iy1 = y1[i] ix2 = x2[i] iy2 = y2[i] iarea = areas[i] for _j in range(_i + 1, ndets): j = order[_j] if suppressed[j] == 1: continue xx1 = max(ix1, x1[j]) yy1 = max(iy1, y1[j]) xx2 = min(ix2, x2[j]) yy2 = min(iy2, y2[j]) w = max(0.0, xx2 - xx1 + 1) h = max(0.0, yy2 - yy1 + 1) inter = w * h ovr = inter / (iarea + areas[j] - inter) if ovr >= thresh: suppressed[j] = 1 keep = np.where(suppressed == 0)[0] dets = dets[keep, :] return dets ================================================ FILE: ppdet/data/culane_utils.py ================================================ import math import numpy as np from imgaug.augmentables.lines import LineString from scipy.interpolate import InterpolatedUnivariateSpline def lane_to_linestrings(lanes): lines = [] for lane in lanes: lines.append(LineString(lane)) return 
lines


def linestrings_to_lanes(lines):
    lanes = []
    for line in lines:
        lanes.append(line.coords)
    return lanes


def sample_lane(points, sample_ys, img_w):
    # this function expects the points to be sorted
    points = np.array(points)
    if not np.all(points[1:, 1] < points[:-1, 1]):
        raise Exception('Annotation points have to be sorted')
    x, y = points[:, 0], points[:, 1]

    # interpolate points inside domain
    assert len(points) > 1
    interp = InterpolatedUnivariateSpline(
        y[::-1], x[::-1], k=min(3, len(points) - 1))
    domain_min_y = y.min()
    domain_max_y = y.max()
    sample_ys_inside_domain = sample_ys[(sample_ys >= domain_min_y) & (
        sample_ys <= domain_max_y)]
    assert len(sample_ys_inside_domain) > 0
    interp_xs = interp(sample_ys_inside_domain)

    # extrapolate lane to the bottom of the image with a straight line
    # using the 2 points closest to the bottom
    two_closest_points = points[:2]
    extrap = np.polyfit(
        two_closest_points[:, 1], two_closest_points[:, 0], deg=1)
    extrap_ys = sample_ys[sample_ys > domain_max_y]
    extrap_xs = np.polyval(extrap, extrap_ys)
    all_xs = np.hstack((extrap_xs, interp_xs))

    # separate between inside and outside points
    inside_mask = (all_xs >= 0) & (all_xs < img_w)
    xs_inside_image = all_xs[inside_mask]
    xs_outside_image = all_xs[~inside_mask]

    return xs_outside_image, xs_inside_image


def filter_lane(lane):
    assert lane[-1][1] <= lane[0][1]
    filtered_lane = []
    used = set()
    for p in lane:
        if p[1] not in used:
            filtered_lane.append(p)
            used.add(p[1])
    return filtered_lane


def transform_annotation(img_w, img_h, max_lanes, n_offsets, offsets_ys,
                         n_strips, strip_size, anno):
    old_lanes = anno['lanes']

    # removing lanes with fewer than 2 points
    old_lanes = filter(lambda x: len(x) > 1, old_lanes)
    # sort lane points by Y (bottom to top of the image)
    old_lanes = [sorted(lane, key=lambda x: -x[1]) for lane in old_lanes]
    # remove points with same Y (keep first occurrence)
    old_lanes = [filter_lane(lane) for lane in old_lanes]
    # normalize the annotation coordinates
    old_lanes = [[[x * img_w / float(img_w), y * img_h / float(img_h)]
                  for x, y in lane] for lane in old_lanes]
    # create transformed annotations
    lanes = np.ones(
        (max_lanes, 2 + 1 + 1 + 2 + n_offsets), dtype=np.float32
    ) * -1e5  # 2 scores, 1 start_y, 1 start_x, 1 theta, 1 length, S+1 coordinates
    lanes_endpoints = np.ones((max_lanes, 2))
    # lanes are invalid by default
    lanes[:, 0] = 1
    lanes[:, 1] = 0
    for lane_idx, lane in enumerate(old_lanes):
        if lane_idx >= max_lanes:
            break

        try:
            xs_outside_image, xs_inside_image = sample_lane(lane, offsets_ys,
                                                            img_w)
        except AssertionError:
            continue
        if len(xs_inside_image) <= 1:
            continue
        all_xs = np.hstack((xs_outside_image, xs_inside_image))
        lanes[lane_idx, 0] = 0
        lanes[lane_idx, 1] = 1
        lanes[lane_idx, 2] = len(xs_outside_image) / n_strips
        lanes[lane_idx, 3] = xs_inside_image[0]
        thetas = []
        for i in range(1, len(xs_inside_image)):
            theta = math.atan(
                i * strip_size /
                (xs_inside_image[i] - xs_inside_image[0] + 1e-5)) / math.pi
            theta = theta if theta > 0 else 1 - abs(theta)
            thetas.append(theta)
        theta_far = sum(thetas) / len(thetas)

        # lanes[lane_idx,
        #       4] = (theta_closest + theta_far) / 2  # averaged angle
        lanes[lane_idx, 4] = theta_far
        lanes[lane_idx, 5] = len(xs_inside_image)
        lanes[lane_idx, 6:6 + len(all_xs)] = all_xs

        lanes_endpoints[lane_idx, 0] = (len(all_xs) - 1) / n_strips
        lanes_endpoints[lane_idx, 1] = xs_inside_image[-1]

    new_anno = {
        'label': lanes,
        'old_anno': anno,
        'lane_endpoints': lanes_endpoints
    }
    return new_anno


================================================
FILE: ppdet/data/reader.py
================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import copy import os import traceback import six import sys if sys.version_info >= (3, 0): pass else: pass import numpy as np import paddle import paddle.nn.functional as F from copy import deepcopy from paddle.io import DataLoader, DistributedBatchSampler from .utils import default_collate_fn from ppdet.core.workspace import register from . import transform from .shm_utils import _get_shared_memory_size_in_M from ppdet.utils.logger import setup_logger logger = setup_logger('reader') MAIN_PID = os.getpid() class Compose(object): def __init__(self, transforms, num_classes=80): self.transforms = transforms self.transforms_cls = [] for t in self.transforms: for k, v in t.items(): op_cls = getattr(transform, k) f = op_cls(**v) if hasattr(f, 'num_classes'): f.num_classes = num_classes self.transforms_cls.append(f) def _update_transforms_cls(self, data): if 'transform_schedulers' in data: def is_valid(op): op_name = op.__class__.__name__ for t in data['transform_schedulers']: for k, v in t.items(): if op_name == k: # [start_epoch, stop_epoch) start_epoch = v.get('start_epoch', 0) if start_epoch > data['curr_epoch']: return False stop_epoch = v.get('stop_epoch', float('inf')) if stop_epoch <= data['curr_epoch']: return False return True return filter(is_valid, self.transforms_cls) else: return self.transforms_cls def __call__(self, data): transforms_cls = self._update_transforms_cls(data) for f in transforms_cls: try: data = f(data) except Exception as e: stack_info = traceback.format_exc() logger.warning("fail to map sample transform [{}] " "with error: {} and stack:\n{}".format( f, e, str(stack_info))) raise e return data class BatchCompose(Compose): def __init__(self, transforms, num_classes=80, collate_batch=True): super(BatchCompose, self).__init__(transforms, num_classes) self.collate_batch = collate_batch def __call__(self, data): transforms_cls = self._update_transforms_cls(data[0]) for f in transforms_cls: try: data = f(data) except Exception as e: stack_info = traceback.format_exc() logger.warning("fail to map batch transform [{}] " "with error: {} and stack:\n{}".format( f, e, str(stack_info))) raise e # remove keys which is not needed by model extra_key = ['h', 'w', 'flipped', 'transform_schedulers'] for k in extra_key: for sample in data: if k in sample: sample.pop(k) # batch data, if user-define batch function needed # use user-defined here if self.collate_batch: batch_data = default_collate_fn(data) else: batch_data = {} for k in data[0].keys(): tmp_data = [] for i in range(len(data)): tmp_data.append(data[i][k]) if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k: tmp_data = np.stack(tmp_data, axis=0) if 'origin_' in k: tmp_data = np.stack(tmp_data, axis=0) batch_data[k] = tmp_data return batch_data class BaseDataLoader(object): """ Base DataLoader implementation for detection models Args: sample_transforms (list): a list 
of transforms to perform on each sample
        batch_transforms (list): a list of transforms to perform on batch
        batch_size (int): batch size for batch collating, default 1.
        shuffle (bool): whether to shuffle samples
        drop_last (bool): whether to drop the last incomplete batch,
            default False
        num_classes (int): class number of dataset, default 80
        collate_batch (bool): whether to collate batch in dataloader.
            If set to True, the samples will collate into batch according
            to the batch size. Otherwise, the ground-truth will not collate,
            which is used when the number of ground-truths differs across
            samples.
        use_shared_memory (bool): whether to use shared memory to
            accelerate data loading, enable this only if you are sure that
            the shared memory size of your OS is larger than the memory cost
            of the input data of the model. Note that shared memory will be
            automatically disabled if the shared memory of the OS is less
            than 1G, which is not enough for detection models. Default False.
    """

    def __init__(self,
                 sample_transforms=[],
                 batch_transforms=[],
                 batch_size=1,
                 shuffle=False,
                 drop_last=False,
                 num_classes=80,
                 collate_batch=True,
                 use_shared_memory=False,
                 **kwargs):
        # sample transform
        self._sample_transforms = Compose(
            sample_transforms, num_classes=num_classes)

        # batch transform
        self._batch_transforms = BatchCompose(batch_transforms, num_classes,
                                              collate_batch)
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last
        self.use_shared_memory = use_shared_memory
        self.kwargs = kwargs

    def __call__(self,
                 dataset,
                 worker_num,
                 batch_sampler=None,
                 return_list=False):
        self.dataset = dataset
        self.dataset.check_or_download_dataset()
        self.dataset.parse_dataset()
        # get data
        self.dataset.set_transform(self._sample_transforms)
        # set kwargs
        self.dataset.set_kwargs(**self.kwargs)
        # batch sampler
        if batch_sampler is None:
            self._batch_sampler = DistributedBatchSampler(
                self.dataset,
                batch_size=self.batch_size,
                shuffle=self.shuffle,
                drop_last=self.drop_last)
        else:
            self._batch_sampler = batch_sampler

        # DataLoader do not start sub-process in Windows and Mac
        # system, do not need to use shared memory
        use_shared_memory = self.use_shared_memory and \
            sys.platform not in ['win32', 'darwin']
        # check whether shared memory size is bigger than 1G (1024M)
        if use_shared_memory:
            shm_size = _get_shared_memory_size_in_M()
            if shm_size is not None and shm_size < 1024.:
                logger.warning("Shared memory size is less than 1G, "
                               "disable shared_memory in DataLoader")
                use_shared_memory = False

        self.dataloader = DataLoader(
            dataset=self.dataset,
            batch_sampler=self._batch_sampler,
            collate_fn=self._batch_transforms,
            num_workers=worker_num,
            return_list=return_list,
            use_shared_memory=use_shared_memory)
        self.loader = iter(self.dataloader)

        return self

    def __len__(self):
        return len(self._batch_sampler)

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.loader)
        except StopIteration:
            self.loader = iter(self.dataloader)
            six.reraise(*sys.exc_info())

    def next(self):
        # python2 compatibility
        return self.__next__()


@register
class TrainReader(BaseDataLoader):
    __shared__ = ['num_classes']

    def __init__(self,
                 sample_transforms=[],
                 batch_transforms=[],
                 batch_size=1,
                 shuffle=True,
                 drop_last=True,
                 num_classes=80,
                 collate_batch=True,
                 **kwargs):
        super(TrainReader, self).__init__(sample_transforms, batch_transforms,
                                          batch_size, shuffle, drop_last,
                                          num_classes, collate_batch, **kwargs)


@register
class EvalReader(BaseDataLoader):
    __shared__ = ['num_classes']

    def __init__(self,
                 sample_transforms=[],
                 batch_transforms=[],
                 batch_size=1,
                 shuffle=False,
drop_last=False, num_classes=80, **kwargs): super(EvalReader, self).__init__(sample_transforms, batch_transforms, batch_size, shuffle, drop_last, num_classes, **kwargs) @register class TestReader(BaseDataLoader): __shared__ = ['num_classes'] def __init__(self, sample_transforms=[], batch_transforms=[], batch_size=1, shuffle=False, drop_last=False, num_classes=80, **kwargs): super(TestReader, self).__init__(sample_transforms, batch_transforms, batch_size, shuffle, drop_last, num_classes, **kwargs) @register class EvalMOTReader(BaseDataLoader): __shared__ = ['num_classes'] def __init__(self, sample_transforms=[], batch_transforms=[], batch_size=1, shuffle=False, drop_last=False, num_classes=1, **kwargs): super(EvalMOTReader, self).__init__(sample_transforms, batch_transforms, batch_size, shuffle, drop_last, num_classes, **kwargs) @register class TestMOTReader(BaseDataLoader): __shared__ = ['num_classes'] def __init__(self, sample_transforms=[], batch_transforms=[], batch_size=1, shuffle=False, drop_last=False, num_classes=1, **kwargs): super(TestMOTReader, self).__init__(sample_transforms, batch_transforms, batch_size, shuffle, drop_last, num_classes, **kwargs) # For Semi-Supervised Object Detection (SSOD) class Compose_SSOD(object): def __init__(self, base_transforms, weak_aug, strong_aug, num_classes=80): self.base_transforms = base_transforms self.base_transforms_cls = [] for t in self.base_transforms: for k, v in t.items(): op_cls = getattr(transform, k) f = op_cls(**v) if hasattr(f, 'num_classes'): f.num_classes = num_classes self.base_transforms_cls.append(f) self.weak_augs = weak_aug self.weak_augs_cls = [] for t in self.weak_augs: for k, v in t.items(): op_cls = getattr(transform, k) f = op_cls(**v) if hasattr(f, 'num_classes'): f.num_classes = num_classes self.weak_augs_cls.append(f) self.strong_augs = strong_aug self.strong_augs_cls = [] for t in self.strong_augs: for k, v in t.items(): op_cls = getattr(transform, k) f = op_cls(**v) if hasattr(f, 'num_classes'): f.num_classes = num_classes self.strong_augs_cls.append(f) def __call__(self, data): for f in self.base_transforms_cls: try: data = f(data) except Exception as e: stack_info = traceback.format_exc() logger.warning("fail to map sample transform [{}] " "with error: {} and stack:\n{}".format( f, e, str(stack_info))) raise e weak_data = deepcopy(data) strong_data = deepcopy(data) for f in self.weak_augs_cls: try: weak_data = f(weak_data) except Exception as e: stack_info = traceback.format_exc() logger.warning("fail to map weak aug [{}] " "with error: {} and stack:\n{}".format( f, e, str(stack_info))) raise e for f in self.strong_augs_cls: try: strong_data = f(strong_data) except Exception as e: stack_info = traceback.format_exc() logger.warning("fail to map strong aug [{}] " "with error: {} and stack:\n{}".format( f, e, str(stack_info))) raise e weak_data['strong_aug'] = strong_data return weak_data class BatchCompose_SSOD(Compose): def __init__(self, transforms, num_classes=80, collate_batch=True): super(BatchCompose_SSOD, self).__init__(transforms, num_classes) self.collate_batch = collate_batch def __call__(self, data): # split strong_data from data(weak_data) strong_data = [] for sample in data: strong_data.append(sample['strong_aug']) sample.pop('strong_aug') for f in self.transforms_cls: try: data = f(data) if 'BatchRandomResizeForSSOD' in f._id: strong_data = f(strong_data, data[1])[0] data = data[0] else: strong_data = f(strong_data) except Exception as e: stack_info = traceback.format_exc() logger.warning("fail to map 
batch transform [{}] " "with error: {} and stack:\n{}".format( f, e, str(stack_info))) raise e # remove keys which is not needed by model extra_key = ['h', 'w', 'flipped'] for k in extra_key: for sample in data: if k in sample: sample.pop(k) for sample in strong_data: if k in sample: sample.pop(k) # batch data, if user-define batch function needed # use user-defined here if self.collate_batch: batch_data = default_collate_fn(data) strong_batch_data = default_collate_fn(strong_data) return batch_data, strong_batch_data else: batch_data = {} for k in data[0].keys(): tmp_data = [] for i in range(len(data)): tmp_data.append(data[i][k]) if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k: tmp_data = np.stack(tmp_data, axis=0) batch_data[k] = tmp_data strong_batch_data = {} for k in strong_data[0].keys(): tmp_data = [] for i in range(len(strong_data)): tmp_data.append(strong_data[i][k]) if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k: tmp_data = np.stack(tmp_data, axis=0) strong_batch_data[k] = tmp_data return batch_data, strong_batch_data class CombineSSODLoader(object): def __init__(self, label_loader, unlabel_loader): self.label_loader = label_loader self.unlabel_loader = unlabel_loader def __iter__(self): while True: try: label_samples = next(self.label_loader_iter) except: self.label_loader_iter = iter(self.label_loader) label_samples = next(self.label_loader_iter) try: unlabel_samples = next(self.unlabel_loader_iter) except: self.unlabel_loader_iter = iter(self.unlabel_loader) unlabel_samples = next(self.unlabel_loader_iter) yield ( label_samples[0], # sup weak label_samples[1], # sup strong unlabel_samples[0], # unsup weak unlabel_samples[1] # unsup strong ) def __call__(self): return self.__iter__() class BaseSemiDataLoader(object): def __init__(self, sample_transforms=[], weak_aug=[], strong_aug=[], sup_batch_transforms=[], unsup_batch_transforms=[], sup_batch_size=1, unsup_batch_size=1, shuffle=True, drop_last=True, num_classes=80, collate_batch=True, use_shared_memory=False, **kwargs): # sup transforms self._sample_transforms_label = Compose_SSOD( sample_transforms, weak_aug, strong_aug, num_classes=num_classes) self._batch_transforms_label = BatchCompose_SSOD( sup_batch_transforms, num_classes, collate_batch) self.batch_size_label = sup_batch_size # unsup transforms self._sample_transforms_unlabel = Compose_SSOD( sample_transforms, weak_aug, strong_aug, num_classes=num_classes) self._batch_transforms_unlabel = BatchCompose_SSOD( unsup_batch_transforms, num_classes, collate_batch) self.batch_size_unlabel = unsup_batch_size # common self.shuffle = shuffle self.drop_last = drop_last self.use_shared_memory = use_shared_memory self.kwargs = kwargs def __call__(self, dataset_label, dataset_unlabel, worker_num, batch_sampler_label=None, batch_sampler_unlabel=None, return_list=False): # sup dataset self.dataset_label = dataset_label self.dataset_label.check_or_download_dataset() self.dataset_label.parse_dataset() self.dataset_label.set_transform(self._sample_transforms_label) self.dataset_label.set_kwargs(**self.kwargs) if batch_sampler_label is None: self._batch_sampler_label = DistributedBatchSampler( self.dataset_label, batch_size=self.batch_size_label, shuffle=self.shuffle, drop_last=self.drop_last) else: self._batch_sampler_label = batch_sampler_label # unsup dataset self.dataset_unlabel = dataset_unlabel self.dataset_unlabel.length = self.dataset_label.__len__() self.dataset_unlabel.check_or_download_dataset() self.dataset_unlabel.parse_dataset() 
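# --- annotation (not in the original source): the unlabeled dataset's length
# is pinned to len(dataset_label) just above so that CombineSSODLoader, built
# at the end of this method, can pair one labeled batch with one unlabeled
# batch per step; each sub-loader restarts its own iterator when exhausted,
# so the combined stream never raises StopIteration on its own.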
self.dataset_unlabel.set_transform(self._sample_transforms_unlabel) self.dataset_unlabel.set_kwargs(**self.kwargs) if batch_sampler_unlabel is None: self._batch_sampler_unlabel = DistributedBatchSampler( self.dataset_unlabel, batch_size=self.batch_size_unlabel, shuffle=self.shuffle, drop_last=self.drop_last) else: self._batch_sampler_unlabel = batch_sampler_unlabel # DataLoader do not start sub-process in Windows and Mac # system, do not need to use shared memory use_shared_memory = self.use_shared_memory and \ sys.platform not in ['win32', 'darwin'] # check whether shared memory size is bigger than 1G(1024M) if use_shared_memory: shm_size = _get_shared_memory_size_in_M() if shm_size is not None and shm_size < 1024.: logger.warning("Shared memory size is less than 1G, " "disable shared_memory in DataLoader") use_shared_memory = False self.dataloader_label = DataLoader( dataset=self.dataset_label, batch_sampler=self._batch_sampler_label, collate_fn=self._batch_transforms_label, num_workers=worker_num, return_list=return_list, use_shared_memory=use_shared_memory) self.dataloader_unlabel = DataLoader( dataset=self.dataset_unlabel, batch_sampler=self._batch_sampler_unlabel, collate_fn=self._batch_transforms_unlabel, num_workers=worker_num, return_list=return_list, use_shared_memory=use_shared_memory) self.dataloader = CombineSSODLoader(self.dataloader_label, self.dataloader_unlabel) self.loader = iter(self.dataloader) return self def __len__(self): return len(self._batch_sampler_label) def __iter__(self): return self def __next__(self): return next(self.loader) def next(self): # python2 compatibility return self.__next__() @register class SemiTrainReader(BaseSemiDataLoader): __shared__ = ['num_classes'] def __init__(self, sample_transforms=[], weak_aug=[], strong_aug=[], sup_batch_transforms=[], unsup_batch_transforms=[], sup_batch_size=1, unsup_batch_size=1, shuffle=True, drop_last=True, num_classes=80, collate_batch=True, **kwargs): super(SemiTrainReader, self).__init__( sample_transforms, weak_aug, strong_aug, sup_batch_transforms, unsup_batch_transforms, sup_batch_size, unsup_batch_size, shuffle, drop_last, num_classes, collate_batch, **kwargs) ================================================ FILE: ppdet/data/shm_utils.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os SIZE_UNIT = ['K', 'M', 'G', 'T'] SHM_QUERY_CMD = 'df -h' SHM_KEY = 'shm' SHM_DEFAULT_MOUNT = '/dev/shm' # [ shared memory size check ] # In detection models, image/target data occupies a lot of memory, and # will occupy lots of shared memory in multi-process DataLoader, we use # following code to get shared memory size and perform a size check to # disable shared memory use if shared memory size is not enough. # Shared memory getting process as follows: # 1. use `df -h` get all mount info # 2. pick up spaces whose mount info contains 'shm' # 3. if 'shm' space number is only 1, return its size # 4. 
if there are multiple 'shm' space, try to find the default mount # directory '/dev/shm' is Linux-like system, otherwise return the # biggest space size. def _parse_size_in_M(size_str): if size_str[-1] == 'B': num, unit = size_str[:-2], size_str[-2] else: num, unit = size_str[:-1], size_str[-1] assert unit in SIZE_UNIT, \ "unknown shm size unit {}".format(unit) return float(num) * \ (1024 ** (SIZE_UNIT.index(unit) - 1)) def _get_shared_memory_size_in_M(): try: df_infos = os.popen(SHM_QUERY_CMD).readlines() except: return None else: shm_infos = [] for df_info in df_infos: info = df_info.strip() if info.find(SHM_KEY) >= 0: shm_infos.append(info.split()) if len(shm_infos) == 0: return None elif len(shm_infos) == 1: return _parse_size_in_M(shm_infos[0][3]) else: default_mount_infos = [ si for si in shm_infos if si[-1] == SHM_DEFAULT_MOUNT ] if default_mount_infos: return _parse_size_in_M(default_mount_infos[0][3]) else: return max([_parse_size_in_M(si[3]) for si in shm_infos]) ================================================ FILE: ppdet/data/source/__init__.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import coco from . import voc from . import widerface from . import category from . import keypoint_coco from . import mot from . import sniper_coco from . import culane from . import lvis from .coco import * from .voc import * from .widerface import * from .category import * from .keypoint_coco import * from .mot import * from .sniper_coco import SniperCOCODataSet from .dataset import ImageFolder from .pose3d_cmb import * from .culane import * from .lvis import * ================================================ FILE: ppdet/data/source/category.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os from ppdet.data.source.voc import pascalvoc_label from ppdet.data.source.widerface import widerface_label from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = ['get_categories'] def get_categories(metric_type, anno_file=None, arch=None): """ Get class id to category id map and category id to category name map from annotation file. Args: metric_type (str): metric type, currently support 'coco', 'voc', 'oid' and 'widerface'. 
anno_file (str): annotation file path """ if arch == 'keypoint_arch': return (None, {'id': 'keypoint'}) if anno_file == None or (not os.path.isfile(anno_file)): logger.warning( "anno_file '{}' is None or not set or not exist, " "please recheck TrainDataset/EvalDataset/TestDataset.anno_path, " "otherwise the default categories will be used by metric_type.". format(anno_file)) if metric_type.lower() == 'coco' or metric_type.lower( ) == 'rbox' or metric_type.lower() == 'snipercoco': if anno_file and os.path.isfile(anno_file): if anno_file.endswith('json'): # lazy import pycocotools here from pycocotools.coco import COCO coco = COCO(anno_file) cats = coco.loadCats(coco.getCatIds()) clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)} catid2name = {cat['id']: cat['name'] for cat in cats} elif anno_file.endswith('txt'): cats = [] with open(anno_file) as f: for line in f.readlines(): cats.append(line.strip()) if cats[0] == 'background': cats = cats[1:] clsid2catid = {i: i for i in range(len(cats))} catid2name = {i: name for i, name in enumerate(cats)} else: raise ValueError("anno_file {} should be json or txt.".format( anno_file)) return clsid2catid, catid2name # anno file not exist, load default categories of COCO17 else: if metric_type.lower() == 'rbox': logger.warning( "metric_type: {}, load default categories of DOTA.".format( metric_type)) return _dota_category() logger.warning("metric_type: {}, load default categories of COCO.". format(metric_type)) return _coco17_category() elif metric_type.lower() == 'voc': if anno_file and os.path.isfile(anno_file): cats = [] with open(anno_file) as f: for line in f.readlines(): cats.append(line.strip()) if cats[0] == 'background': cats = cats[1:] clsid2catid = {i: i for i in range(len(cats))} catid2name = {i: name for i, name in enumerate(cats)} return clsid2catid, catid2name # anno file not exist, load default categories of # VOC all 20 categories else: logger.warning("metric_type: {}, load default categories of VOC.". format(metric_type)) return _vocall_category() elif metric_type.lower() == 'oid': if anno_file and os.path.isfile(anno_file): logger.warning("only default categories support for OID19") return _oid19_category() elif metric_type.lower() == 'widerface': return _widerface_category() elif metric_type.lower() in [ 'keypointtopdowncocoeval', 'keypointtopdownmpiieval', 'keypointtopdowncocowholebadyhandeval' ]: return (None, {'id': 'keypoint'}) elif metric_type.lower() == 'pose3deval': return (None, {'id': 'pose3d'}) elif metric_type.lower() in ['mot', 'motdet', 'reid']: if anno_file and os.path.isfile(anno_file): cats = [] with open(anno_file) as f: for line in f.readlines(): cats.append(line.strip()) if cats[0] == 'background': cats = cats[1:] clsid2catid = {i: i for i in range(len(cats))} catid2name = {i: name for i, name in enumerate(cats)} return clsid2catid, catid2name # anno file not exist, load default category 'pedestrian'. else: logger.warning( "metric_type: {}, load default categories of pedestrian MOT.". 
format(metric_type)) return _mot_category(category='pedestrian') elif metric_type.lower() in ['kitti', 'bdd100kmot']: return _mot_category(category='vehicle') elif metric_type.lower() in ['mcmot']: if anno_file and os.path.isfile(anno_file): cats = [] with open(anno_file) as f: for line in f.readlines(): cats.append(line.strip()) if cats[0] == 'background': cats = cats[1:] clsid2catid = {i: i for i in range(len(cats))} catid2name = {i: name for i, name in enumerate(cats)} return clsid2catid, catid2name # anno file not exist, load default categories of visdrone all 10 categories else: logger.warning( "metric_type: {}, load default categories of VisDrone.".format( metric_type)) return _visdrone_category() else: raise ValueError("unknown metric type {}".format(metric_type)) def _mot_category(category='pedestrian'): """ Get class id to category id map and category id to category name map of mot dataset """ label_map = {category: 0} label_map = sorted(label_map.items(), key=lambda x: x[1]) cats = [l[0] for l in label_map] clsid2catid = {i: i for i in range(len(cats))} catid2name = {i: name for i, name in enumerate(cats)} return clsid2catid, catid2name def _coco17_category(): """ Get class id to category id map and category id to category name map of COCO2017 dataset """ clsid2catid = { 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 27, 26: 28, 27: 31, 28: 32, 29: 33, 30: 34, 31: 35, 32: 36, 33: 37, 34: 38, 35: 39, 36: 40, 37: 41, 38: 42, 39: 43, 40: 44, 41: 46, 42: 47, 43: 48, 44: 49, 45: 50, 46: 51, 47: 52, 48: 53, 49: 54, 50: 55, 51: 56, 52: 57, 53: 58, 54: 59, 55: 60, 56: 61, 57: 62, 58: 63, 59: 64, 60: 65, 61: 67, 62: 70, 63: 72, 64: 73, 65: 74, 66: 75, 67: 76, 68: 77, 69: 78, 70: 79, 71: 80, 72: 81, 73: 82, 74: 84, 75: 85, 76: 86, 77: 87, 78: 88, 79: 89, 80: 90 } catid2name = { 0: 'background', 1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorcycle', 5: 'airplane', 6: 'bus', 7: 'train', 8: 'truck', 9: 'boat', 10: 'traffic light', 11: 'fire hydrant', 13: 'stop sign', 14: 'parking meter', 15: 'bench', 16: 'bird', 17: 'cat', 18: 'dog', 19: 'horse', 20: 'sheep', 21: 'cow', 22: 'elephant', 23: 'bear', 24: 'zebra', 25: 'giraffe', 27: 'backpack', 28: 'umbrella', 31: 'handbag', 32: 'tie', 33: 'suitcase', 34: 'frisbee', 35: 'skis', 36: 'snowboard', 37: 'sports ball', 38: 'kite', 39: 'baseball bat', 40: 'baseball glove', 41: 'skateboard', 42: 'surfboard', 43: 'tennis racket', 44: 'bottle', 46: 'wine glass', 47: 'cup', 48: 'fork', 49: 'knife', 50: 'spoon', 51: 'bowl', 52: 'banana', 53: 'apple', 54: 'sandwich', 55: 'orange', 56: 'broccoli', 57: 'carrot', 58: 'hot dog', 59: 'pizza', 60: 'donut', 61: 'cake', 62: 'chair', 63: 'couch', 64: 'potted plant', 65: 'bed', 67: 'dining table', 70: 'toilet', 72: 'tv', 73: 'laptop', 74: 'mouse', 75: 'remote', 76: 'keyboard', 77: 'cell phone', 78: 'microwave', 79: 'oven', 80: 'toaster', 81: 'sink', 82: 'refrigerator', 84: 'book', 85: 'clock', 86: 'vase', 87: 'scissors', 88: 'teddy bear', 89: 'hair drier', 90: 'toothbrush' } clsid2catid = {k - 1: v for k, v in clsid2catid.items()} catid2name.pop(0) return clsid2catid, catid2name def _dota_category(): """ Get class id to category id map and category id to category name map of dota dataset """ catid2name = { 0: 'background', 1: 'plane', 2: 'baseball-diamond', 3: 'bridge', 4: 'ground-track-field', 5: 'small-vehicle', 6: 'large-vehicle', 7: 'ship', 8: 'tennis-court', 9: 'basketball-court', 10: 
'storage-tank', 11: 'soccer-ball-field', 12: 'roundabout', 13: 'harbor', 14: 'swimming-pool', 15: 'helicopter' } catid2name.pop(0) clsid2catid = {i: i + 1 for i in range(len(catid2name))} return clsid2catid, catid2name def _vocall_category(): """ Get class id to category id map and category id to category name map of mixup voc dataset """ label_map = pascalvoc_label() label_map = sorted(label_map.items(), key=lambda x: x[1]) cats = [l[0] for l in label_map] clsid2catid = {i: i for i in range(len(cats))} catid2name = {i: name for i, name in enumerate(cats)} return clsid2catid, catid2name def _widerface_category(): label_map = widerface_label() label_map = sorted(label_map.items(), key=lambda x: x[1]) cats = [l[0] for l in label_map] clsid2catid = {i: i for i in range(len(cats))} catid2name = {i: name for i, name in enumerate(cats)} return clsid2catid, catid2name def _oid19_category(): clsid2catid = {k: k + 1 for k in range(500)} catid2name = { 0: "background", 1: "Infant bed", 2: "Rose", 3: "Flag", 4: "Flashlight", 5: "Sea turtle", 6: "Camera", 7: "Animal", 8: "Glove", 9: "Crocodile", 10: "Cattle", 11: "House", 12: "Guacamole", 13: "Penguin", 14: "Vehicle registration plate", 15: "Bench", 16: "Ladybug", 17: "Human nose", 18: "Watermelon", 19: "Flute", 20: "Butterfly", 21: "Washing machine", 22: "Raccoon", 23: "Segway", 24: "Taco", 25: "Jellyfish", 26: "Cake", 27: "Pen", 28: "Cannon", 29: "Bread", 30: "Tree", 31: "Shellfish", 32: "Bed", 33: "Hamster", 34: "Hat", 35: "Toaster", 36: "Sombrero", 37: "Tiara", 38: "Bowl", 39: "Dragonfly", 40: "Moths and butterflies", 41: "Antelope", 42: "Vegetable", 43: "Torch", 44: "Building", 45: "Power plugs and sockets", 46: "Blender", 47: "Billiard table", 48: "Cutting board", 49: "Bronze sculpture", 50: "Turtle", 51: "Broccoli", 52: "Tiger", 53: "Mirror", 54: "Bear", 55: "Zucchini", 56: "Dress", 57: "Volleyball", 58: "Guitar", 59: "Reptile", 60: "Golf cart", 61: "Tart", 62: "Fedora", 63: "Carnivore", 64: "Car", 65: "Lighthouse", 66: "Coffeemaker", 67: "Food processor", 68: "Truck", 69: "Bookcase", 70: "Surfboard", 71: "Footwear", 72: "Bench", 73: "Necklace", 74: "Flower", 75: "Radish", 76: "Marine mammal", 77: "Frying pan", 78: "Tap", 79: "Peach", 80: "Knife", 81: "Handbag", 82: "Laptop", 83: "Tent", 84: "Ambulance", 85: "Christmas tree", 86: "Eagle", 87: "Limousine", 88: "Kitchen & dining room table", 89: "Polar bear", 90: "Tower", 91: "Football", 92: "Willow", 93: "Human head", 94: "Stop sign", 95: "Banana", 96: "Mixer", 97: "Binoculars", 98: "Dessert", 99: "Bee", 100: "Chair", 101: "Wood-burning stove", 102: "Flowerpot", 103: "Beaker", 104: "Oyster", 105: "Woodpecker", 106: "Harp", 107: "Bathtub", 108: "Wall clock", 109: "Sports uniform", 110: "Rhinoceros", 111: "Beehive", 112: "Cupboard", 113: "Chicken", 114: "Man", 115: "Blue jay", 116: "Cucumber", 117: "Balloon", 118: "Kite", 119: "Fireplace", 120: "Lantern", 121: "Missile", 122: "Book", 123: "Spoon", 124: "Grapefruit", 125: "Squirrel", 126: "Orange", 127: "Coat", 128: "Punching bag", 129: "Zebra", 130: "Billboard", 131: "Bicycle", 132: "Door handle", 133: "Mechanical fan", 134: "Ring binder", 135: "Table", 136: "Parrot", 137: "Sock", 138: "Vase", 139: "Weapon", 140: "Shotgun", 141: "Glasses", 142: "Seahorse", 143: "Belt", 144: "Watercraft", 145: "Window", 146: "Giraffe", 147: "Lion", 148: "Tire", 149: "Vehicle", 150: "Canoe", 151: "Tie", 152: "Shelf", 153: "Picture frame", 154: "Printer", 155: "Human leg", 156: "Boat", 157: "Slow cooker", 158: "Croissant", 159: "Candle", 160: "Pancake", 161: 
"Pillow", 162: "Coin", 163: "Stretcher", 164: "Sandal", 165: "Woman", 166: "Stairs", 167: "Harpsichord", 168: "Stool", 169: "Bus", 170: "Suitcase", 171: "Human mouth", 172: "Juice", 173: "Skull", 174: "Door", 175: "Violin", 176: "Chopsticks", 177: "Digital clock", 178: "Sunflower", 179: "Leopard", 180: "Bell pepper", 181: "Harbor seal", 182: "Snake", 183: "Sewing machine", 184: "Goose", 185: "Helicopter", 186: "Seat belt", 187: "Coffee cup", 188: "Microwave oven", 189: "Hot dog", 190: "Countertop", 191: "Serving tray", 192: "Dog bed", 193: "Beer", 194: "Sunglasses", 195: "Golf ball", 196: "Waffle", 197: "Palm tree", 198: "Trumpet", 199: "Ruler", 200: "Helmet", 201: "Ladder", 202: "Office building", 203: "Tablet computer", 204: "Toilet paper", 205: "Pomegranate", 206: "Skirt", 207: "Gas stove", 208: "Cookie", 209: "Cart", 210: "Raven", 211: "Egg", 212: "Burrito", 213: "Goat", 214: "Kitchen knife", 215: "Skateboard", 216: "Salt and pepper shakers", 217: "Lynx", 218: "Boot", 219: "Platter", 220: "Ski", 221: "Swimwear", 222: "Swimming pool", 223: "Drinking straw", 224: "Wrench", 225: "Drum", 226: "Ant", 227: "Human ear", 228: "Headphones", 229: "Fountain", 230: "Bird", 231: "Jeans", 232: "Television", 233: "Crab", 234: "Microphone", 235: "Home appliance", 236: "Snowplow", 237: "Beetle", 238: "Artichoke", 239: "Jet ski", 240: "Stationary bicycle", 241: "Human hair", 242: "Brown bear", 243: "Starfish", 244: "Fork", 245: "Lobster", 246: "Corded phone", 247: "Drink", 248: "Saucer", 249: "Carrot", 250: "Insect", 251: "Clock", 252: "Castle", 253: "Tennis racket", 254: "Ceiling fan", 255: "Asparagus", 256: "Jaguar", 257: "Musical instrument", 258: "Train", 259: "Cat", 260: "Rifle", 261: "Dumbbell", 262: "Mobile phone", 263: "Taxi", 264: "Shower", 265: "Pitcher", 266: "Lemon", 267: "Invertebrate", 268: "Turkey", 269: "High heels", 270: "Bust", 271: "Elephant", 272: "Scarf", 273: "Barrel", 274: "Trombone", 275: "Pumpkin", 276: "Box", 277: "Tomato", 278: "Frog", 279: "Bidet", 280: "Human face", 281: "Houseplant", 282: "Van", 283: "Shark", 284: "Ice cream", 285: "Swim cap", 286: "Falcon", 287: "Ostrich", 288: "Handgun", 289: "Whiteboard", 290: "Lizard", 291: "Pasta", 292: "Snowmobile", 293: "Light bulb", 294: "Window blind", 295: "Muffin", 296: "Pretzel", 297: "Computer monitor", 298: "Horn", 299: "Furniture", 300: "Sandwich", 301: "Fox", 302: "Convenience store", 303: "Fish", 304: "Fruit", 305: "Earrings", 306: "Curtain", 307: "Grape", 308: "Sofa bed", 309: "Horse", 310: "Luggage and bags", 311: "Desk", 312: "Crutch", 313: "Bicycle helmet", 314: "Tick", 315: "Airplane", 316: "Canary", 317: "Spatula", 318: "Watch", 319: "Lily", 320: "Kitchen appliance", 321: "Filing cabinet", 322: "Aircraft", 323: "Cake stand", 324: "Candy", 325: "Sink", 326: "Mouse", 327: "Wine", 328: "Wheelchair", 329: "Goldfish", 330: "Refrigerator", 331: "French fries", 332: "Drawer", 333: "Treadmill", 334: "Picnic basket", 335: "Dice", 336: "Cabbage", 337: "Football helmet", 338: "Pig", 339: "Person", 340: "Shorts", 341: "Gondola", 342: "Honeycomb", 343: "Doughnut", 344: "Chest of drawers", 345: "Land vehicle", 346: "Bat", 347: "Monkey", 348: "Dagger", 349: "Tableware", 350: "Human foot", 351: "Mug", 352: "Alarm clock", 353: "Pressure cooker", 354: "Human hand", 355: "Tortoise", 356: "Baseball glove", 357: "Sword", 358: "Pear", 359: "Miniskirt", 360: "Traffic sign", 361: "Girl", 362: "Roller skates", 363: "Dinosaur", 364: "Porch", 365: "Human beard", 366: "Submarine sandwich", 367: "Screwdriver", 368: "Strawberry", 369: "Wine 
glass", 370: "Seafood", 371: "Racket", 372: "Wheel", 373: "Sea lion", 374: "Toy", 375: "Tea", 376: "Tennis ball", 377: "Waste container", 378: "Mule", 379: "Cricket ball", 380: "Pineapple", 381: "Coconut", 382: "Doll", 383: "Coffee table", 384: "Snowman", 385: "Lavender", 386: "Shrimp", 387: "Maple", 388: "Cowboy hat", 389: "Goggles", 390: "Rugby ball", 391: "Caterpillar", 392: "Poster", 393: "Rocket", 394: "Organ", 395: "Saxophone", 396: "Traffic light", 397: "Cocktail", 398: "Plastic bag", 399: "Squash", 400: "Mushroom", 401: "Hamburger", 402: "Light switch", 403: "Parachute", 404: "Teddy bear", 405: "Winter melon", 406: "Deer", 407: "Musical keyboard", 408: "Plumbing fixture", 409: "Scoreboard", 410: "Baseball bat", 411: "Envelope", 412: "Adhesive tape", 413: "Briefcase", 414: "Paddle", 415: "Bow and arrow", 416: "Telephone", 417: "Sheep", 418: "Jacket", 419: "Boy", 420: "Pizza", 421: "Otter", 422: "Office supplies", 423: "Couch", 424: "Cello", 425: "Bull", 426: "Camel", 427: "Ball", 428: "Duck", 429: "Whale", 430: "Shirt", 431: "Tank", 432: "Motorcycle", 433: "Accordion", 434: "Owl", 435: "Porcupine", 436: "Sun hat", 437: "Nail", 438: "Scissors", 439: "Swan", 440: "Lamp", 441: "Crown", 442: "Piano", 443: "Sculpture", 444: "Cheetah", 445: "Oboe", 446: "Tin can", 447: "Mango", 448: "Tripod", 449: "Oven", 450: "Mouse", 451: "Barge", 452: "Coffee", 453: "Snowboard", 454: "Common fig", 455: "Salad", 456: "Marine invertebrates", 457: "Umbrella", 458: "Kangaroo", 459: "Human arm", 460: "Measuring cup", 461: "Snail", 462: "Loveseat", 463: "Suit", 464: "Teapot", 465: "Bottle", 466: "Alpaca", 467: "Kettle", 468: "Trousers", 469: "Popcorn", 470: "Centipede", 471: "Spider", 472: "Sparrow", 473: "Plate", 474: "Bagel", 475: "Personal care", 476: "Apple", 477: "Brassiere", 478: "Bathroom cabinet", 479: "studio couch", 480: "Computer keyboard", 481: "Table tennis racket", 482: "Sushi", 483: "Cabinetry", 484: "Street light", 485: "Towel", 486: "Nightstand", 487: "Rabbit", 488: "Dolphin", 489: "Dog", 490: "Jug", 491: "Wok", 492: "Fire hydrant", 493: "Human eye", 494: "Skyscraper", 495: "Backpack", 496: "Potato", 497: "Paper towel", 498: "Lifejacket", 499: "Bicycle wheel", 500: "Toilet", } return clsid2catid, catid2name def _visdrone_category(): clsid2catid = {i: i for i in range(10)} catid2name = { 0: 'pedestrian', 1: 'people', 2: 'bicycle', 3: 'car', 4: 'van', 5: 'truck', 6: 'tricycle', 7: 'awning-tricycle', 8: 'bus', 9: 'motor' } return clsid2catid, catid2name ================================================ FILE: ppdet/data/source/coco.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import os import copy try: from collections.abc import Sequence except Exception: from collections import Sequence import numpy as np from ppdet.core.workspace import register, serializable from .dataset import DetDataset from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'COCODataSet', 'SlicedCOCODataSet', 'SemiCOCODataSet', 'COCODetDataset', 'COCOInstSegDataset' ] @register @serializable class COCODataSet(DetDataset): """ Load dataset with COCO format. Args: dataset_dir (str): root directory for dataset. image_dir (str): directory for images. anno_path (str): coco annotation file path. data_fields (list): key name of data dictionary, at least have 'image'. sample_num (int): number of samples to load, -1 means all. load_crowd (bool): whether to load crowded ground-truth. False as default allow_empty (bool): whether to load empty entry. False as default empty_ratio (float): the ratio of empty record number to total record's, if empty_ratio is out of [0. ,1.), do not sample the records and use all the empty entries. 1. as default repeat (int): repeat times for dataset, use in benchmark. """ def __init__(self, dataset_dir=None, image_dir=None, anno_path=None, data_fields=['image'], sample_num=-1, load_crowd=False, allow_empty=False, empty_ratio=1., repeat=1): super(COCODataSet, self).__init__( dataset_dir, image_dir, anno_path, data_fields, sample_num, repeat=repeat) self.load_image_only = False self.load_semantic = False self.load_crowd = load_crowd self.allow_empty = allow_empty self.empty_ratio = empty_ratio def _sample_empty(self, records, num): # if empty_ratio is out of [0. ,1.), do not sample the records if self.empty_ratio < 0. or self.empty_ratio >= 1.: return records import random sample_num = min( int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records)) records = random.sample(records, sample_num) return records def parse_dataset(self): anno_path = os.path.join(self.dataset_dir, self.anno_path) image_dir = os.path.join(self.dataset_dir, self.image_dir) assert anno_path.endswith('.json'), \ 'invalid coco annotation file: ' + anno_path from pycocotools.coco import COCO coco = COCO(anno_path) img_ids = coco.getImgIds() img_ids.sort() cat_ids = coco.getCatIds() records = [] empty_records = [] ct = 0 self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) self.cname2cid = dict({ coco.loadCats(catid)[0]['name']: clsid for catid, clsid in self.catid2clsid.items() }) if 'annotations' not in coco.dataset: self.load_image_only = True logger.warning('Annotation file: {} does not contains ground truth ' 'and load image information only.'.format(anno_path)) for img_id in img_ids: img_anno = coco.loadImgs([img_id])[0] im_fname = img_anno['file_name'] im_w = float(img_anno['width']) im_h = float(img_anno['height']) im_path = os.path.join(image_dir, im_fname) if image_dir else im_fname is_empty = False if not os.path.exists(im_path): logger.warning('Illegal image file: {}, and it will be ' 'ignored'.format(im_path)) continue if im_w < 0 or im_h < 0: logger.warning('Illegal width: {} or height: {} in annotation, ' 'and im_id: {} will be ignored'.format( im_w, im_h, img_id)) continue coco_rec = { 'im_file': im_path, 'im_id': np.array([img_id]), 'h': im_h, 'w': im_w, } if 'image' in self.data_fields else {} if not self.load_image_only: ins_anno_ids = coco.getAnnIds( imgIds=[img_id], iscrowd=None if self.load_crowd else False) instances = coco.loadAnns(ins_anno_ids) bboxes = [] is_rbox_anno = False for inst in instances: # check gt bbox 
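# Annotation (descriptive comments, not in the original source): each raw
# COCO instance is sanitized before use -- instances flagged 'ignore',
# missing a 'bbox', or with an all-zero box are skipped; surviving
# [x, y, w, h] boxes are converted to [x1, y1, x2, y2] and kept only if
# area > 0 and both sides exceed eps = 1e-5, stored rounded to three
# decimals under 'clean_bbox'; anything else is logged as invalid.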
if inst.get('ignore', False): continue if 'bbox' not in inst.keys(): continue else: if not any(np.array(inst['bbox'])): continue x1, y1, box_w, box_h = inst['bbox'] x2 = x1 + box_w y2 = y1 + box_h eps = 1e-5 if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps: inst['clean_bbox'] = [ round(float(x), 3) for x in [x1, y1, x2, y2] ] bboxes.append(inst) else: logger.warning( 'Found an invalid bbox in annotations: im_id: {}, ' 'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format( img_id, float(inst['area']), x1, y1, x2, y2)) num_bbox = len(bboxes) if num_bbox <= 0 and not self.allow_empty: continue elif num_bbox <= 0: is_empty = True gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) gt_class = np.zeros((num_bbox, 1), dtype=np.int32) is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) gt_poly = [None] * num_bbox gt_track_id = -np.ones((num_bbox, 1), dtype=np.int32) has_segmentation = False has_track_id = False for i, box in enumerate(bboxes): catid = box['category_id'] gt_class[i][0] = self.catid2clsid[catid] gt_bbox[i, :] = box['clean_bbox'] is_crowd[i][0] = box['iscrowd'] # check RLE format if 'segmentation' in box and box['iscrowd'] == 1: gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] elif 'segmentation' in box and box['segmentation']: if not np.array( box['segmentation'], dtype=object).size > 0 and not self.allow_empty: bboxes.pop(i) gt_poly.pop(i) np.delete(is_crowd, i) np.delete(gt_class, i) np.delete(gt_bbox, i) else: gt_poly[i] = box['segmentation'] has_segmentation = True if 'track_id' in box: gt_track_id[i][0] = box['track_id'] has_track_id = True if has_segmentation and not any( gt_poly) and not self.allow_empty: continue gt_rec = { 'is_crowd': is_crowd, 'gt_class': gt_class, 'gt_bbox': gt_bbox, 'gt_poly': gt_poly, } if has_track_id: gt_rec.update({'gt_track_id': gt_track_id}) for k, v in gt_rec.items(): if k in self.data_fields: coco_rec[k] = v # TODO: remove load_semantic if self.load_semantic and 'semantic' in self.data_fields: seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps', 'train2017', im_fname[:-3] + 'png') coco_rec.update({'semantic': seg_path}) logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format( im_path, img_id, im_h, im_w)) if is_empty: empty_records.append(coco_rec) else: records.append(coco_rec) ct += 1 if self.sample_num > 0 and ct >= self.sample_num: break assert ct > 0, 'not found any coco record in %s' % (anno_path) logger.info('Load [{} samples valid, {} samples invalid] in file {}.'. 
format(ct, len(img_ids) - ct, anno_path)) if self.allow_empty and len(empty_records) > 0: empty_records = self._sample_empty(empty_records, len(records)) records += empty_records self.roidbs = records @register @serializable class SlicedCOCODataSet(COCODataSet): """Sliced COCODataSet""" def __init__( self, dataset_dir=None, image_dir=None, anno_path=None, data_fields=['image'], sample_num=-1, load_crowd=False, allow_empty=False, empty_ratio=1., repeat=1, sliced_size=[640, 640], overlap_ratio=[0.25, 0.25], ): super(SlicedCOCODataSet, self).__init__( dataset_dir=dataset_dir, image_dir=image_dir, anno_path=anno_path, data_fields=data_fields, sample_num=sample_num, load_crowd=load_crowd, allow_empty=allow_empty, empty_ratio=empty_ratio, repeat=repeat, ) self.sliced_size = sliced_size self.overlap_ratio = overlap_ratio def parse_dataset(self): anno_path = os.path.join(self.dataset_dir, self.anno_path) image_dir = os.path.join(self.dataset_dir, self.image_dir) assert anno_path.endswith('.json'), \ 'invalid coco annotation file: ' + anno_path from pycocotools.coco import COCO coco = COCO(anno_path) img_ids = coco.getImgIds() img_ids.sort() cat_ids = coco.getCatIds() records = [] empty_records = [] ct = 0 ct_sub = 0 self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) self.cname2cid = dict({ coco.loadCats(catid)[0]['name']: clsid for catid, clsid in self.catid2clsid.items() }) if 'annotations' not in coco.dataset: self.load_image_only = True logger.warning('Annotation file: {} does not contains ground truth ' 'and load image information only.'.format(anno_path)) try: import sahi from sahi.slicing import slice_image except Exception as e: logger.error( 'sahi not found, plaese install sahi. ' 'for example: `pip install sahi`, see https://github.com/obss/sahi.' 
) raise e sub_img_ids = 0 for img_id in img_ids: img_anno = coco.loadImgs([img_id])[0] im_fname = img_anno['file_name'] im_w = float(img_anno['width']) im_h = float(img_anno['height']) im_path = os.path.join(image_dir, im_fname) if image_dir else im_fname is_empty = False if not os.path.exists(im_path): logger.warning('Illegal image file: {}, and it will be ' 'ignored'.format(im_path)) continue if im_w < 0 or im_h < 0: logger.warning('Illegal width: {} or height: {} in annotation, ' 'and im_id: {} will be ignored'.format( im_w, im_h, img_id)) continue slice_image_result = sahi.slicing.slice_image( image=im_path, slice_height=self.sliced_size[0], slice_width=self.sliced_size[1], overlap_height_ratio=self.overlap_ratio[0], overlap_width_ratio=self.overlap_ratio[1]) sub_img_num = len(slice_image_result) for _ind in range(sub_img_num): im = slice_image_result.images[_ind] coco_rec = { 'image': im, 'im_id': np.array([sub_img_ids + _ind]), 'h': im.shape[0], 'w': im.shape[1], 'ori_im_id': np.array([img_id]), 'st_pix': np.array( slice_image_result.starting_pixels[_ind], dtype=np.float32), 'is_last': 1 if _ind == sub_img_num - 1 else 0, } if 'image' in self.data_fields else {} records.append(coco_rec) ct_sub += sub_img_num ct += 1 if self.sample_num > 0 and ct >= self.sample_num: break assert ct > 0, 'not found any coco record in %s' % (anno_path) logger.info('{} samples and slice to {} sub_samples in file {}'.format( ct, ct_sub, anno_path)) if self.allow_empty and len(empty_records) > 0: empty_records = self._sample_empty(empty_records, len(records)) records += empty_records self.roidbs = records @register @serializable class SemiCOCODataSet(COCODataSet): """Semi-COCODataSet used for supervised and unsupervised dataSet""" def __init__(self, dataset_dir=None, image_dir=None, anno_path=None, data_fields=['image'], sample_num=-1, load_crowd=False, allow_empty=False, empty_ratio=1., repeat=1, supervised=True): super(SemiCOCODataSet, self).__init__( dataset_dir, image_dir, anno_path, data_fields, sample_num, load_crowd, allow_empty, empty_ratio, repeat) self.supervised = supervised self.length = -1 # defalut -1 means all def parse_dataset(self): anno_path = os.path.join(self.dataset_dir, self.anno_path) image_dir = os.path.join(self.dataset_dir, self.image_dir) assert anno_path.endswith('.json'), \ 'invalid coco annotation file: ' + anno_path from pycocotools.coco import COCO coco = COCO(anno_path) img_ids = coco.getImgIds() img_ids.sort() cat_ids = coco.getCatIds() records = [] empty_records = [] ct = 0 self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) self.cname2cid = dict({ coco.loadCats(catid)[0]['name']: clsid for catid, clsid in self.catid2clsid.items() }) if 'annotations' not in coco.dataset or self.supervised == False: self.load_image_only = True logger.warning('Annotation file: {} does not contains ground truth ' 'and load image information only.'.format(anno_path)) for img_id in img_ids: img_anno = coco.loadImgs([img_id])[0] im_fname = img_anno['file_name'] im_w = float(img_anno['width']) im_h = float(img_anno['height']) im_path = os.path.join(image_dir, im_fname) if image_dir else im_fname is_empty = False if not os.path.exists(im_path): logger.warning('Illegal image file: {}, and it will be ' 'ignored'.format(im_path)) continue if im_w < 0 or im_h < 0: logger.warning('Illegal width: {} or height: {} in annotation, ' 'and im_id: {} will be ignored'.format( im_w, im_h, img_id)) continue coco_rec = { 'im_file': im_path, 'im_id': np.array([img_id]), 'h': im_h, 'w': im_w, } 
if 'image' in self.data_fields else {} if not self.load_image_only: ins_anno_ids = coco.getAnnIds( imgIds=[img_id], iscrowd=None if self.load_crowd else False) instances = coco.loadAnns(ins_anno_ids) bboxes = [] is_rbox_anno = False for inst in instances: # check gt bbox if inst.get('ignore', False): continue if 'bbox' not in inst.keys(): continue else: if not any(np.array(inst['bbox'])): continue x1, y1, box_w, box_h = inst['bbox'] x2 = x1 + box_w y2 = y1 + box_h eps = 1e-5 if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps: inst['clean_bbox'] = [ round(float(x), 3) for x in [x1, y1, x2, y2] ] bboxes.append(inst) else: logger.warning( 'Found an invalid bbox in annotations: im_id: {}, ' 'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format( img_id, float(inst['area']), x1, y1, x2, y2)) num_bbox = len(bboxes) if num_bbox <= 0 and not self.allow_empty: continue elif num_bbox <= 0: is_empty = True gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) gt_class = np.zeros((num_bbox, 1), dtype=np.int32) is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) gt_poly = [None] * num_bbox has_segmentation = False for i, box in enumerate(bboxes): catid = box['category_id'] gt_class[i][0] = self.catid2clsid[catid] gt_bbox[i, :] = box['clean_bbox'] is_crowd[i][0] = box['iscrowd'] # check RLE format if 'segmentation' in box and box['iscrowd'] == 1: gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] elif 'segmentation' in box and box['segmentation']: if not np.array(box['segmentation'] ).size > 0 and not self.allow_empty: bboxes.pop(i) gt_poly.pop(i) np.delete(is_crowd, i) np.delete(gt_class, i) np.delete(gt_bbox, i) else: gt_poly[i] = box['segmentation'] has_segmentation = True if has_segmentation and not any( gt_poly) and not self.allow_empty: continue gt_rec = { 'is_crowd': is_crowd, 'gt_class': gt_class, 'gt_bbox': gt_bbox, 'gt_poly': gt_poly, } for k, v in gt_rec.items(): if k in self.data_fields: coco_rec[k] = v # TODO: remove load_semantic if self.load_semantic and 'semantic' in self.data_fields: seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps', 'train2017', im_fname[:-3] + 'png') coco_rec.update({'semantic': seg_path}) logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format( im_path, img_id, im_h, im_w)) if is_empty: empty_records.append(coco_rec) else: records.append(coco_rec) ct += 1 if self.sample_num > 0 and ct >= self.sample_num: break assert ct > 0, 'not found any coco record in %s' % (anno_path) logger.info('Load [{} samples valid, {} samples invalid] in file {}.'. 
format(ct, len(img_ids) - ct, anno_path)) if self.allow_empty and len(empty_records) > 0: empty_records = self._sample_empty(empty_records, len(records)) records += empty_records self.roidbs = records if self.supervised: logger.info(f'Use {len(self.roidbs)} sup_samples data as LABELED') else: if self.length > 0: # unsup length will be decide by sup length all_roidbs = self.roidbs.copy() selected_idxs = [ np.random.choice(len(all_roidbs)) for _ in range(self.length) ] self.roidbs = [all_roidbs[i] for i in selected_idxs] logger.info( f'Use {len(self.roidbs)} unsup_samples data as UNLABELED') def __getitem__(self, idx): n = len(self.roidbs) if self.repeat > 1: idx %= n # data batch roidb = copy.deepcopy(self.roidbs[idx]) if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch: idx = np.random.randint(n) roidb = [roidb, copy.deepcopy(self.roidbs[idx])] elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch: idx = np.random.randint(n) roidb = [roidb, copy.deepcopy(self.roidbs[idx])] elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch: roidb = [roidb, ] + [ copy.deepcopy(self.roidbs[np.random.randint(n)]) for _ in range(4) ] if isinstance(roidb, Sequence): for r in roidb: r['curr_iter'] = self._curr_iter else: roidb['curr_iter'] = self._curr_iter self._curr_iter += 1 return self.transform(roidb) # for PaddleX @register @serializable class COCODetDataset(COCODataSet): pass # for PaddleX @register @serializable class COCOInstSegDataset(COCODataSet): pass ================================================ FILE: ppdet/data/source/culane.py ================================================ from ppdet.core.workspace import register, serializable import cv2 import os import tarfile import numpy as np import os.path as osp from ppdet.data.source.dataset import DetDataset from imgaug.augmentables.lines import LineStringsOnImage from imgaug.augmentables.segmaps import SegmentationMapsOnImage from ppdet.data.culane_utils import lane_to_linestrings import pickle as pkl from ppdet.utils.logger import setup_logger try: from collections.abc import Sequence except Exception: from collections import Sequence from .dataset import DetDataset, _make_dataset, _is_valid_file from ppdet.utils.download import download_dataset logger = setup_logger(__name__) @register @serializable class CULaneDataSet(DetDataset): def __init__( self, dataset_dir, cut_height, list_path, split='train', data_fields=['image'], video_file=None, frame_rate=-1, ): super(CULaneDataSet, self).__init__( dataset_dir=dataset_dir, cut_height=cut_height, split=split, data_fields=data_fields) self.dataset_dir = dataset_dir self.list_path = osp.join(dataset_dir, list_path) self.cut_height = cut_height self.data_fields = data_fields self.split = split self.training = 'train' in split self.data_infos = [] self.video_file = video_file self.frame_rate = frame_rate self._imid2path = {} self.predict_dir = None def __len__(self): return len(self.data_infos) def check_or_download_dataset(self): if not osp.exists(self.dataset_dir): download_dataset("dataset", dataset="culane") # extract .tar files in self.dataset_dir for fname in os.listdir(self.dataset_dir): logger.info("Decompressing {}...".format(fname)) # ignore .* files if fname.startswith('.'): continue if fname.find('.tar.gz') >= 0: with tarfile.open(osp.join(self.dataset_dir, fname)) as tf: tf.extractall(path=self.dataset_dir) logger.info("Dataset files are ready.") def parse_dataset(self): logger.info('Loading CULane annotations...') if self.predict_dir is not None: 
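# Annotation (not in the original source): in predict mode the list file is
# never read; data_infos is filled later through set_images(). Otherwise the
# parsed annotations are cached to cache/culane_paddle_<split>.pkl below, so
# later runs skip the slow per-image *.lines.txt parsing.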
logger.info('switch to predict mode') return # Waiting for the dataset to load is tedious, let's cache it os.makedirs('cache', exist_ok=True) cache_path = 'cache/culane_paddle_{}.pkl'.format(self.split) if os.path.exists(cache_path): with open(cache_path, 'rb') as cache_file: self.data_infos = pkl.load(cache_file) self.max_lanes = max( len(anno['lanes']) for anno in self.data_infos) return with open(self.list_path) as list_file: for line in list_file: infos = self.load_annotation(line.split()) self.data_infos.append(infos) # cache data infos to file with open(cache_path, 'wb') as cache_file: pkl.dump(self.data_infos, cache_file) def load_annotation(self, line): infos = {} img_line = line[0] img_line = img_line[1 if img_line[0] == '/' else 0::] img_path = os.path.join(self.dataset_dir, img_line) infos['img_name'] = img_line infos['img_path'] = img_path if len(line) > 1: mask_line = line[1] mask_line = mask_line[1 if mask_line[0] == '/' else 0::] mask_path = os.path.join(self.dataset_dir, mask_line) infos['mask_path'] = mask_path if len(line) > 2: exist_list = [int(l) for l in line[2:]] infos['lane_exist'] = np.array(exist_list) anno_path = img_path[: -3] + 'lines.txt' # strip the 'jpg' suffix and append 'lines.txt' with open(anno_path, 'r') as anno_file: data = [ list(map(float, line.split())) for line in anno_file.readlines() ] lanes = [[(lane[i], lane[i + 1]) for i in range(0, len(lane), 2) if lane[i] >= 0 and lane[i + 1] >= 0] for lane in data] lanes = [list(set(lane)) for lane in lanes] # remove duplicated points lanes = [lane for lane in lanes if len(lane) > 2] # drop lanes with fewer than 3 points lanes = [sorted( lane, key=lambda x: x[1]) for lane in lanes] # sort by y infos['lanes'] = lanes return infos def set_images(self, images): self.predict_dir = images self.data_infos = self._load_images() def _find_images(self): predict_dir = self.predict_dir if not isinstance(predict_dir, Sequence): predict_dir = [predict_dir] images = [] for im_dir in predict_dir: if os.path.isdir(im_dir): im_dir = os.path.join(self.predict_dir, im_dir) images.extend(_make_dataset(im_dir)) elif os.path.isfile(im_dir) and _is_valid_file(im_dir): images.append(im_dir) return images def _load_images(self): images = self._find_images() ct = 0 records = [] for image in images: assert image != '' and os.path.isfile(image), \ "Image {} not found".format(image) if self.sample_num > 0 and ct >= self.sample_num: break rec = { 'im_id': np.array([ct]), "img_path": os.path.abspath(image), "img_name": os.path.basename(image), "lanes": [] } self._imid2path[ct] = image ct += 1 records.append(rec) assert len(records) > 0, "No image file found" return records def get_imid2path(self): return self._imid2path def __getitem__(self, idx): data_info = self.data_infos[idx] img = cv2.imread(data_info['img_path']) img = img[self.cut_height:, :, :] sample = data_info.copy() sample.update({'image': img}) img_org = sample['image'] if self.training: label = cv2.imread(sample['mask_path'], cv2.IMREAD_UNCHANGED) if len(label.shape) > 2: label = label[:, :, 0] label = label.squeeze() label = label[self.cut_height:, :] sample.update({'mask': label}) if self.cut_height != 0: new_lanes = [] for i in sample['lanes']: lanes = [] for p in i: lanes.append((p[0], p[1] - self.cut_height)) new_lanes.append(lanes) sample.update({'lanes': new_lanes}) sample['mask'] = SegmentationMapsOnImage( sample['mask'], shape=img_org.shape) sample['full_img_path'] = data_info['img_path'] sample['img_name'] = data_info['img_name'] sample['im_id'] = np.array([idx])
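# Annotation (not in the original source): everything above is already in the
# cropped coordinate frame -- the top cut_height rows were removed from image
# and mask, and each lane point's y was reduced by cut_height. Below, lanes
# are wrapped as imgaug LineStringsOnImage so geometric augmentations
# transform image, mask and lane geometry consistently.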
sample['image'] = sample['image'].copy().astype(np.uint8) sample['lanes'] = lane_to_linestrings(sample['lanes']) sample['lanes'] = LineStringsOnImage( sample['lanes'], shape=img_org.shape) sample['seg'] = np.zeros(img_org.shape) return sample ================================================ FILE: ppdet/data/source/dataset.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import copy import numpy as np try: from collections.abc import Sequence except Exception: from collections import Sequence from pycocotools.coco import COCO from paddle.io import Dataset from ppdet.core.workspace import register, serializable from ppdet.utils.download import get_dataset_path from ppdet.data import source from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @serializable class DetDataset(Dataset): """ Load detection dataset. Args: dataset_dir (str): root directory for dataset. image_dir (str): directory for images. anno_path (str): annotation file path. data_fields (list): key name of data dictionary, at least have 'image'. sample_num (int): number of samples to load, -1 means all. use_default_label (bool): whether to load default label list. repeat (int): repeat times for dataset, use in benchmark. 
""" def __init__(self, dataset_dir=None, image_dir=None, anno_path=None, data_fields=['image'], sample_num=-1, use_default_label=None, repeat=1, **kwargs): super(DetDataset, self).__init__() self.dataset_dir = dataset_dir if dataset_dir is not None else '' self.anno_path = anno_path self.image_dir = image_dir if image_dir is not None else '' self.data_fields = data_fields self.sample_num = sample_num self.use_default_label = use_default_label self.repeat = repeat self._epoch = 0 self._curr_iter = 0 def __len__(self, ): return len(self.roidbs) * self.repeat def __call__(self, *args, **kwargs): return self def __getitem__(self, idx): n = len(self.roidbs) if self.repeat > 1: idx %= n # data batch roidb = copy.deepcopy(self.roidbs[idx]) if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch: idx = np.random.randint(n) roidb = [roidb, copy.deepcopy(self.roidbs[idx])] elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch: idx = np.random.randint(n) roidb = [roidb, copy.deepcopy(self.roidbs[idx])] elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch: roidb = [roidb, ] + [ copy.deepcopy(self.roidbs[np.random.randint(n)]) for _ in range(4) ] elif self.pre_img_epoch == 0 or self._epoch < self.pre_img_epoch: # Add previous image as input, only used in CenterTrack idx_pre_img = idx - 1 if idx_pre_img < 0: idx_pre_img = idx + 1 roidb = [roidb, ] + [copy.deepcopy(self.roidbs[idx_pre_img])] if isinstance(roidb, Sequence): for r in roidb: r['curr_iter'] = self._curr_iter r['curr_epoch'] = self._epoch else: roidb['curr_iter'] = self._curr_iter roidb['curr_epoch'] = self._epoch self._curr_iter += 1 if self.transform_schedulers: assert isinstance(self.transform_schedulers, list) if isinstance(roidb, Sequence): for r in roidb: r['transform_schedulers'] = self.transform_schedulers else: roidb['transform_schedulers'] = self.transform_schedulers return self.transform(roidb) def check_or_download_dataset(self): self.dataset_dir = get_dataset_path(self.dataset_dir, self.anno_path, self.image_dir) def set_kwargs(self, **kwargs): self.mixup_epoch = kwargs.get('mixup_epoch', -1) self.cutmix_epoch = kwargs.get('cutmix_epoch', -1) self.mosaic_epoch = kwargs.get('mosaic_epoch', -1) self.pre_img_epoch = kwargs.get('pre_img_epoch', -1) self.transform_schedulers = kwargs.get('transform_schedulers', None) def set_transform(self, transform): self.transform = transform def set_epoch(self, epoch_id): self._epoch = epoch_id def parse_dataset(self, ): raise NotImplementedError( "Need to implement parse_dataset method of Dataset") def get_anno(self): if self.anno_path is None: return return os.path.join(self.dataset_dir, self.anno_path) def _is_valid_file(f, extensions=('.jpg', '.jpeg', '.png', '.bmp')): return f.lower().endswith(extensions) def _make_dataset(dir): dir = os.path.expanduser(dir) if not os.path.isdir(dir): raise ('{} should be a dir'.format(dir)) images = [] for root, _, fnames in sorted(os.walk(dir, followlinks=True)): for fname in sorted(fnames): path = os.path.join(root, fname) if _is_valid_file(path): images.append(path) return images @register @serializable class ImageFolder(DetDataset): def __init__(self, dataset_dir=None, image_dir=None, anno_path=None, sample_num=-1, use_default_label=None, **kwargs): super(ImageFolder, self).__init__( dataset_dir, image_dir, anno_path, sample_num=sample_num, use_default_label=use_default_label) self._imid2path = {} self.roidbs = None self.sample_num = sample_num def check_or_download_dataset(self): return def get_anno(self): if self.anno_path is 
None: return if self.dataset_dir: return os.path.join(self.dataset_dir, self.anno_path) else: return self.anno_path def parse_dataset(self, ): if not self.roidbs: self.roidbs = self._load_images() def _parse(self): image_dir = self.image_dir if not isinstance(image_dir, Sequence): image_dir = [image_dir] images = [] for im_dir in image_dir: if os.path.isdir(im_dir): im_dir = os.path.join(self.dataset_dir, im_dir) images.extend(_make_dataset(im_dir)) elif os.path.isfile(im_dir) and _is_valid_file(im_dir): images.append(im_dir) return images def get_images(self): images_path = [] coco = COCO(os.path.join(self.dataset_dir, self.anno_path)) imgIds = coco.getImgIds(catIds=[]) for imgId in imgIds: filename = coco.loadImgs(imgId)[0]["file_name"] images_path.append(os.path.join(self.dataset_dir, self.image_dir, filename)) return images_path def _load_images(self, do_eval=False): images = self._parse() ct = 0 records = [] anno_file = self.get_anno() coco = COCO(anno_file) for image in images: assert image != '' and os.path.isfile(image), \ "Image {} not found".format(image) if self.sample_num > 0 and ct >= self.sample_num: break if do_eval: image_id = self.get_image_id(image, coco) ct = image_id rec = {'im_id': np.array([ct]), 'im_file': image} self._imid2path[ct] = image ct += 1 records.append(rec) assert len(records) > 0, "No image file found" return records def get_image_id(self, image, coco): image_ids = coco.getImgIds() for image_id in image_ids: img_info = coco.loadImgs(image_id)[0] if img_info['file_name'] in image: return image_id else: continue def get_imid2path(self): return self._imid2path def set_images(self, images, do_eval=False): self.image_dir = images self.roidbs = self._load_images(do_eval=do_eval) def set_slice_images(self, images, slice_size=[640, 640], overlap_ratio=[0.25, 0.25]): self.image_dir = images ori_records = self._load_images() try: import sahi from sahi.slicing import slice_image except Exception as e: logger.error( 'sahi not found, plaese install sahi. ' 'for example: `pip install sahi`, see https://github.com/obss/sahi.' 
) raise e sub_img_ids = 0 ct = 0 ct_sub = 0 records = [] for i, ori_rec in enumerate(ori_records): im_path = ori_rec['im_file'] slice_image_result = sahi.slicing.slice_image( image=im_path, slice_height=slice_size[0], slice_width=slice_size[1], overlap_height_ratio=overlap_ratio[0], overlap_width_ratio=overlap_ratio[1]) sub_img_num = len(slice_image_result) for _ind in range(sub_img_num): im = slice_image_result.images[_ind] rec = { 'image': im, 'im_id': np.array([sub_img_ids + _ind]), 'h': im.shape[0], 'w': im.shape[1], 'ori_im_id': np.array([ori_rec['im_id'][0]]), 'st_pix': np.array( slice_image_result.starting_pixels[_ind], dtype=np.float32), 'is_last': 1 if _ind == sub_img_num - 1 else 0, } if 'image' in self.data_fields else {} records.append(rec) ct_sub += sub_img_num ct += 1 logger.info('{} samples and slice to {} sub_samples.'.format(ct, ct_sub)) self.roidbs = records def get_label_list(self): # Only VOC dataset needs label list in ImageFold return self.anno_path @register class CommonDataset(object): def __init__(self, **dataset_args): super(CommonDataset, self).__init__() dataset_args = copy.deepcopy(dataset_args) type = dataset_args.pop("name") self.dataset = getattr(source, type)(**dataset_args) def __call__(self): return self.dataset @register class TrainDataset(CommonDataset): pass @register class EvalMOTDataset(CommonDataset): pass @register class TestMOTDataset(CommonDataset): pass @register class EvalDataset(CommonDataset): pass @register class TestDataset(CommonDataset): pass ================================================ FILE: ppdet/data/source/keypoint_coco.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ this code is base on https://github.com/open-mmlab/mmpose """ import os import cv2 import numpy as np import json import copy import pycocotools from pycocotools.coco import COCO from .dataset import DetDataset from ppdet.core.workspace import register, serializable @serializable class KeypointBottomUpBaseDataset(DetDataset): """Base class for bottom-up datasets. All datasets should subclass it. All subclasses should overwrite: Methods:`_get_imganno` Args: dataset_dir (str): Root path to the dataset. anno_path (str): Relative path to the annotation file. image_dir (str): Path to a directory where images are held. Default: None. num_joints (int): keypoint numbers transform (composed(operators)): A sequence of data transforms. shard (list): [rank, worldsize], the distributed env params test_mode (bool): Store True when building test or validation dataset. Default: False. 
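    Example (a minimal sketch of the subclassing contract;
    ``ToyBottomUpDataset`` is hypothetical)::

        class ToyBottomUpDataset(KeypointBottomUpBaseDataset):
            def _get_imganno(self, idx):
                # ``__getitem__`` reads 'image_file', converts BGR to RGB,
                # then applies ``self.transform`` to the record.
                return {'image_file': '/path/to/image.jpg', 'im_id': idx}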
""" def __init__(self, dataset_dir, image_dir, anno_path, num_joints, transform=[], shard=[0, 1], test_mode=False): super().__init__(dataset_dir, image_dir, anno_path) self.image_info = {} self.ann_info = {} self.img_prefix = os.path.join(dataset_dir, image_dir) self.transform = transform self.test_mode = test_mode self.ann_info['num_joints'] = num_joints self.img_ids = [] def parse_dataset(self): pass def __len__(self): """Get dataset length.""" return len(self.img_ids) def _get_imganno(self, idx): """Get anno for a single image.""" raise NotImplementedError def __getitem__(self, idx): """Prepare image for training given the index.""" records = copy.deepcopy(self._get_imganno(idx)) records['image'] = cv2.imread(records['image_file']) records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB) if 'mask' in records: records['mask'] = (records['mask'] + 0).astype('uint8') records = self.transform(records) return records def parse_dataset(self): return @register @serializable class KeypointBottomUpCocoDataset(KeypointBottomUpBaseDataset): """COCO dataset for bottom-up pose estimation. The dataset loads raw features and apply specified transforms to return a dict containing the image tensors and other information. COCO keypoint indexes:: 0: 'nose', 1: 'left_eye', 2: 'right_eye', 3: 'left_ear', 4: 'right_ear', 5: 'left_shoulder', 6: 'right_shoulder', 7: 'left_elbow', 8: 'right_elbow', 9: 'left_wrist', 10: 'right_wrist', 11: 'left_hip', 12: 'right_hip', 13: 'left_knee', 14: 'right_knee', 15: 'left_ankle', 16: 'right_ankle' Args: dataset_dir (str): Root path to the dataset. anno_path (str): Relative path to the annotation file. image_dir (str): Path to a directory where images are held. Default: None. num_joints (int): keypoint numbers transform (composed(operators)): A sequence of data transforms. shard (list): [rank, worldsize], the distributed env params test_mode (bool): Store True when building test or validation dataset. Default: False. """ def __init__(self, dataset_dir, image_dir, anno_path, num_joints, transform=[], shard=[0, 1], test_mode=False, return_mask=True, return_bbox=True, return_area=True, return_class=True): super().__init__(dataset_dir, image_dir, anno_path, num_joints, transform, shard, test_mode) self.ann_file = os.path.join(dataset_dir, anno_path) self.shard = shard self.test_mode = test_mode self.return_mask = return_mask self.return_bbox = return_bbox self.return_area = return_area self.return_class = return_class def parse_dataset(self): self.coco = COCO(self.ann_file) self.img_ids = self.coco.getImgIds() if not self.test_mode: self.img_ids_tmp = [] for img_id in self.img_ids: ann_ids = self.coco.getAnnIds(imgIds=img_id) anno = self.coco.loadAnns(ann_ids) anno = [obj for obj in anno if obj['iscrowd'] == 0] if len(anno) == 0: continue self.img_ids_tmp.append(img_id) self.img_ids = self.img_ids_tmp blocknum = int(len(self.img_ids) / self.shard[1]) self.img_ids = self.img_ids[(blocknum * self.shard[0]):(blocknum * ( self.shard[0] + 1))] self.num_images = len(self.img_ids) self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs) self.dataset_name = 'coco' cat_ids = self.coco.getCatIds() self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) print('=> num_images: {}'.format(self.num_images)) @staticmethod def _get_mapping_id_name(imgs): """ Args: imgs (dict): dict of image info. Returns: tuple: Image name & id mapping dicts. - id2name (dict): Mapping image id to name. - name2id (dict): Mapping image name to id. 
""" id2name = {} name2id = {} for image_id, image in imgs.items(): file_name = image['file_name'] id2name[image_id] = file_name name2id[file_name] = image_id return id2name, name2id def _get_imganno(self, idx): """Get anno for a single image. Args: idx (int): image idx Returns: dict: info for model training """ coco = self.coco img_id = self.img_ids[idx] ann_ids = coco.getAnnIds(imgIds=img_id) anno = coco.loadAnns(ann_ids) anno = [ obj for obj in anno if obj['iscrowd'] == 0 and obj['num_keypoints'] > 0 ] db_rec = {} joints, orgsize = self._get_joints(anno, idx) db_rec['gt_joints'] = joints db_rec['im_shape'] = orgsize if self.return_bbox: db_rec['gt_bbox'] = self._get_bboxs(anno, idx) if self.return_class: db_rec['gt_class'] = self._get_labels(anno, idx) if self.return_area: db_rec['gt_areas'] = self._get_areas(anno, idx) if self.return_mask: db_rec['mask'] = self._get_mask(anno, idx) db_rec['im_id'] = img_id db_rec['image_file'] = os.path.join(self.img_prefix, self.id2name[img_id]) return db_rec def _get_joints(self, anno, idx): """Get joints for all people in an image.""" num_people = len(anno) joints = np.zeros( (num_people, self.ann_info['num_joints'], 3), dtype=np.float32) for i, obj in enumerate(anno): joints[i, :self.ann_info['num_joints'], :3] = \ np.array(obj['keypoints']).reshape([-1, 3]) img_info = self.coco.loadImgs(self.img_ids[idx])[0] orgsize = np.array([img_info['height'], img_info['width'], 1]) return joints, orgsize def _get_bboxs(self, anno, idx): num_people = len(anno) gt_bboxes = np.zeros((num_people, 4), dtype=np.float32) for idx, obj in enumerate(anno): if 'bbox' in obj: gt_bboxes[idx, :] = obj['bbox'] gt_bboxes[:, 2] += gt_bboxes[:, 0] gt_bboxes[:, 3] += gt_bboxes[:, 1] return gt_bboxes def _get_labels(self, anno, idx): num_people = len(anno) gt_labels = np.zeros((num_people, 1), dtype=np.float32) for idx, obj in enumerate(anno): if 'category_id' in obj: catid = obj['category_id'] gt_labels[idx, 0] = self.catid2clsid[catid] return gt_labels def _get_areas(self, anno, idx): num_people = len(anno) gt_areas = np.zeros((num_people, ), dtype=np.float32) for idx, obj in enumerate(anno): if 'area' in obj: gt_areas[idx, ] = obj['area'] return gt_areas def _get_mask(self, anno, idx): """Get ignore masks to mask out losses.""" coco = self.coco img_info = coco.loadImgs(self.img_ids[idx])[0] m = np.zeros((img_info['height'], img_info['width']), dtype=np.float32) for obj in anno: if 'segmentation' in obj: if obj['iscrowd']: rle = pycocotools.mask.frPyObjects(obj['segmentation'], img_info['height'], img_info['width']) m += pycocotools.mask.decode(rle) elif obj['num_keypoints'] == 0: rles = pycocotools.mask.frPyObjects(obj['segmentation'], img_info['height'], img_info['width']) for rle in rles: m += pycocotools.mask.decode(rle) return m < 0.5 @register @serializable class KeypointBottomUpCrowdPoseDataset(KeypointBottomUpCocoDataset): """CrowdPose dataset for bottom-up pose estimation. The dataset loads raw features and apply specified transforms to return a dict containing the image tensors and other information. CrowdPose keypoint indexes:: 0: 'left_shoulder', 1: 'right_shoulder', 2: 'left_elbow', 3: 'right_elbow', 4: 'left_wrist', 5: 'right_wrist', 6: 'left_hip', 7: 'right_hip', 8: 'left_knee', 9: 'right_knee', 10: 'left_ankle', 11: 'right_ankle', 12: 'top_head', 13: 'neck' Args: dataset_dir (str): Root path to the dataset. anno_path (str): Relative path to the annotation file. image_dir (str): Path to a directory where images are held. Default: None. 
num_joints (int): keypoint numbers transform (composed(operators)): A sequence of data transforms. shard (list): [rank, worldsize], the distributed env params test_mode (bool): Store True when building test or validation dataset. Default: False. """ def __init__(self, dataset_dir, image_dir, anno_path, num_joints, transform=[], shard=[0, 1], test_mode=False): super().__init__(dataset_dir, image_dir, anno_path, num_joints, transform, shard, test_mode) self.ann_file = os.path.join(dataset_dir, anno_path) self.shard = shard self.test_mode = test_mode def parse_dataset(self): self.coco = COCO(self.ann_file) self.img_ids = self.coco.getImgIds() if not self.test_mode: self.img_ids = [ img_id for img_id in self.img_ids if len(self.coco.getAnnIds( imgIds=img_id, iscrowd=None)) > 0 ] blocknum = int(len(self.img_ids) / self.shard[1]) self.img_ids = self.img_ids[(blocknum * self.shard[0]):(blocknum * ( self.shard[0] + 1))] self.num_images = len(self.img_ids) self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs) self.dataset_name = 'crowdpose' print('=> num_images: {}'.format(self.num_images)) @serializable class KeypointTopDownBaseDataset(DetDataset): """Base class for top_down datasets. All datasets should subclass it. All subclasses should overwrite: Methods:`_get_db` Args: dataset_dir (str): Root path to the dataset. image_dir (str): Path to a directory where images are held. anno_path (str): Relative path to the annotation file. num_joints (int): keypoint numbers transform (composed(operators)): A sequence of data transforms. """ def __init__(self, dataset_dir, image_dir, anno_path, num_joints, transform=[]): super().__init__(dataset_dir, image_dir, anno_path) self.image_info = {} self.ann_info = {} self.img_prefix = os.path.join(dataset_dir, image_dir) self.transform = transform self.ann_info['num_joints'] = num_joints self.db = [] def __len__(self): """Get dataset length.""" return len(self.db) def _get_db(self): """Get a sample""" raise NotImplementedError def __getitem__(self, idx): """Prepare sample for training given the index.""" records = copy.deepcopy(self.db[idx]) records['image'] = cv2.imread(records['image_file'], cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION) records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB) records['score'] = records['score'] if 'score' in records else 1 records = self.transform(records) # print('records', records) return records @register @serializable class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset): """COCO dataset for top-down pose estimation. The dataset loads raw features and apply specified transforms to return a dict containing the image tensors and other information. COCO keypoint indexes: 0: 'nose', 1: 'left_eye', 2: 'right_eye', 3: 'left_ear', 4: 'right_ear', 5: 'left_shoulder', 6: 'right_shoulder', 7: 'left_elbow', 8: 'right_elbow', 9: 'left_wrist', 10: 'right_wrist', 11: 'left_hip', 12: 'right_hip', 13: 'left_knee', 14: 'right_knee', 15: 'left_ankle', 16: 'right_ankle' Args: dataset_dir (str): Root path to the dataset. image_dir (str): Path to a directory where images are held. anno_path (str): Relative path to the annotation file. num_joints (int): Keypoint numbers trainsize (list):[w, h] Image target size transform (composed(operators)): A sequence of data transforms. bbox_file (str): Path to a detection bbox file Default: None. use_gt_bbox (bool): Whether to use ground truth bbox Default: True. pixel_std (int): The pixel std of the scale Default: 200. 
image_thre (float): The threshold to filter the detection box Default: 0.0. """ def __init__(self, dataset_dir, image_dir, anno_path, num_joints, trainsize, transform=[], bbox_file=None, use_gt_bbox=True, pixel_std=200, image_thre=0.0, center_scale=None): super().__init__(dataset_dir, image_dir, anno_path, num_joints, transform) self.bbox_file = bbox_file self.use_gt_bbox = use_gt_bbox self.trainsize = trainsize self.pixel_std = pixel_std self.image_thre = image_thre self.center_scale = center_scale self.dataset_name = 'coco' def parse_dataset(self): if self.use_gt_bbox: self.db = self._load_coco_keypoint_annotations() else: self.db = self._load_coco_person_detection_results() def _load_coco_keypoint_annotations(self): coco = COCO(self.get_anno()) img_ids = coco.getImgIds() gt_db = [] for index in img_ids: im_ann = coco.loadImgs(index)[0] width = im_ann['width'] height = im_ann['height'] file_name = im_ann['file_name'] im_id = int(im_ann["id"]) annIds = coco.getAnnIds(imgIds=index, iscrowd=False) objs = coco.loadAnns(annIds) valid_objs = [] for obj in objs: x, y, w, h = obj['bbox'] x1 = np.max((0, x)) y1 = np.max((0, y)) x2 = np.min((width - 1, x1 + np.max((0, w - 1)))) y2 = np.min((height - 1, y1 + np.max((0, h - 1)))) if obj['area'] > 0 and x2 >= x1 and y2 >= y1: obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] valid_objs.append(obj) objs = valid_objs rec = [] for obj in objs: if max(obj['keypoints']) == 0: continue joints = np.zeros( (self.ann_info['num_joints'], 3), dtype=np.float32) joints_vis = np.zeros( (self.ann_info['num_joints'], 3), dtype=np.float32) for ipt in range(self.ann_info['num_joints']): joints[ipt, 0] = obj['keypoints'][ipt * 3 + 0] joints[ipt, 1] = obj['keypoints'][ipt * 3 + 1] joints[ipt, 2] = 0 t_vis = obj['keypoints'][ipt * 3 + 2] if t_vis > 1: t_vis = 1 joints_vis[ipt, 0] = t_vis joints_vis[ipt, 1] = t_vis joints_vis[ipt, 2] = 0 center, scale = self._box2cs(obj['clean_bbox'][:4]) rec.append({ 'image_file': os.path.join(self.img_prefix, file_name), 'center': center, 'scale': scale, 'gt_joints': joints, 'joints_vis': joints_vis, 'im_id': im_id, }) gt_db.extend(rec) return gt_db def _box2cs(self, box): x, y, w, h = box[:4] center = np.zeros((2), dtype=np.float32) center[0] = x + w * 0.5 center[1] = y + h * 0.5 aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1] if self.center_scale is not None and np.random.rand() < 0.3: center += self.center_scale * (np.random.rand(2) - 0.5) * [w, h] if w > aspect_ratio * h: h = w * 1.0 / aspect_ratio elif w < aspect_ratio * h: w = h * aspect_ratio scale = np.array( [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std], dtype=np.float32) if center[0] != -1: scale = scale * 1.25 return center, scale def _load_coco_person_detection_results(self): all_boxes = None bbox_file_path = os.path.join(self.dataset_dir, self.bbox_file) with open(bbox_file_path, 'r') as f: all_boxes = json.load(f) if not all_boxes: print('=> Load %s fail!' 
% bbox_file_path) return None kpt_db = [] for n_img in range(0, len(all_boxes)): det_res = all_boxes[n_img] if det_res['category_id'] != 1: continue file_name = det_res[ 'filename'] if 'filename' in det_res else '%012d.jpg' % det_res[ 'image_id'] img_name = os.path.join(self.img_prefix, file_name) box = det_res['bbox'] score = det_res['score'] im_id = int(det_res['image_id']) if score < self.image_thre: continue center, scale = self._box2cs(box) joints = np.zeros( (self.ann_info['num_joints'], 3), dtype=np.float32) joints_vis = np.ones( (self.ann_info['num_joints'], 3), dtype=np.float32) kpt_db.append({ 'image_file': img_name, 'im_id': im_id, 'center': center, 'scale': scale, 'score': score, 'gt_joints': joints, 'joints_vis': joints_vis, }) return kpt_db @register @serializable class KeypointTopDownCocoWholeBodyHandDataset(KeypointTopDownBaseDataset): """CocoWholeBody dataset for top-down hand pose estimation. The dataset loads raw features and apply specified transforms to return a dict containing the image tensors and other information. COCO-WholeBody Hand keypoint indexes: 0: 'wrist', 1: 'thumb1', 2: 'thumb2', 3: 'thumb3', 4: 'thumb4', 5: 'forefinger1', 6: 'forefinger2', 7: 'forefinger3', 8: 'forefinger4', 9: 'middle_finger1', 10: 'middle_finger2', 11: 'middle_finger3', 12: 'middle_finger4', 13: 'ring_finger1', 14: 'ring_finger2', 15: 'ring_finger3', 16: 'ring_finger4', 17: 'pinky_finger1', 18: 'pinky_finger2', 19: 'pinky_finger3', 20: 'pinky_finger4' Args: dataset_dir (str): Root path to the dataset. image_dir (str): Path to a directory where images are held. anno_path (str): Relative path to the annotation file. num_joints (int): Keypoint numbers trainsize (list):[w, h] Image target size transform (composed(operators)): A sequence of data transforms. pixel_std (int): The pixel std of the scale Default: 200. 
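    Example (a sketch with placeholder paths; ``num_joints=21`` matches the
    COCO-WholeBody hand keypoints listed above)::

        dataset = KeypointTopDownCocoWholeBodyHandDataset(
            dataset_dir='dataset/coco_wholebody',
            image_dir='train2017',
            anno_path='annotations/coco_wholebody_train.json',
            num_joints=21,
            trainsize=[256, 256])
        dataset.parse_dataset()   # fills ``dataset.db``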
""" def __init__(self, dataset_dir, image_dir, anno_path, num_joints, trainsize, transform=[], pixel_std=200): super().__init__(dataset_dir, image_dir, anno_path, num_joints, transform) self.trainsize = trainsize self.pixel_std = pixel_std self.dataset_name = 'coco_wholebady_hand' def _box2cs(self, box): x, y, w, h = box[:4] center = np.zeros((2), dtype=np.float32) center[0] = x + w * 0.5 center[1] = y + h * 0.5 aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1] if w > aspect_ratio * h: h = w * 1.0 / aspect_ratio elif w < aspect_ratio * h: w = h * aspect_ratio scale = np.array( [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std], dtype=np.float32) if center[0] != -1: scale = scale * 1.25 return center, scale def parse_dataset(self): gt_db = [] num_joints = self.ann_info['num_joints'] coco = COCO(self.get_anno()) img_ids = list(coco.imgs.keys()) for img_id in img_ids: im_ann = coco.loadImgs(img_id)[0] image_file = os.path.join(self.img_prefix, im_ann['file_name']) im_id = int(im_ann["id"]) ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False) objs = coco.loadAnns(ann_ids) for obj in objs: for type in ['left', 'right']: if (obj[f'{type}hand_valid'] and max(obj[f'{type}hand_kpts']) > 0): joints = np.zeros((num_joints, 3), dtype=np.float32) joints_vis = np.zeros((num_joints, 3), dtype=np.float32) keypoints = np.array(obj[f'{type}hand_kpts']) keypoints = keypoints.reshape(-1, 3) joints[:, :2] = keypoints[:, :2] joints_vis[:, :2] = np.minimum(1, keypoints[:, 2:3]) center, scale = self._box2cs(obj[f'{type}hand_box'][:4]) gt_db.append({ 'image_file': image_file, 'center': center, 'scale': scale, 'gt_joints': joints, 'joints_vis': joints_vis, 'im_id': im_id, }) self.db = gt_db @register @serializable class KeypointTopDownMPIIDataset(KeypointTopDownBaseDataset): """MPII dataset for topdown pose estimation. The dataset loads raw features and apply specified transforms to return a dict containing the image tensors and other information. MPII keypoint indexes:: 0: 'right_ankle', 1: 'right_knee', 2: 'right_hip', 3: 'left_hip', 4: 'left_knee', 5: 'left_ankle', 6: 'pelvis', 7: 'thorax', 8: 'upper_neck', 9: 'head_top', 10: 'right_wrist', 11: 'right_elbow', 12: 'right_shoulder', 13: 'left_shoulder', 14: 'left_elbow', 15: 'left_wrist', Args: dataset_dir (str): Root path to the dataset. image_dir (str): Path to a directory where images are held. anno_path (str): Relative path to the annotation file. num_joints (int): Keypoint numbers trainsize (list):[w, h] Image target size transform (composed(operators)): A sequence of data transforms. 
""" def __init__(self, dataset_dir, image_dir, anno_path, num_joints, transform=[]): super().__init__(dataset_dir, image_dir, anno_path, num_joints, transform) self.dataset_name = 'mpii' def parse_dataset(self): with open(self.get_anno()) as anno_file: anno = json.load(anno_file) gt_db = [] for a in anno: image_name = a['image'] im_id = a['image_id'] if 'image_id' in a else int( os.path.splitext(image_name)[0]) c = np.array(a['center'], dtype=np.float32) s = np.array([a['scale'], a['scale']], dtype=np.float32) # Adjust center/scale slightly to avoid cropping limbs if c[0] != -1: c[1] = c[1] + 15 * s[1] s = s * 1.25 c = c - 1 joints = np.zeros( (self.ann_info['num_joints'], 3), dtype=np.float32) joints_vis = np.zeros( (self.ann_info['num_joints'], 3), dtype=np.float32) if 'gt_joints' in a: joints_ = np.array(a['gt_joints']) joints_[:, 0:2] = joints_[:, 0:2] - 1 joints_vis_ = np.array(a['joints_vis']) assert len(joints_) == self.ann_info[ 'num_joints'], 'joint num diff: {} vs {}'.format( len(joints_), self.ann_info['num_joints']) joints[:, 0:2] = joints_[:, 0:2] joints_vis[:, 0] = joints_vis_[:] joints_vis[:, 1] = joints_vis_[:] gt_db.append({ 'image_file': os.path.join(self.img_prefix, image_name), 'im_id': im_id, 'center': c, 'scale': s, 'gt_joints': joints, 'joints_vis': joints_vis }) print("number length: {}".format(len(gt_db))) self.db = gt_db ================================================ FILE: ppdet/data/source/lvis.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import cv2 import copy try: from collections.abc import Sequence except Exception: from collections import Sequence import numpy as np from ppdet.core.workspace import register, serializable from .dataset import DetDataset from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'LVISDataSet', ] @register @serializable class LVISDataSet(DetDataset): """ Load dataset with LVISDataSet format. Args: dataset_dir (str): root directory for dataset. image_dir (str): directory for images. anno_path (str): coco annotation file path. data_fields (list): key name of data dictionary, at least have 'image'. sample_num (int): number of samples to load, -1 means all. load_crowd (bool): whether to load crowded ground-truth. False as default allow_empty (bool): whether to load empty entry. False as default empty_ratio (float): the ratio of empty record number to total record's, if empty_ratio is out of [0. ,1.), do not sample the records and use all the empty entries. 1. as default repeat (int): repeat times for dataset, use in benchmark. 
""" def __init__(self, dataset_dir=None, image_dir=None, anno_path=None, data_fields=['image'], sample_num=-1, load_crowd=False, allow_empty=False, empty_ratio=1., repeat=1): super(LVISDataSet, self).__init__( dataset_dir, image_dir, anno_path, data_fields, sample_num, repeat=repeat) self.load_image_only = False self.load_semantic = False self.load_crowd = load_crowd self.allow_empty = allow_empty self.empty_ratio = empty_ratio def _sample_empty(self, records, num): # if empty_ratio is out of [0. ,1.), do not sample the records if self.empty_ratio < 0. or self.empty_ratio >= 1.: return records import random sample_num = min( int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records)) records = random.sample(records, sample_num) return records def parse_dataset(self): anno_path = os.path.join(self.dataset_dir, self.anno_path) image_dir = os.path.join(self.dataset_dir, self.image_dir) assert anno_path.endswith('.json'), \ 'invalid coco annotation file: ' + anno_path from lvis import LVIS lvis_ = LVIS(anno_path) img_ids = lvis_.get_img_ids() img_ids.sort() cat_ids = lvis_.get_cat_ids() records = [] empty_records = [] ct = 0 self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) self.cname2cid = dict({ lvis_.load_cats([catid])[0]['name']: clsid for catid, clsid in self.catid2clsid.items() }) if 'annotations' not in lvis_.dataset: self.load_image_only = True logger.warning('Annotation file: {} does not contains ground truth ' 'and load image information only.'.format(anno_path)) for img_id in img_ids: img_anno = lvis_.load_imgs([img_id])[0] im_fname = img_anno['coco_url'].replace('http://images.cocodataset.org/', '') im_w = float(img_anno['width']) im_h = float(img_anno['height']) im_path = os.path.join(image_dir, im_fname) if image_dir else im_fname is_empty = False if not os.path.exists(im_path): logger.warning('Illegal image file: {}, and it will be ' 'ignored'.format(im_path)) continue if im_w < 0 or im_h < 0: logger.warning('Illegal width: {} or height: {} in annotation, ' 'and im_id: {} will be ignored'.format( im_w, im_h, img_id)) continue coco_rec = { 'im_file': im_path, 'im_id': np.array([img_id]), 'h': im_h, 'w': im_w, } if 'image' in self.data_fields else {} if not self.load_image_only: ins_anno_ids = lvis_.get_ann_ids(img_ids=[img_id]) instances = lvis_.load_anns(ins_anno_ids) bboxes = [] is_rbox_anno = False for inst in instances: # check gt bbox if inst.get('ignore', False): continue if 'bbox' not in inst.keys(): continue else: if not any(np.array(inst['bbox'])): continue x1, y1, box_w, box_h = inst['bbox'] x2 = x1 + box_w y2 = y1 + box_h eps = 1e-5 if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps: inst['clean_bbox'] = [ round(float(x), 3) for x in [x1, y1, x2, y2] ] bboxes.append(inst) else: logger.warning( 'Found an invalid bbox in annotations: im_id: {}, ' 'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format( img_id, float(inst['area']), x1, y1, x2, y2)) num_bbox = len(bboxes) if num_bbox <= 0 and not self.allow_empty: continue elif num_bbox <= 0: is_empty = True gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) gt_class = np.zeros((num_bbox, 1), dtype=np.int32) is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) gt_poly = [None] * num_bbox gt_track_id = -np.ones((num_bbox, 1), dtype=np.int32) has_segmentation = False has_track_id = False for i, box in enumerate(bboxes): catid = box['category_id'] gt_class[i][0] = self.catid2clsid[catid] gt_bbox[i, :] = box['clean_bbox'] # is_crowd[i][0] = box['iscrowd'] # check RLE format # if 'segmentation' in box 
and box['iscrowd'] == 1: # gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] # elif 'segmentation' in box and box['segmentation']: # if not np.array( # box['segmentation'], # dtype=object).size > 0 and not self.allow_empty: # bboxes.pop(i) # gt_poly.pop(i) # np.delete(is_crowd, i) # np.delete(gt_class, i) # np.delete(gt_bbox, i) # else: # gt_poly[i] = box['segmentation'] # has_segmentation = True if 'track_id' in box: gt_track_id[i][0] = box['track_id'] has_track_id = True if has_segmentation and not any( gt_poly) and not self.allow_empty: continue gt_rec = { 'is_crowd': is_crowd, 'gt_class': gt_class, 'gt_bbox': gt_bbox, 'gt_poly': gt_poly, } if has_track_id: gt_rec.update({'gt_track_id': gt_track_id}) for k, v in gt_rec.items(): if k in self.data_fields: coco_rec[k] = v # TODO: remove load_semantic if self.load_semantic and 'semantic' in self.data_fields: seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps', 'train2017', im_fname[:-3] + 'png') coco_rec.update({'semantic': seg_path}) logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format( im_path, img_id, im_h, im_w)) if is_empty: empty_records.append(coco_rec) else: records.append(coco_rec) ct += 1 if self.sample_num > 0 and ct >= self.sample_num: break assert ct > 0, 'not found any coco record in %s' % (anno_path) logger.info('Load [{} samples valid, {} samples invalid] in file {}.'. format(ct, len(img_ids) - ct, anno_path)) if self.allow_empty and len(empty_records) > 0: empty_records = self._sample_empty(empty_records, len(records)) records += empty_records self.roidbs = records ================================================ FILE: ppdet/data/source/mot.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import sys import cv2 import glob import numpy as np from collections import OrderedDict, defaultdict try: from collections.abc import Sequence except Exception: from collections import Sequence from .dataset import DetDataset, _make_dataset, _is_valid_file from ppdet.core.workspace import register, serializable from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @register @serializable class MOTDataSet(DetDataset): """ Load dataset with MOT format, only support single class MOT. Args: dataset_dir (str): root directory for dataset. image_lists (str|list): mot data image lists, muiti-source mot dataset. data_fields (list): key name of data dictionary, at least have 'image'. sample_num (int): number of samples to load, -1 means all. repeat (int): repeat times for dataset, use in benchmark. Notes: MOT datasets root directory following this: dataset/mot |——————image_lists | |——————caltech.train | |——————caltech.val | |——————mot16.train | |——————mot17.train | ...... |——————Caltech |——————MOT17 |——————...... All the MOT datasets have the following structure: Caltech |——————images | └——————00001.jpg | |—————— ... | └——————0000N.jpg └——————labels_with_ids └——————00001.txt |—————— ... 
└——————0000N.txt or MOT17 |——————images | └——————train | └——————test └——————labels_with_ids └——————train """ def __init__(self, dataset_dir=None, image_lists=[], data_fields=['image'], sample_num=-1, repeat=1): super(MOTDataSet, self).__init__( dataset_dir=dataset_dir, data_fields=data_fields, sample_num=sample_num, repeat=repeat) self.dataset_dir = dataset_dir self.image_lists = image_lists if isinstance(self.image_lists, str): self.image_lists = [self.image_lists] self.roidbs = None self.cname2cid = None def get_anno(self): if self.image_lists == []: return # only used to get categories and metric # only check first data, but the label_list of all data should be same. first_mot_data = self.image_lists[0].split('.')[0] anno_file = os.path.join(self.dataset_dir, first_mot_data, 'label_list.txt') return anno_file def parse_dataset(self): self.img_files = OrderedDict() self.img_start_index = OrderedDict() self.label_files = OrderedDict() self.tid_num = OrderedDict() self.tid_start_index = OrderedDict() img_index = 0 for data_name in self.image_lists: # check every data image list image_lists_dir = os.path.join(self.dataset_dir, 'image_lists') assert os.path.isdir(image_lists_dir), \ "The {} is not a directory.".format(image_lists_dir) list_path = os.path.join(image_lists_dir, data_name) assert os.path.exists(list_path), \ "The list path {} does not exist.".format(list_path) # record img_files, filter out empty ones with open(list_path, 'r') as file: self.img_files[data_name] = file.readlines() self.img_files[data_name] = [ os.path.join(self.dataset_dir, x.strip()) for x in self.img_files[data_name] ] self.img_files[data_name] = list( filter(lambda x: len(x) > 0, self.img_files[data_name])) self.img_start_index[data_name] = img_index img_index += len(self.img_files[data_name]) # record label_files self.label_files[data_name] = [ x.replace('images', 'labels_with_ids').replace( '.png', '.txt').replace('.jpg', '.txt') for x in self.img_files[data_name] ] for data_name, label_paths in self.label_files.items(): max_index = -1 for lp in label_paths: lb = np.loadtxt(lp) if len(lb) < 1: continue if len(lb.shape) < 2: img_max = lb[1] else: img_max = np.max(lb[:, 1]) if img_max > max_index: max_index = img_max self.tid_num[data_name] = int(max_index + 1) last_index = 0 for i, (k, v) in enumerate(self.tid_num.items()): self.tid_start_index[k] = last_index last_index += v self.num_identities_dict = defaultdict(int) self.num_identities_dict[0] = int(last_index + 1) # single class self.num_imgs_each_data = [len(x) for x in self.img_files.values()] self.total_imgs = sum(self.num_imgs_each_data) logger.info('MOT dataset summary: ') logger.info(self.tid_num) logger.info('Total images: {}'.format(self.total_imgs)) logger.info('Image start index: {}'.format(self.img_start_index)) logger.info('Total identities: {}'.format(self.num_identities_dict[0])) logger.info('Identity start index: {}'.format(self.tid_start_index)) records = [] cname2cid = mot_label() for img_index in range(self.total_imgs): for i, (k, v) in enumerate(self.img_start_index.items()): if img_index >= v: data_name = list(self.label_files.keys())[i] start_index = v img_file = self.img_files[data_name][img_index - start_index] lbl_file = self.label_files[data_name][img_index - start_index] if not os.path.exists(img_file): logger.warning('Illegal image file: {}, and it will be ignored'. format(img_file)) continue if not os.path.isfile(lbl_file): logger.warning('Illegal label file: {}, and it will be ignored'. 
format(lbl_file)) continue labels = np.loadtxt(lbl_file, dtype=np.float32).reshape(-1, 6) # each row in labels (N, 6) is [gt_class, gt_identity, cx, cy, w, h] cx, cy = labels[:, 2], labels[:, 3] w, h = labels[:, 4], labels[:, 5] gt_bbox = np.stack((cx, cy, w, h)).T.astype('float32') gt_class = labels[:, 0:1].astype('int32') gt_score = np.ones((len(labels), 1)).astype('float32') gt_ide = labels[:, 1:2].astype('int32') for i, _ in enumerate(gt_ide): if gt_ide[i] > -1: gt_ide[i] += self.tid_start_index[data_name] mot_rec = { 'im_file': img_file, 'im_id': img_index, } if 'image' in self.data_fields else {} gt_rec = { 'gt_class': gt_class, 'gt_score': gt_score, 'gt_bbox': gt_bbox, 'gt_ide': gt_ide, } for k, v in gt_rec.items(): if k in self.data_fields: mot_rec[k] = v records.append(mot_rec) if self.sample_num > 0 and img_index >= self.sample_num: break assert len(records) > 0, 'not found any mot record in %s' % ( self.image_lists) self.roidbs, self.cname2cid = records, cname2cid @register @serializable class MCMOTDataSet(DetDataset): """ Load dataset with MOT format, support multi-class MOT. Args: dataset_dir (str): root directory for dataset. image_lists (list(str)): mcmot data image lists, muiti-source mcmot dataset. data_fields (list): key name of data dictionary, at least have 'image'. label_list (str): if use_default_label is False, will load mapping between category and class index. sample_num (int): number of samples to load, -1 means all. Notes: MCMOT datasets root directory following this: dataset/mot |——————image_lists | |——————visdrone_mcmot.train | |——————visdrone_mcmot.val visdrone_mcmot |——————images | └——————train | └——————val └——————labels_with_ids └——————train """ def __init__(self, dataset_dir=None, image_lists=[], data_fields=['image'], label_list=None, sample_num=-1): super(MCMOTDataSet, self).__init__( dataset_dir=dataset_dir, data_fields=data_fields, sample_num=sample_num) self.dataset_dir = dataset_dir self.image_lists = image_lists if isinstance(self.image_lists, str): self.image_lists = [self.image_lists] self.label_list = label_list self.roidbs = None self.cname2cid = None def get_anno(self): if self.image_lists == []: return # only used to get categories and metric # only check first data, but the label_list of all data should be same. 
first_mot_data = self.image_lists[0].split('.')[0] anno_file = os.path.join(self.dataset_dir, first_mot_data, 'label_list.txt') return anno_file def parse_dataset(self): self.img_files = OrderedDict() self.img_start_index = OrderedDict() self.label_files = OrderedDict() self.tid_num = OrderedDict() self.tid_start_idx_of_cls_ids = defaultdict(dict) # for MCMOT img_index = 0 for data_name in self.image_lists: # check every data image list image_lists_dir = os.path.join(self.dataset_dir, 'image_lists') assert os.path.isdir(image_lists_dir), \ "The {} is not a directory.".format(image_lists_dir) list_path = os.path.join(image_lists_dir, data_name) assert os.path.exists(list_path), \ "The list path {} does not exist.".format(list_path) # record img_files, filter out empty ones with open(list_path, 'r') as file: self.img_files[data_name] = file.readlines() self.img_files[data_name] = [ os.path.join(self.dataset_dir, x.strip()) for x in self.img_files[data_name] ] self.img_files[data_name] = list( filter(lambda x: len(x) > 0, self.img_files[data_name])) self.img_start_index[data_name] = img_index img_index += len(self.img_files[data_name]) # record label_files self.label_files[data_name] = [ x.replace('images', 'labels_with_ids').replace( '.png', '.txt').replace('.jpg', '.txt') for x in self.img_files[data_name] ] for data_name, label_paths in self.label_files.items(): # using max_ids_dict rather than max_index max_ids_dict = defaultdict(int) for lp in label_paths: lb = np.loadtxt(lp) if len(lb) < 1: continue lb = lb.reshape(-1, 6) for item in lb: if item[1] > max_ids_dict[int(item[0])]: # item[0]: cls_id # item[1]: track id max_ids_dict[int(item[0])] = int(item[1]) # track id number self.tid_num[data_name] = max_ids_dict last_idx_dict = defaultdict(int) for i, (k, v) in enumerate(self.tid_num.items()): # each sub dataset for cls_id, id_num in v.items(): # v is a max_ids_dict self.tid_start_idx_of_cls_ids[k][cls_id] = last_idx_dict[cls_id] last_idx_dict[cls_id] += id_num self.num_identities_dict = defaultdict(int) for k, v in last_idx_dict.items(): self.num_identities_dict[k] = int(v) # total ids of each category self.num_imgs_each_data = [len(x) for x in self.img_files.values()] self.total_imgs = sum(self.num_imgs_each_data) # cname2cid and cid2cname cname2cid = {} if self.label_list is not None: # if use label_list for multi source mix dataset, # please make sure label_list in the first sub_dataset at least. sub_dataset = self.image_lists[0].split('.')[0] label_path = os.path.join(self.dataset_dir, sub_dataset, self.label_list) if not os.path.exists(label_path): logger.info( "Note: label_list {} does not exists, use VisDrone 10 classes labels as default.". 
format(label_path)) cname2cid = visdrone_mcmot_label() else: with open(label_path, 'r') as fr: label_id = 0 for line in fr.readlines(): cname2cid[line.strip()] = label_id label_id += 1 else: cname2cid = visdrone_mcmot_label() cid2cname = dict([(v, k) for (k, v) in cname2cid.items()]) logger.info('MCMOT dataset summary: ') logger.info(self.tid_num) logger.info('Total images: {}'.format(self.total_imgs)) logger.info('Image start index: {}'.format(self.img_start_index)) logger.info('Total identities of each category: ') num_identities_dict = sorted( self.num_identities_dict.items(), key=lambda x: x[0]) total_IDs_all_cats = 0 for (k, v) in num_identities_dict: logger.info('Category {} [{}] has {} IDs.'.format(k, cid2cname[k], v)) total_IDs_all_cats += v logger.info('Total identities of all categories: {}'.format( total_IDs_all_cats)) logger.info('Identity start index of each category: ') for k, v in self.tid_start_idx_of_cls_ids.items(): sorted_v = sorted(v.items(), key=lambda x: x[0]) for (cls_id, start_idx) in sorted_v: logger.info('Start index of dataset {} category {:d} is {:d}' .format(k, cls_id, start_idx)) records = [] for img_index in range(self.total_imgs): for i, (k, v) in enumerate(self.img_start_index.items()): if img_index >= v: data_name = list(self.label_files.keys())[i] start_index = v img_file = self.img_files[data_name][img_index - start_index] lbl_file = self.label_files[data_name][img_index - start_index] if not os.path.exists(img_file): logger.warning('Illegal image file: {}, and it will be ignored'. format(img_file)) continue if not os.path.isfile(lbl_file): logger.warning('Illegal label file: {}, and it will be ignored'. format(lbl_file)) continue labels = np.loadtxt(lbl_file, dtype=np.float32).reshape(-1, 6) # each row in labels (N, 6) is [gt_class, gt_identity, cx, cy, w, h] cx, cy = labels[:, 2], labels[:, 3] w, h = labels[:, 4], labels[:, 5] gt_bbox = np.stack((cx, cy, w, h)).T.astype('float32') gt_class = labels[:, 0:1].astype('int32') gt_score = np.ones((len(labels), 1)).astype('float32') gt_ide = labels[:, 1:2].astype('int32') for i, _ in enumerate(gt_ide): if gt_ide[i] > -1: cls_id = int(gt_class[i]) start_idx = self.tid_start_idx_of_cls_ids[data_name][cls_id] gt_ide[i] += start_idx mot_rec = { 'im_file': img_file, 'im_id': img_index, } if 'image' in self.data_fields else {} gt_rec = { 'gt_class': gt_class, 'gt_score': gt_score, 'gt_bbox': gt_bbox, 'gt_ide': gt_ide, } for k, v in gt_rec.items(): if k in self.data_fields: mot_rec[k] = v records.append(mot_rec) if self.sample_num > 0 and img_index >= self.sample_num: break assert len(records) > 0, 'not found any mot record in %s' % ( self.image_lists) self.roidbs, self.cname2cid = records, cname2cid @register @serializable class MOTImageFolder(DetDataset): """ Load MOT dataset with MOT format from image folder or video . Args: video_file (str): path of the video file, default ''. frame_rate (int): frame rate of the video, use cv2 VideoCapture if not set. dataset_dir (str): root directory for dataset. keep_ori_im (bool): whether to keep original image, default False. Set True when used during MOT model inference while saving images or video, or used in DeepSORT. 
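    Example (a minimal sketch; the video path is a placeholder and ffmpeg
    must be installed, since frames are extracted through ``video2frames``)::

        dataset = MOTImageFolder(video_file='demo.mp4', keep_ori_im=True)
        dataset.parse_dataset()    # extracts frames, fills ``dataset.roidbs``
        print(dataset.get_imid2path())   # frame id -> extracted frame path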
""" def __init__(self, video_file=None, frame_rate=-1, dataset_dir=None, data_root=None, image_dir=None, sample_num=-1, keep_ori_im=False, anno_path=None, **kwargs): super(MOTImageFolder, self).__init__( dataset_dir, image_dir, sample_num=sample_num) self.video_file = video_file self.data_root = data_root self.keep_ori_im = keep_ori_im self._imid2path = {} self.roidbs = None self.frame_rate = frame_rate self.anno_path = anno_path def check_or_download_dataset(self): return def parse_dataset(self, ): if not self.roidbs: if self.video_file is None: self.frame_rate = 30 # set as default if infer image folder self.roidbs = self._load_images() else: self.roidbs = self._load_video_images() def _load_video_images(self): if self.frame_rate == -1: # if frame_rate is not set for video, use cv2.VideoCapture cap = cv2.VideoCapture(self.video_file) self.frame_rate = int(cap.get(cv2.CAP_PROP_FPS)) extension = self.video_file.split('.')[-1] output_path = self.video_file.replace('.{}'.format(extension), '') frames_path = video2frames(self.video_file, output_path, self.frame_rate) self.video_frames = sorted( glob.glob(os.path.join(frames_path, '*.png'))) self.video_length = len(self.video_frames) logger.info('Length of the video: {:d} frames.'.format( self.video_length)) ct = 0 records = [] for image in self.video_frames: assert image != '' and os.path.isfile(image), \ "Image {} not found".format(image) if self.sample_num > 0 and ct >= self.sample_num: break rec = {'im_id': np.array([ct]), 'im_file': image} if self.keep_ori_im: rec.update({'keep_ori_im': 1}) self._imid2path[ct] = image ct += 1 records.append(rec) assert len(records) > 0, "No image file found" return records def _find_images(self): image_dir = self.image_dir if not isinstance(image_dir, Sequence): image_dir = [image_dir] images = [] for im_dir in image_dir: if os.path.isdir(im_dir): im_dir = os.path.join(self.dataset_dir, im_dir) images.extend(_make_dataset(im_dir)) elif os.path.isfile(im_dir) and _is_valid_file(im_dir): images.append(im_dir) return images def _load_images(self): images = self._find_images() ct = 0 records = [] for image in images: assert image != '' and os.path.isfile(image), \ "Image {} not found".format(image) if self.sample_num > 0 and ct >= self.sample_num: break rec = {'im_id': np.array([ct]), 'im_file': image} if self.keep_ori_im: rec.update({'keep_ori_im': 1}) self._imid2path[ct] = image ct += 1 records.append(rec) assert len(records) > 0, "No image file found" return records def get_imid2path(self): return self._imid2path def set_images(self, images): self.image_dir = images self.roidbs = self._load_images() def set_video(self, video_file, frame_rate): # update video_file and frame_rate by command line of tools/infer_mot.py self.video_file = video_file self.frame_rate = frame_rate assert os.path.isfile(self.video_file) and _is_valid_video(self.video_file), \ "wrong or unsupported file format: {}".format(self.video_file) self.roidbs = self._load_video_images() def get_anno(self): return self.anno_path def _is_valid_video(f, extensions=('.mp4', '.avi', '.mov', '.rmvb', 'flv')): return f.lower().endswith(extensions) def video2frames(video_path, outpath, frame_rate, **kargs): def _dict2str(kargs): cmd_str = '' for k, v in kargs.items(): cmd_str += (' ' + str(k) + ' ' + str(v)) return cmd_str ffmpeg = ['ffmpeg ', ' -y -loglevel ', ' error '] vid_name = os.path.basename(video_path).split('.')[0] out_full_path = os.path.join(outpath, vid_name) if not os.path.exists(out_full_path): os.makedirs(out_full_path) # video file 
name outformat = os.path.join(out_full_path, '%08d.png') cmd = ffmpeg cmd = ffmpeg + [ ' -i ', video_path, ' -r ', str(frame_rate), ' -f image2 ', outformat ] cmd = ''.join(cmd) + _dict2str(kargs) if os.system(cmd) != 0: raise RuntimeError('ffmpeg process video: {} error'.format(video_path)) sys.exit(-1) sys.stdout.flush() return out_full_path def mot_label(): labels_map = {'person': 0} return labels_map def visdrone_mcmot_label(): labels_map = { 'pedestrian': 0, 'people': 1, 'bicycle': 2, 'car': 3, 'van': 4, 'truck': 5, 'tricycle': 6, 'awning-tricycle': 7, 'bus': 8, 'motor': 9, } return labels_map ================================================ FILE: ppdet/data/source/pose3d_cmb.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import cv2 import numpy as np import json import copy import pycocotools from pycocotools.coco import COCO from .dataset import DetDataset from ppdet.core.workspace import register, serializable from paddle.io import Dataset @serializable class Pose3DDataset(DetDataset): """Pose3D Dataset class. Args: dataset_dir (str): Root path to the dataset. anno_list (list of str): each of the element is a relative path to the annotation file. image_dirs (list of str): each of path is a relative path where images are held. transform (composed(operators)): A sequence of data transforms. test_mode (bool): Store True when building test or validation dataset. Default: False. 
    24 joints order:
        0-2: 'R_Ankle', 'R_Knee', 'R_Hip',
        3-5: 'L_Hip', 'L_Knee', 'L_Ankle',
        6-8: 'R_Wrist', 'R_Elbow', 'R_Shoulder',
        9-11: 'L_Shoulder', 'L_Elbow', 'L_Wrist',
        12-14: 'Neck', 'Top_of_Head', 'Pelvis',
        15-18: 'Thorax', 'Spine', 'Jaw', 'Head',
        19-23: 'Nose', 'L_Eye', 'R_Eye', 'L_Ear', 'R_Ear'
    """

    def __init__(self,
                 dataset_dir,
                 image_dirs,
                 anno_list,
                 transform=[],
                 num_joints=24,
                 test_mode=False):
        super().__init__(dataset_dir, image_dirs, anno_list)
        self.image_info = {}
        self.ann_info = {}
        self.num_joints = num_joints
        self.transform = transform
        self.test_mode = test_mode
        self.img_ids = []
        self.dataset_dir = dataset_dir
        self.image_dirs = image_dirs
        self.anno_list = anno_list

    def get_mask(self, mvm_percent=0.3):
        num_joints = self.num_joints
        mjm_mask = np.ones((num_joints, 1)).astype(np.float32)
        if not self.test_mode:
            pb = np.random.random_sample()
            masked_num = int(
                pb * mvm_percent *
                num_joints)  # at most x% of the joints could be masked
            indices = np.random.choice(
                np.arange(num_joints), replace=False, size=masked_num)
            mjm_mask[indices, :] = 0.0

        num_joints = 10
        mvm_mask = np.ones((num_joints, 1)).astype(np.float32)
        if not self.test_mode:
            num_vertices = num_joints
            pb = np.random.random_sample()
            masked_num = int(
                pb * mvm_percent *
                num_vertices)  # at most x% of the vertices could be masked
            indices = np.random.choice(
                np.arange(num_vertices), replace=False, size=masked_num)
            mvm_mask[indices, :] = 0.0

        mjm_mask = np.concatenate([mjm_mask, mvm_mask], axis=0)
        return mjm_mask

    def filterjoints(self, x):
        if self.num_joints == 24:
            return x
        elif self.num_joints == 14:
            return x[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18], :]
        elif self.num_joints == 17:
            return x[
                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19], :]
        else:
            raise ValueError(
                "unsupported joint numbers, only [24 or 17 or 14] is supported!"
            )

    def parse_dataset(self):
        print("Loading annotations..., please wait")
        self.annos = []
        im_id = 0
        self.human36m_num = 0
        for idx, annof in enumerate(self.anno_list):
            img_prefix = os.path.join(self.dataset_dir, self.image_dirs[idx])
            dataf = os.path.join(self.dataset_dir, annof)
            with open(dataf, 'r') as rf:
                anno_data = json.load(rf)
            annos = anno_data['data']
            print("{} has {} annotations".format(dataf, len(annos)))
            for anno in annos:
                new_anno = {}
                new_anno['im_id'] = im_id
                im_id += 1
                imagename = anno['imageName']
                if imagename.startswith("COCO_train2014_"):
                    imagename = imagename[len("COCO_train2014_"):]
                elif imagename.startswith("COCO_val2014_"):
                    imagename = imagename[len("COCO_val2014_"):]
                imagename = os.path.join(img_prefix, imagename)
                if not os.path.exists(imagename):
                    if "train2017" in imagename:
                        imagename = imagename.replace("train2017", "val2017")
                        if not os.path.exists(imagename):
                            print("cannot find image path: {}".format(
                                imagename))
                            continue
                    else:
                        print("cannot find image path: {}".format(imagename))
                        continue
                new_anno['imageName'] = imagename
                if 'human3.6m' in imagename:
                    self.human36m_num += 1
                new_anno['bbox_center'] = anno['bbox_center']
                new_anno['bbox_scale'] = anno['bbox_scale']
                new_anno['joints_2d'] = np.array(anno[
                    'gt_keypoint_2d']).astype(np.float32)
                if new_anno['joints_2d'].shape[0] == 49:
                    # if joints_2d is in the SPIN format (generated by EFT),
                    # keep the last 24 public joints; for details see
                    # https://github.com/nkolot/SPIN/blob/master/constants.py
                    new_anno['joints_2d'] = new_anno['joints_2d'][25:]
                new_anno['joints_3d'] = np.array(anno[
                    'pose3d'])[:, :3].astype(np.float32)
                new_anno['mjm_mask'] = self.get_mask()
                if 'has_3d_joints' not in anno:
                    new_anno['has_3d_joints'] = int(1)
                    new_anno['has_2d_joints'] = int(1)
                else:
                    new_anno['has_3d_joints'] = int(anno['has_3d_joints'])
                    new_anno['has_2d_joints'] = int(anno['has_2d_joints'])
                new_anno['joints_2d'] = self.filterjoints(new_anno[
                    'joints_2d'])
                self.annos.append(new_anno)
            del annos

    def get_temp_num(self):
        """get temporal data number, like human3.6m"""
        return self.human36m_num

    def __len__(self):
        """Get dataset length."""
        return len(self.annos)

    def _get_imganno(self, idx):
        """Get anno for a single image."""
        return self.annos[idx]

    def __getitem__(self, idx):
        """Prepare image for training given the index."""
        records = copy.deepcopy(self._get_imganno(idx))
        imgpath = records['imageName']
        assert os.path.exists(imgpath), "cannot find image {}".format(imgpath)
        records['image'] = cv2.imread(imgpath)
        records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB)
        records = self.transform(records)
        return records

    def check_or_download_dataset(self):
        alldatafind = True
        for image_dir in self.image_dirs:
            image_dir = os.path.join(self.dataset_dir, image_dir)
            if not os.path.isdir(image_dir):
                print("dataset [{}] is not found".format(image_dir))
                alldatafind = False
        if not alldatafind:
            raise ValueError(
                "Some datasets are not valid and cannot be downloaded "
                "automatically now, please prepare the dataset first")


@register
@serializable
class Keypoint3DMultiFramesDataset(Dataset):
    """24-keypoint 3D dataset for pose estimation; each item is a sequence
    of images. The dataset loads raw features and applies specified
    transforms to return a dict containing the image tensors and other
    information.

    Args:
        dataset_dir (str): Root path to the dataset.
        image_dir (str): Path to a directory where images are held.
    """

    def __init__(
            self,
            dataset_dir,  # root directory of the dataset
            image_dir,  # directory of images
            p3d_dir,  # directory of 3D keypoint files
            json_path,
            img_size,  # target size images are resized to
            num_frames,  # length of each frame sequence
            anno_path=None, ):
        self.dataset_dir = dataset_dir
        self.image_dir = image_dir
        self.p3d_dir = p3d_dir
        self.json_path = json_path
        self.img_size = img_size
        self.num_frames = num_frames
        self.anno_path = anno_path

        self.data_labels, self.mf_inds = self._generate_multi_frames_list()

    def _generate_multi_frames_list(self):
        act_list = os.listdir(self.dataset_dir)  # list of actions
        count = 0
        mf_list = []
        annos_dict = {'images': [], 'annotations': [], 'act_inds': []}
        for act in act_list:  # generate a frame sequence for every action
            if '.' in act:
                continue
            json_path = os.path.join(self.dataset_dir, act, self.json_path)
            with open(json_path, 'r') as j:
                annos = json.load(j)
            length = len(annos['images'])
            for k, v in annos.items():
                if k in annos_dict:
                    annos_dict[k].extend(v)
            annos_dict['act_inds'].extend([act] * length)

            mf = [[i + j + count for j in range(self.num_frames)]
                  for i in range(0, length - self.num_frames + 1)]
            mf_list.extend(mf)
            count += length
        print("total data number:", len(mf_list))
        return annos_dict, mf_list

    def __call__(self, *args, **kwargs):
        return self

    def __getitem__(self, index):  # fetch one consecutive frame sequence
        inds = self.mf_inds[
            index]  # e.g. [568, 569, 570, 571, 572, 573], length num_frames
        images = self.data_labels['images']  # all images
        annots = self.data_labels['annotations']  # all annots
        act = self.data_labels['act_inds'][inds[
            0]]  # action name (folder name)

        kps3d_list = []
        kps3d_vis_list = []
        names = []
        h, w = 0, 0
        for ind in inds:  # one image
            height = float(images[ind]['height'])
            width = float(images[ind]['width'])
            name = images[ind]['file_name']  # image file name, with extension
            kps3d_name = name.split('.')[0] + '.obj'
            kps3d_path = os.path.join(self.dataset_dir, act, self.p3d_dir,
                                      kps3d_name)
            joints, joints_vis = self.kps3d_process(kps3d_path)
            joints_vis = np.array(joints_vis, dtype=np.float32)
            kps3d_list.append(joints)
            kps3d_vis_list.append(joints_vis)
            names.append(name)
        kps3d = np.array(
            kps3d_list)  # (num_frames, joints_num, 3), e.g. (6, 24, 3)
        kps3d_vis = np.array(kps3d_vis_list)

        # read images
        imgs = []
        for name in names:
            img_path = os.path.join(self.dataset_dir, act, self.image_dir,
                                    name)
            image = cv2.imread(img_path, cv2.IMREAD_COLOR |
                               cv2.IMREAD_IGNORE_ORIENTATION)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            imgs.append(np.expand_dims(image, axis=0))
        imgs = np.concatenate(imgs, axis=0)
        imgs = imgs.astype(
            np.float32)  # (num_frames, h, w, c), e.g. (6, 1080, 1920, 3)

        # attention: images and annotations are mirrored at this point
        records = {
            'kps3d': kps3d,
            'kps3d_vis': kps3d_vis,
            "image": imgs,
            'act': act,
            'names': names,
            'im_id': index
        }
        return self.transform(records)

    def kps3d_process(self, kps3d_path):
        count = 0
        kps = []
        kps_vis = []

        with open(kps3d_path, 'r') as f:
            lines = f.readlines()
            for line in lines:
                if line[0] == 'v':
                    kps.append([])
                    line = line.strip('\n').split(' ')[1:]
                    for kp in line:
                        kps[-1].append(float(kp))
                    count += 1
                    kps_vis.append([1, 1, 1])
        kps = np.array(kps)  # (52, 3)
        kps_vis = np.array(kps_vis)

        kps *= 10  # scale points
        kps -= kps[[0], :]  # set root point to zero
        kps = np.concatenate((kps[0:23], kps[[37]]), axis=0)  # (24, 3)
        kps *= 10
        kps_vis = np.concatenate(
            (kps_vis[0:23], kps_vis[[37]]), axis=0)  # (24, 3)
        return kps, kps_vis

    def __len__(self):
        return len(self.mf_inds)

    def get_anno(self):
        if self.anno_path is None:
            return
        return os.path.join(self.dataset_dir, self.anno_path)

    def check_or_download_dataset(self):
        return

    def parse_dataset(self):
        return

    def set_transform(self, transform):
        self.transform = transform

    def set_epoch(self, epoch_id):
        self._epoch = epoch_id

    def set_kwargs(self, **kwargs):
        self.mixup_epoch = kwargs.get('mixup_epoch', -1)
        self.cutmix_epoch = kwargs.get('cutmix_epoch', -1)
        self.mosaic_epoch = kwargs.get('mosaic_epoch', -1)


================================================
FILE: ppdet/data/source/sniper_coco.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import cv2 import json import copy import numpy as np try: from collections.abc import Sequence except Exception: from collections import Sequence from ppdet.core.workspace import register, serializable from ppdet.data.crop_utils.annotation_cropper import AnnoCropper from .coco import COCODataSet from .dataset import _make_dataset, _is_valid_file from ppdet.utils.logger import setup_logger logger = setup_logger('sniper_coco_dataset') @register @serializable class SniperCOCODataSet(COCODataSet): """SniperCOCODataSet""" def __init__(self, dataset_dir=None, image_dir=None, anno_path=None, proposals_file=None, data_fields=['image'], sample_num=-1, load_crowd=False, allow_empty=True, empty_ratio=1., is_trainset=True, image_target_sizes=[2000, 1000], valid_box_ratio_ranges=[[-1, 0.1],[0.08, -1]], chip_target_size=500, chip_target_stride=200, use_neg_chip=False, max_neg_num_per_im=8, max_per_img=-1, nms_thresh=0.5): super(SniperCOCODataSet, self).__init__( dataset_dir=dataset_dir, image_dir=image_dir, anno_path=anno_path, data_fields=data_fields, sample_num=sample_num, load_crowd=load_crowd, allow_empty=allow_empty, empty_ratio=empty_ratio ) self.proposals_file = proposals_file self.proposals = None self.anno_cropper = None self.is_trainset = is_trainset self.image_target_sizes = image_target_sizes self.valid_box_ratio_ranges = valid_box_ratio_ranges self.chip_target_size = chip_target_size self.chip_target_stride = chip_target_stride self.use_neg_chip = use_neg_chip self.max_neg_num_per_im = max_neg_num_per_im self.max_per_img = max_per_img self.nms_thresh = nms_thresh def parse_dataset(self): if not hasattr(self, "roidbs"): super(SniperCOCODataSet, self).parse_dataset() if self.is_trainset: self._parse_proposals() self._merge_anno_proposals() self.ori_roidbs = copy.deepcopy(self.roidbs) self.init_anno_cropper() self.roidbs = self.generate_chips_roidbs(self.roidbs, self.is_trainset) def set_proposals_file(self, file_path): self.proposals_file = file_path def init_anno_cropper(self): logger.info("Init AnnoCropper...") self.anno_cropper = AnnoCropper( image_target_sizes=self.image_target_sizes, valid_box_ratio_ranges=self.valid_box_ratio_ranges, chip_target_size=self.chip_target_size, chip_target_stride=self.chip_target_stride, use_neg_chip=self.use_neg_chip, max_neg_num_per_im=self.max_neg_num_per_im, max_per_img=self.max_per_img, nms_thresh=self.nms_thresh ) def generate_chips_roidbs(self, roidbs, is_trainset): if is_trainset: roidbs = self.anno_cropper.crop_anno_records(roidbs) else: roidbs = self.anno_cropper.crop_infer_anno_records(roidbs) return roidbs def _parse_proposals(self): if self.proposals_file: self.proposals = {} logger.info("Parse proposals file:{}".format(self.proposals_file)) with open(self.proposals_file, 'r') as f: proposals = json.load(f) for prop in proposals: image_id = prop["image_id"] if image_id not in self.proposals: self.proposals[image_id] = [] x, y, w, h = prop["bbox"] self.proposals[image_id].append([x, y, x + w, y + h]) def _merge_anno_proposals(self): assert self.roidbs if self.proposals and len(self.proposals.keys()) > 0: logger.info("merge proposals to 
annos") for id, record in enumerate(self.roidbs): image_id = int(record["im_id"]) if image_id not in self.proposals.keys(): logger.info("image id :{} no proposals".format(image_id)) record["proposals"] = np.array(self.proposals.get(image_id, []), dtype=np.float32) self.roidbs[id] = record def get_ori_roidbs(self): if not hasattr(self, "ori_roidbs"): return None return self.ori_roidbs def get_roidbs(self): if not hasattr(self, "roidbs"): self.parse_dataset() return self.roidbs def set_roidbs(self, roidbs): self.roidbs = roidbs def check_or_download_dataset(self): return def _parse(self): image_dir = self.image_dir if not isinstance(image_dir, Sequence): image_dir = [image_dir] images = [] for im_dir in image_dir: if os.path.isdir(im_dir): im_dir = os.path.join(self.dataset_dir, im_dir) images.extend(_make_dataset(im_dir)) elif os.path.isfile(im_dir) and _is_valid_file(im_dir): images.append(im_dir) return images def _load_images(self): images = self._parse() ct = 0 records = [] for image in images: assert image != '' and os.path.isfile(image), \ "Image {} not found".format(image) if self.sample_num > 0 and ct >= self.sample_num: break im = cv2.imread(image) h, w, c = im.shape rec = {'im_id': np.array([ct]), 'im_file': image, "h": h, "w": w} self._imid2path[ct] = image ct += 1 records.append(rec) assert len(records) > 0, "No image file found" return records def get_imid2path(self): return self._imid2path def set_images(self, images): self._imid2path = {} self.image_dir = images self.roidbs = self._load_images() ================================================ FILE: ppdet/data/source/voc.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import numpy as np import xml.etree.ElementTree as ET from ppdet.core.workspace import register, serializable from .dataset import DetDataset from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @register @serializable class VOCDataSet(DetDataset): """ Load dataset with PascalVOC format. Notes: `anno_path` must contains xml file and image file path for annotations. Args: dataset_dir (str): root directory for dataset. image_dir (str): directory for images. anno_path (str): voc annotation file path. data_fields (list): key name of data dictionary, at least have 'image'. sample_num (int): number of samples to load, -1 means all. label_list (str): if use_default_label is False, will load mapping between category and class index. allow_empty (bool): whether to load empty entry. False as default empty_ratio (float): the ratio of empty record number to total record's, if empty_ratio is out of [0. ,1.), do not sample the records and use all the empty entries. 1. as default repeat (int): repeat times for dataset, use in benchmark. 
""" def __init__(self, dataset_dir=None, image_dir=None, anno_path=None, data_fields=['image'], sample_num=-1, label_list=None, allow_empty=False, empty_ratio=1., repeat=1): super(VOCDataSet, self).__init__( dataset_dir=dataset_dir, image_dir=image_dir, anno_path=anno_path, data_fields=data_fields, sample_num=sample_num, repeat=repeat) self.label_list = label_list self.allow_empty = allow_empty self.empty_ratio = empty_ratio def _sample_empty(self, records, num): # if empty_ratio is out of [0. ,1.), do not sample the records if self.empty_ratio < 0. or self.empty_ratio >= 1.: return records import random sample_num = min( int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records)) records = random.sample(records, sample_num) return records def parse_dataset(self, ): anno_path = os.path.join(self.dataset_dir, self.anno_path) image_dir = os.path.join(self.dataset_dir, self.image_dir) # mapping category name to class id # first_class:0, second_class:1, ... records = [] empty_records = [] ct = 0 cname2cid = {} if self.label_list: label_path = os.path.join(self.dataset_dir, self.label_list) if not os.path.exists(label_path): raise ValueError("label_list {} does not exists".format( label_path)) with open(label_path, 'r') as fr: label_id = 0 for line in fr.readlines(): cname2cid[line.strip()] = label_id label_id += 1 else: cname2cid = pascalvoc_label() with open(anno_path, 'r') as fr: while True: line = fr.readline() if not line: break img_file, xml_file = [os.path.join(image_dir, x) \ for x in line.strip().split()[:2]] if not os.path.exists(img_file): logger.warning( 'Illegal image file: {}, and it will be ignored'.format( img_file)) continue if not os.path.isfile(xml_file): logger.warning( 'Illegal xml file: {}, and it will be ignored'.format( xml_file)) continue tree = ET.parse(xml_file) if tree.find('id') is None: im_id = np.array([ct]) else: im_id = np.array([int(tree.find('id').text)]) objs = tree.findall('object') im_w = float(tree.find('size').find('width').text) im_h = float(tree.find('size').find('height').text) if im_w < 0 or im_h < 0: logger.warning( 'Illegal width: {} or height: {} in annotation, ' 'and {} will be ignored'.format(im_w, im_h, xml_file)) continue num_bbox, i = len(objs), 0 gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) gt_class = np.zeros((num_bbox, 1), dtype=np.int32) gt_score = np.zeros((num_bbox, 1), dtype=np.float32) difficult = np.zeros((num_bbox, 1), dtype=np.int32) for obj in objs: cname = obj.find('name').text # user dataset may not contain difficult field _difficult = obj.find('difficult') _difficult = int( _difficult.text) if _difficult is not None else 0 x1 = float(obj.find('bndbox').find('xmin').text) y1 = float(obj.find('bndbox').find('ymin').text) x2 = float(obj.find('bndbox').find('xmax').text) y2 = float(obj.find('bndbox').find('ymax').text) x1 = max(0, x1) y1 = max(0, y1) x2 = min(im_w - 1, x2) y2 = min(im_h - 1, y2) if x2 > x1 and y2 > y1: gt_bbox[i, :] = [x1, y1, x2, y2] gt_class[i, 0] = cname2cid[cname] gt_score[i, 0] = 1. 
difficult[i, 0] = _difficult i += 1 else: logger.warning( 'Found an invalid bbox in annotations: xml_file: {}' ', x1: {}, y1: {}, x2: {}, y2: {}.'.format( xml_file, x1, y1, x2, y2)) gt_bbox = gt_bbox[:i, :] gt_class = gt_class[:i, :] gt_score = gt_score[:i, :] difficult = difficult[:i, :] voc_rec = { 'im_file': img_file, 'im_id': im_id, 'h': im_h, 'w': im_w } if 'image' in self.data_fields else {} gt_rec = { 'gt_class': gt_class, 'gt_score': gt_score, 'gt_bbox': gt_bbox, 'difficult': difficult } for k, v in gt_rec.items(): if k in self.data_fields: voc_rec[k] = v if len(objs) == 0: empty_records.append(voc_rec) else: records.append(voc_rec) ct += 1 if self.sample_num > 0 and ct >= self.sample_num: break assert ct > 0, 'not found any voc record in %s' % (self.anno_path) logger.debug('{} samples in file {}'.format(ct, anno_path)) if self.allow_empty and len(empty_records) > 0: empty_records = self._sample_empty(empty_records, len(records)) records += empty_records self.roidbs, self.cname2cid = records, cname2cid def get_label_list(self): return os.path.join(self.dataset_dir, self.label_list) def pascalvoc_label(): labels_map = { 'aeroplane': 0, 'bicycle': 1, 'bird': 2, 'boat': 3, 'bottle': 4, 'bus': 5, 'car': 6, 'cat': 7, 'chair': 8, 'cow': 9, 'diningtable': 10, 'dog': 11, 'horse': 12, 'motorbike': 13, 'person': 14, 'pottedplant': 15, 'sheep': 16, 'sofa': 17, 'train': 18, 'tvmonitor': 19 } return labels_map ================================================ FILE: ppdet/data/source/widerface.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from collections import defaultdict import os import numpy as np from scipy.io import loadmat from ppdet.core.workspace import register, serializable from .dataset import DetDataset from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @register @serializable class WIDERFaceDataSet(DetDataset): """ Load WiderFace records with 'anno_path' Args: dataset_dir (str): root directory for dataset. image_dir (str): directory for images. anno_path (str): WiderFace annotation data. data_fields (list): key name of data dictionary, at least have 'image'. sample_num (int): number of samples to load, -1 means all. with_lmk (bool): whether to load face landmark keypoint labels. 
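
        Example (editor's illustrative sketch; the paths follow the usual
        WIDER FACE layout and are assumptions):

            dataset = WIDERFaceDataSet(
                dataset_dir='dataset/wider_face',
                image_dir='WIDER_train/images',
                anno_path='wider_face_split/wider_face_train_bbx_gt.txt',
                data_fields=['image', 'gt_bbox', 'gt_class'])
            dataset.parse_dataset()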
""" def __init__(self, dataset_dir=None, image_dir=None, anno_path=None, data_fields=['image'], sample_num=-1, with_lmk=False): super(WIDERFaceDataSet, self).__init__( dataset_dir=dataset_dir, image_dir=image_dir, anno_path=anno_path, data_fields=data_fields, sample_num=sample_num, with_lmk=with_lmk) self.anno_path = anno_path self.sample_num = sample_num self.roidbs = None self.cname2cid = None self.with_lmk = with_lmk def parse_dataset(self): anno_path = os.path.join(self.dataset_dir, self.anno_path) image_dir = os.path.join(self.dataset_dir, self.image_dir) txt_file = anno_path records = [] ct = 0 file_lists = self._load_file_list(txt_file) cname2cid = widerface_label() for item in file_lists: im_fname = item[0] im_id = np.array([ct]) gt_bbox = np.zeros((len(item) - 1, 4), dtype=np.float32) gt_class = np.zeros((len(item) - 1, 1), dtype=np.int32) gt_lmk_labels = np.zeros((len(item) - 1, 10), dtype=np.float32) lmk_ignore_flag = np.zeros((len(item) - 1, 1), dtype=np.int32) for index_box in range(len(item)): if index_box < 1: continue gt_bbox[index_box - 1] = item[index_box][0] if self.with_lmk: gt_lmk_labels[index_box - 1] = item[index_box][1] lmk_ignore_flag[index_box - 1] = item[index_box][2] im_fname = os.path.join(image_dir, im_fname) if image_dir else im_fname widerface_rec = { 'im_file': im_fname, 'im_id': im_id, } if 'image' in self.data_fields else {} gt_rec = { 'gt_bbox': gt_bbox, 'gt_class': gt_class, } for k, v in gt_rec.items(): if k in self.data_fields: widerface_rec[k] = v if self.with_lmk: widerface_rec['gt_keypoint'] = gt_lmk_labels widerface_rec['keypoint_ignore'] = lmk_ignore_flag if len(item) != 0: records.append(widerface_rec) ct += 1 if self.sample_num > 0 and ct >= self.sample_num: break assert len(records) > 0, 'not found any widerface in %s' % (anno_path) logger.debug('{} samples in file {}'.format(ct, anno_path)) self.roidbs, self.cname2cid = records, cname2cid def _load_file_list(self, input_txt): with open(input_txt, 'r') as f_dir: lines_input_txt = f_dir.readlines() file_dict = {} num_class = 0 exts = ['jpg', 'jpeg', 'png', 'bmp'] exts += [ext.upper() for ext in exts] for i in range(len(lines_input_txt)): line_txt = lines_input_txt[i].strip('\n\t\r') split_str = line_txt.split(' ') if len(split_str) == 1: img_file_name = os.path.split(split_str[0])[1] split_txt = img_file_name.split('.') if len(split_txt) < 2: continue elif split_txt[-1] in exts: if i != 0: num_class += 1 file_dict[num_class] = [line_txt] else: if len(line_txt) <= 6: continue result_boxs = [] xmin = float(split_str[0]) ymin = float(split_str[1]) w = float(split_str[2]) h = float(split_str[3]) # Filter out wrong labels if w < 0 or h < 0: logger.warning('Illegal box with w: {}, h: {} in ' 'img: {}, and it will be ignored'.format( w, h, file_dict[num_class][0])) continue xmin = max(0, xmin) ymin = max(0, ymin) xmax = xmin + w ymax = ymin + h gt_bbox = [xmin, ymin, xmax, ymax] result_boxs.append(gt_bbox) if self.with_lmk: assert len(split_str) > 18, 'When `with_lmk=True`, the number' \ 'of characters per line in the annotation file should' \ 'exceed 18.' 
                    lmk0_x = float(split_str[5])
                    lmk0_y = float(split_str[6])
                    lmk1_x = float(split_str[8])
                    lmk1_y = float(split_str[9])
                    lmk2_x = float(split_str[11])
                    lmk2_y = float(split_str[12])
                    lmk3_x = float(split_str[14])
                    lmk3_y = float(split_str[15])
                    lmk4_x = float(split_str[17])
                    lmk4_y = float(split_str[18])
                    lmk_ignore_flag = 0 if lmk0_x == -1 else 1
                    gt_lmk_label = [
                        lmk0_x, lmk0_y, lmk1_x, lmk1_y, lmk2_x, lmk2_y,
                        lmk3_x, lmk3_y, lmk4_x, lmk4_y
                    ]
                    result_boxs.append(gt_lmk_label)
                    result_boxs.append(lmk_ignore_flag)
                file_dict[num_class].append(result_boxs)

        return list(file_dict.values())


def widerface_label():
    labels_map = {'face': 0}
    return labels_map


@register
@serializable
class WIDERFaceValDataset(WIDERFaceDataSet):
    def __init__(self,
                 dataset_dir=None,
                 image_dir=None,
                 anno_path=None,
                 gt_mat_path=None,
                 data_fields=['image'],
                 sample_num=-1,
                 with_lmk=False):
        super().__init__(
            dataset_dir=dataset_dir,
            image_dir=image_dir,
            anno_path=anno_path,
            data_fields=data_fields,
            sample_num=sample_num,
            with_lmk=with_lmk)
        self.gt_mat_path = gt_mat_path
        self.val_mat = os.path.join(self.dataset_dir, self.gt_mat_path,
                                    'wider_face_val.mat')
        self.hard_mat_path = os.path.join(self.dataset_dir, self.gt_mat_path,
                                          'wider_hard_val.mat')
        self.medium_mat_path = os.path.join(self.dataset_dir, self.gt_mat_path,
                                            'wider_medium_val.mat')
        self.easy_mat_path = os.path.join(self.dataset_dir, self.gt_mat_path,
                                          'wider_easy_val.mat')
        assert os.path.exists(self.val_mat), f'{self.val_mat} not exist'
        assert os.path.exists(
            self.hard_mat_path), f'{self.hard_mat_path} not exist'
        assert os.path.exists(
            self.medium_mat_path), f'{self.medium_mat_path} not exist'
        assert os.path.exists(
            self.easy_mat_path), f'{self.easy_mat_path} not exist'

    def parse_dataset(self):
        super().parse_dataset()
        box_list, file_list, event_list, hard_info_list, medium_info_list, \
            easy_info_list = self.get_gt_infos()
        setting_infos = [easy_info_list, medium_info_list, hard_info_list]
        settings = ['easy', 'medium', 'hard']
        info_by_name = defaultdict(dict)
        for setting_id in range(3):
            info_list = setting_infos[setting_id]
            setting = settings[setting_id]
            for i in range(len(event_list)):
                img_list = file_list[i][0]
                gt_box_list = box_list[i][0]
                sub_info_list = info_list[i][0]
                for j in range(len(img_list)):
                    img_name = str(img_list[j][0][0])
                    gt_boxes = gt_box_list[j][0].astype(np.float32)
                    info_by_name[img_name]['gt_ori_bbox'] = gt_boxes
                    keep_index = sub_info_list[j][0]
                    ignore = np.zeros(gt_boxes.shape[0])
                    if len(keep_index) != 0:
                        ignore[keep_index - 1] = 1
                    info_by_name[img_name][f'gt_{setting}_ignore'] = ignore

        for roidb in self.roidbs:
            img_file = roidb['im_file'].split('/')[-1]
            img_name = ".".join(img_file.split(".")[:-1])
            roidb.update(info_by_name[img_name])

    def get_gt_infos(self):
        """gt dir: (wider_face_val.mat, wider_easy_val.mat,
        wider_medium_val.mat, wider_hard_val.mat)"""
        val_mat = loadmat(self.val_mat)
        hard_mat = loadmat(self.hard_mat_path)
        medium_mat = loadmat(self.medium_mat_path)
        easy_mat = loadmat(self.easy_mat_path)

        box_list = val_mat['face_bbx_list']
        file_list = val_mat['file_list']
        event_list = val_mat['event_list']

        hard_info_list = hard_mat['gt_list']
        medium_info_list = medium_mat['gt_list']
        easy_info_list = easy_mat['gt_list']

        return box_list, file_list, event_list, hard_info_list, \
            medium_info_list, easy_info_list


================================================
FILE: ppdet/data/transform/__init__.py
================================================
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import operators
from . import batch_operators
from . import keypoint_operators
from . import mot_operators
from . import rotated_operators
from . import keypoints_3d_operators
from . import culane_operators

from .operators import *
from .batch_operators import *
from .keypoint_operators import *
from .mot_operators import *
from .rotated_operators import *
from .keypoints_3d_operators import *
from .culane_operators import *

__all__ = []
__all__ += registered_ops
__all__ += keypoint_operators.__all__
__all__ += mot_operators.__all__
__all__ += culane_operators.__all__


================================================
FILE: ppdet/data/transform/atss_assigner.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The code is based on:
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/atss_assigner.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)


def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
    """Calculate overlap between two sets of bboxes.

    If ``is_aligned`` is ``False``, then calculate the overlaps between each
    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
    pair of bboxes1 and bboxes2.

    Args:
        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
            B indicates the batch dim, in shape (B1, B2, ..., Bn).
            If ``is_aligned`` is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union), "iof" (intersection over
            foreground), "giou" or "diou".
        is_aligned (bool, optional): If True, then m and n must be equal.
            Default False.
        eps (float, optional): A value added to the denominator for numerical
            stability. Default 1e-6.

    Returns:
        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
    """
    assert mode in ['iou', 'iof', 'giou', 'diou'], 'Unsupported mode {}'.format(
        mode)
    # Either the boxes are empty or the length of the boxes' last dimension is 4
    assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
    assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)

    # Batch dim must be the same
    # Batch dim: (B1, B2, ...
Bn) assert bboxes1.shape[:-2] == bboxes2.shape[:-2] batch_shape = bboxes1.shape[:-2] rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0 cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0 if is_aligned: assert rows == cols if rows * cols == 0: if is_aligned: return np.random.random(batch_shape + (rows, )) else: return np.random.random(batch_shape + (rows, cols)) area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * ( bboxes1[..., 3] - bboxes1[..., 1]) area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * ( bboxes2[..., 3] - bboxes2[..., 1]) if is_aligned: lt = np.maximum(bboxes1[..., :2], bboxes2[..., :2]) # [B, rows, 2] rb = np.minimum(bboxes1[..., 2:], bboxes2[..., 2:]) # [B, rows, 2] wh = (rb - lt).clip(min=0) # [B, rows, 2] overlap = wh[..., 0] * wh[..., 1] if mode in ['iou', 'giou']: union = area1 + area2 - overlap else: union = area1 if mode == 'giou': enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2]) enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:]) if mode == 'diou': enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2]) enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:]) b1_x1, b1_y1 = bboxes1[..., 0], bboxes1[..., 1] b1_x2, b1_y2 = bboxes1[..., 2], bboxes1[..., 3] b2_x1, b2_y1 = bboxes2[..., 0], bboxes2[..., 1] b2_x2, b2_y2 = bboxes2[..., 2], bboxes2[..., 3] else: lt = np.maximum(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2]) # [B, rows, cols, 2] rb = np.minimum(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:]) # [B, rows, cols, 2] wh = (rb - lt).clip(min=0) # [B, rows, cols, 2] overlap = wh[..., 0] * wh[..., 1] if mode in ['iou', 'giou']: union = area1[..., None] + area2[..., None, :] - overlap else: union = area1[..., None] if mode == 'giou': enclosed_lt = np.minimum(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2]) enclosed_rb = np.maximum(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:]) if mode == 'diou': enclosed_lt = np.minimum(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2]) enclosed_rb = np.maximum(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:]) b1_x1, b1_y1 = bboxes1[..., :, None, 0], bboxes1[..., :, None, 1] b1_x2, b1_y2 = bboxes1[..., :, None, 2], bboxes1[..., :, None, 3] b2_x1, b2_y1 = bboxes2[..., None, :, 0], bboxes2[..., None, :, 1] b2_x2, b2_y2 = bboxes2[..., None, :, 2], bboxes2[..., None, :, 3] eps = np.array([eps]) union = np.maximum(union, eps) ious = overlap / union if mode in ['iou', 'iof']: return ious # calculate gious if mode in ['giou']: enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0) enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] enclose_area = np.maximum(enclose_area, eps) gious = ious - (enclose_area - union) / enclose_area return gious if mode in ['diou']: left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4 right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4 rho2 = left + right enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0) enclose_c = enclose_wh[..., 0]**2 + enclose_wh[..., 1]**2 enclose_c = np.maximum(enclose_c, eps) dious = ious - rho2 / enclose_c return dious def topk_(input, k, axis=1, largest=True): x = -input if largest else input if axis == 0: row_index = np.arange(input.shape[1 - axis]) if k == x.shape[0]: # argpartition requires index < len(input) topk_index = np.argpartition(x, k - 1, axis=axis)[0:k, :] else: topk_index = np.argpartition(x, k, axis=axis)[0:k, :] topk_data = x[topk_index, row_index] topk_index_sort = np.argsort(topk_data, axis=axis) topk_data_sort = topk_data[topk_index_sort, row_index] topk_index_sort = topk_index[0:k, :][topk_index_sort, 
row_index]
    else:
        column_index = np.arange(x.shape[1 - axis])[:, None]
        topk_index = np.argpartition(x, k, axis=axis)[:, 0:k]
        topk_data = x[column_index, topk_index]
        topk_data = -topk_data if largest else topk_data
        topk_index_sort = np.argsort(topk_data, axis=axis)
        topk_data_sort = topk_data[column_index, topk_index_sort]
        topk_index_sort = topk_index[:, 0:k][column_index, topk_index_sort]
    return topk_data_sort, topk_index_sort


class ATSSAssigner(object):
    """Assign a corresponding gt bbox or background to each bbox.

    Each proposal will be assigned with `0` or a positive integer
    indicating the ground truth index.

    - 0: negative sample, no assigned gt
    - positive integer: positive sample, index (1-based) of assigned gt

    Args:
        topk (int): number of bboxes selected on each level
    """

    def __init__(self, topk=9):
        self.topk = topk

    def __call__(self,
                 bboxes,
                 num_level_bboxes,
                 gt_bboxes,
                 gt_bboxes_ignore=None,
                 gt_labels=None):
        """Assign gt to bboxes.

        The assignment is done in the following steps

        1. compute the iou between all bboxes (bboxes of all pyramid levels)
           and gt
        2. compute the center distance between all bboxes and gt
        3. on each pyramid level, for each gt, select k bboxes whose centers
           are closest to the gt center, so we select k*l bboxes in total as
           candidates for each gt
        4. get the corresponding iou for these candidates, compute their mean
           and std, and set mean + std as the iou threshold
        5. select candidates whose iou is greater than or equal to the
           threshold as positive
        6. limit the positive sample's center to lie inside the gt

        Args:
            bboxes (np.array): Bounding boxes to be assigned, shape(n, 4).
            num_level_bboxes (List): num of bboxes in each level
            gt_bboxes (np.array): Groundtruth boxes, shape (k, 4).
            gt_bboxes_ignore (np.array, optional): Ground truth bboxes that are
                labelled as `ignored`, e.g., crowd boxes in COCO.
            gt_labels (np.array, optional): Label of gt_bboxes, shape (k, ).
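
        Example (editor's illustrative sketch, not part of the original
        docstring):

            assigner = ATSSAssigner(topk=9)
            anchors = np.array([[0., 0., 8., 8.], [8., 8., 16., 16.],
                                [0., 0., 16., 16.], [16., 16., 32., 32.]])
            gts = np.array([[0., 0., 10., 10.]])
            assigned, max_ious = assigner(
                anchors, num_level_bboxes=[2, 2], gt_bboxes=gts)
            # assigned: (n,) int64 array, 0 = background, i + 1 = matched gt i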
""" bboxes = bboxes[:, :4] num_gt, num_bboxes = gt_bboxes.shape[0], bboxes.shape[0] # assign 0 by default assigned_gt_inds = np.zeros((num_bboxes, ), dtype=np.int64) if num_gt == 0 or num_bboxes == 0: # No ground truth or boxes, return empty assignment max_overlaps = np.zeros((num_bboxes, )) if num_gt == 0: # No truth, assign everything to background assigned_gt_inds[:] = 0 if not np.any(gt_labels): assigned_labels = None else: assigned_labels = -np.ones((num_bboxes, ), dtype=np.int64) return assigned_gt_inds, max_overlaps # compute iou between all bbox and gt overlaps = bbox_overlaps(bboxes, gt_bboxes) # compute center distance between all bbox and gt gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 gt_points = np.stack((gt_cx, gt_cy), axis=1) bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0 bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0 bboxes_points = np.stack((bboxes_cx, bboxes_cy), axis=1) distances = np.sqrt( np.power((bboxes_points[:, None, :] - gt_points[None, :, :]), 2) .sum(-1)) # Selecting candidates based on the center distance candidate_idxs = [] start_idx = 0 for bboxes_per_level in num_level_bboxes: # on each pyramid level, for each gt, # select k bbox whose center are closest to the gt center end_idx = start_idx + bboxes_per_level distances_per_level = distances[start_idx:end_idx, :] selectable_k = min(self.topk, bboxes_per_level) _, topk_idxs_per_level = topk_( distances_per_level, selectable_k, axis=0, largest=False) candidate_idxs.append(topk_idxs_per_level + start_idx) start_idx = end_idx candidate_idxs = np.concatenate(candidate_idxs, axis=0) # get corresponding iou for the these candidates, and compute the # mean and std, set mean + std as the iou threshold candidate_overlaps = overlaps[candidate_idxs, np.arange(num_gt)] overlaps_mean_per_gt = candidate_overlaps.mean(0) overlaps_std_per_gt = candidate_overlaps.std(0) overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt is_pos = candidate_overlaps >= overlaps_thr_per_gt[None, :] # limit the positive sample's center in gt for gt_idx in range(num_gt): candidate_idxs[:, gt_idx] += gt_idx * num_bboxes ep_bboxes_cx = np.broadcast_to( bboxes_cx.reshape(1, -1), [num_gt, num_bboxes]).reshape(-1) ep_bboxes_cy = np.broadcast_to( bboxes_cy.reshape(1, -1), [num_gt, num_bboxes]).reshape(-1) candidate_idxs = candidate_idxs.reshape(-1) # calculate the left, top, right, bottom distance between positive # bbox center and gt side l_ = ep_bboxes_cx[candidate_idxs].reshape(-1, num_gt) - gt_bboxes[:, 0] t_ = ep_bboxes_cy[candidate_idxs].reshape(-1, num_gt) - gt_bboxes[:, 1] r_ = gt_bboxes[:, 2] - ep_bboxes_cx[candidate_idxs].reshape(-1, num_gt) b_ = gt_bboxes[:, 3] - ep_bboxes_cy[candidate_idxs].reshape(-1, num_gt) is_in_gts = np.stack([l_, t_, r_, b_], axis=1).min(axis=1) > 0.01 is_pos = is_pos & is_in_gts # if an anchor box is assigned to multiple gts, # the one with the highest IoU will be selected. overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1) index = candidate_idxs.reshape(-1)[is_pos.reshape(-1)] overlaps_inf[index] = overlaps.T.reshape(-1)[index] overlaps_inf = overlaps_inf.reshape(num_gt, -1).T max_overlaps = overlaps_inf.max(axis=1) argmax_overlaps = overlaps_inf.argmax(axis=1) assigned_gt_inds[max_overlaps != -np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1 return assigned_gt_inds, max_overlaps def get_vlr_region(self, bboxes, num_level_bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None): """get vlr region for ld distillation. 
Args: bboxes (np.array): Bounding boxes to be assigned, shape(n, 4). num_level_bboxes (List): num of bboxes in each level gt_bboxes (np.array): Groundtruth boxes, shape (k, 4). gt_bboxes_ignore (np.array, optional): Ground truth bboxes that are labelled as `ignored`, e.g., crowd boxes in COCO. gt_labels (np.array, optional): Label of gt_bboxes, shape (k, ). """ bboxes = bboxes[:, :4] num_gt, num_bboxes = gt_bboxes.shape[0], bboxes.shape[0] # compute iou between all bbox and gt overlaps = bbox_overlaps(bboxes, gt_bboxes) # compute diou between all bbox and gt diou = bbox_overlaps(bboxes, gt_bboxes, mode='diou') # assign 0 by default assigned_gt_inds = np.zeros((num_bboxes, ), dtype=np.int64) vlr_region_iou = (assigned_gt_inds + 0).astype(np.float32) if num_gt == 0 or num_bboxes == 0: # No ground truth or boxes, return empty assignment max_overlaps = np.zeros((num_bboxes, )) if num_gt == 0: # No truth, assign everything to background assigned_gt_inds[:] = 0 if not np.any(gt_labels): assigned_labels = None else: assigned_labels = -np.ones((num_bboxes, ), dtype=np.int64) return assigned_gt_inds, max_overlaps # compute center distance between all bbox and gt gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 gt_points = np.stack((gt_cx, gt_cy), axis=1) bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0 bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0 bboxes_points = np.stack((bboxes_cx, bboxes_cy), axis=1) distances = np.sqrt( np.power((bboxes_points[:, None, :] - gt_points[None, :, :]), 2) .sum(-1)) # Selecting candidates based on the center distance candidate_idxs = [] candidate_idxs_t = [] start_idx = 0 for bboxes_per_level in num_level_bboxes: # on each pyramid level, for each gt, # select k bbox whose center are closest to the gt center end_idx = start_idx + bboxes_per_level distances_per_level = distances[start_idx:end_idx, :] selectable_t = min(self.topk, bboxes_per_level) selectable_k = bboxes_per_level #k for all _, topt_idxs_per_level = topk_( distances_per_level, selectable_t, axis=0, largest=False) _, topk_idxs_per_level = topk_( distances_per_level, selectable_k, axis=0, largest=False) candidate_idxs_t.append(topt_idxs_per_level + start_idx) candidate_idxs.append(topk_idxs_per_level + start_idx) start_idx = end_idx candidate_idxs_t = np.concatenate(candidate_idxs_t, axis=0) candidate_idxs = np.concatenate(candidate_idxs, axis=0) # get corresponding iou for the these candidates, and compute the # mean and std, set mean + std as the iou threshold candidate_overlaps_t = overlaps[candidate_idxs_t, np.arange(num_gt)] # compute tdiou t_diou = diou[candidate_idxs, np.arange(num_gt)] overlaps_mean_per_gt = candidate_overlaps_t.mean(0) overlaps_std_per_gt = candidate_overlaps_t.std( 0, ddof=1) # NOTE: use Bessel correction overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt # compute region is_pos = (t_diou < overlaps_thr_per_gt[None, :]) & ( t_diou >= 0.25 * overlaps_thr_per_gt[None, :]) # limit the positive sample's center in gt for gt_idx in range(num_gt): candidate_idxs[:, gt_idx] += gt_idx * num_bboxes candidate_idxs = candidate_idxs.reshape(-1) # if an anchor box is assigned to multiple gts, # the one with the highest IoU will be selected. 
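        # (Illustration, editor's note: if an anchor is a candidate for gt 1
        # with IoU 0.52 and for gt 3 with IoU 0.61, it ends up assigned to
        # gt 3, since only the entry with the maximum IoU survives below.)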
overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1) index = candidate_idxs.reshape(-1)[is_pos.reshape(-1)] overlaps_inf[index] = overlaps.T.reshape(-1)[index] overlaps_inf = overlaps_inf.reshape(num_gt, -1).T max_overlaps = overlaps_inf.max(axis=1) argmax_overlaps = overlaps_inf.argmax(axis=1) overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1) overlaps_inf = overlaps_inf.reshape(num_gt, -1).T assigned_gt_inds[max_overlaps != -np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1 vlr_region_iou[max_overlaps != -np.inf] = max_overlaps[max_overlaps != -np.inf] + 0 return vlr_region_iou ================================================ FILE: ppdet/data/transform/autoaugment_utils.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Reference: # https://github.com/tensorflow/tpu/blob/master/models/official/detection/utils/autoaugment_utils.py """AutoAugment util file.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import inspect import math from PIL import Image, ImageEnhance import numpy as np import cv2 from copy import deepcopy # This signifies the max integer that the controller RNN could predict for the # augmentation scheme. _MAX_LEVEL = 10. # Represents an invalid bounding box that is used for checking for padding # lists of bounding box coordinates for a few augmentation operations _INVALID_BOX = [[-1.0, -1.0, -1.0, -1.0]] def policy_v0(): """Autoaugment policy that was used in AutoAugment Detection Paper.""" # Each tuple is an augmentation operation of the form # (operation, probability, magnitude). Each element in policy is a # sub-policy that will be applied sequentially on the image. policy = [ [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)], [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)], [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)], [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)], [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)], ] return policy def policy_v1(): """Autoaugment policy that was used in AutoAugment Detection Paper.""" # Each tuple is an augmentation operation of the form # (operation, probability, magnitude). Each element in policy is a # sub-policy that will be applied sequentially on the image. 
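    # (Editor's illustrative note: in the first sub-policy below,
    # ('TranslateX_BBox', 0.6, 4) applies TranslateX_BBox with probability 0.6
    # at magnitude 4 on the 0-10 scale set by _MAX_LEVEL, and
    # ('Equalize', 0.8, 10) then runs on the result with probability 0.8.)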
policy = [ [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)], [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)], [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)], [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)], [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)], [('Color', 0.0, 0), ('ShearX_Only_BBoxes', 0.8, 4)], [('ShearY_Only_BBoxes', 0.8, 2), ('Flip_Only_BBoxes', 0.0, 10)], [('Equalize', 0.6, 10), ('TranslateX_BBox', 0.2, 2)], [('Color', 1.0, 10), ('TranslateY_Only_BBoxes', 0.4, 6)], [('Rotate_BBox', 0.8, 10), ('Contrast', 0.0, 10)], # , [('Cutout', 0.2, 2), ('Brightness', 0.8, 10)], [('Color', 1.0, 6), ('Equalize', 1.0, 2)], [('Cutout_Only_BBoxes', 0.4, 6), ('TranslateY_Only_BBoxes', 0.8, 2)], [('Color', 0.2, 8), ('Rotate_BBox', 0.8, 10)], [('Sharpness', 0.4, 4), ('TranslateY_Only_BBoxes', 0.0, 4)], [('Sharpness', 1.0, 4), ('SolarizeAdd', 0.4, 4)], [('Rotate_BBox', 1.0, 8), ('Sharpness', 0.2, 8)], [('ShearY_BBox', 0.6, 10), ('Equalize_Only_BBoxes', 0.6, 8)], [('ShearX_BBox', 0.2, 6), ('TranslateY_Only_BBoxes', 0.2, 10)], [('SolarizeAdd', 0.6, 8), ('Brightness', 0.8, 10)], ] return policy def policy_vtest(): """Autoaugment test policy for debugging.""" # Each tuple is an augmentation operation of the form # (operation, probability, magnitude). Each element in policy is a # sub-policy that will be applied sequentially on the image. policy = [[('TranslateX_BBox', 1.0, 4), ('Equalize', 1.0, 10)], ] return policy def policy_v2(): """Additional policy that performs well on object detection.""" # Each tuple is an augmentation operation of the form # (operation, probability, magnitude). Each element in policy is a # sub-policy that will be applied sequentially on the image. policy = [ [('Color', 0.0, 6), ('Cutout', 0.6, 8), ('Sharpness', 0.4, 8)], [('Rotate_BBox', 0.4, 8), ('Sharpness', 0.4, 2), ('Rotate_BBox', 0.8, 10)], [('TranslateY_BBox', 1.0, 8), ('AutoContrast', 0.8, 2)], [('AutoContrast', 0.4, 6), ('ShearX_BBox', 0.8, 8), ('Brightness', 0.0, 10)], [('SolarizeAdd', 0.2, 6), ('Contrast', 0.0, 10), ('AutoContrast', 0.6, 0)], [('Cutout', 0.2, 0), ('Solarize', 0.8, 8), ('Color', 1.0, 4)], [('TranslateY_BBox', 0.0, 4), ('Equalize', 0.6, 8), ('Solarize', 0.0, 10)], [('TranslateY_BBox', 0.2, 2), ('ShearY_BBox', 0.8, 8), ('Rotate_BBox', 0.8, 8)], [('Cutout', 0.8, 8), ('Brightness', 0.8, 8), ('Cutout', 0.2, 2)], [('Color', 0.8, 4), ('TranslateY_BBox', 1.0, 6), ('Rotate_BBox', 0.6, 6)], [('Rotate_BBox', 0.6, 10), ('BBox_Cutout', 1.0, 4), ('Cutout', 0.2, 8)], [('Rotate_BBox', 0.0, 0), ('Equalize', 0.6, 6), ('ShearY_BBox', 0.6, 8)], [('Brightness', 0.8, 8), ('AutoContrast', 0.4, 2), ('Brightness', 0.2, 2)], [('TranslateY_BBox', 0.4, 8), ('Solarize', 0.4, 6), ('SolarizeAdd', 0.2, 10)], [('Contrast', 1.0, 10), ('SolarizeAdd', 0.2, 8), ('Equalize', 0.2, 4)], ] return policy def policy_v3(): """"Additional policy that performs well on object detection.""" # Each tuple is an augmentation operation of the form # (operation, probability, magnitude). Each element in policy is a # sub-policy that will be applied sequentially on the image. 
policy = [ [('Posterize', 0.8, 2), ('TranslateX_BBox', 1.0, 8)], [('BBox_Cutout', 0.2, 10), ('Sharpness', 1.0, 8)], [('Rotate_BBox', 0.6, 8), ('Rotate_BBox', 0.8, 10)], [('Equalize', 0.8, 10), ('AutoContrast', 0.2, 10)], [('SolarizeAdd', 0.2, 2), ('TranslateY_BBox', 0.2, 8)], [('Sharpness', 0.0, 2), ('Color', 0.4, 8)], [('Equalize', 1.0, 8), ('TranslateY_BBox', 1.0, 8)], [('Posterize', 0.6, 2), ('Rotate_BBox', 0.0, 10)], [('AutoContrast', 0.6, 0), ('Rotate_BBox', 1.0, 6)], [('Equalize', 0.0, 4), ('Cutout', 0.8, 10)], [('Brightness', 1.0, 2), ('TranslateY_BBox', 1.0, 6)], [('Contrast', 0.0, 2), ('ShearY_BBox', 0.8, 0)], [('AutoContrast', 0.8, 10), ('Contrast', 0.2, 10)], [('Rotate_BBox', 1.0, 10), ('Cutout', 1.0, 10)], [('SolarizeAdd', 0.8, 6), ('Equalize', 0.8, 8)], ] return policy def _equal(val1, val2, eps=1e-8): return abs(val1 - val2) <= eps def blend(image1, image2, factor): """Blend image1 and image2 using 'factor'. Factor can be above 0.0. A value of 0.0 means only image1 is used. A value of 1.0 means only image2 is used. A value between 0.0 and 1.0 means we linearly interpolate the pixel values between the two images. A value greater than 1.0 "extrapolates" the difference between the two pixel values, and we clip the results to values between 0 and 255. Args: image1: An image Tensor of type uint8. image2: An image Tensor of type uint8. factor: A floating point value above 0.0. Returns: A blended image Tensor of type uint8. """ if factor == 0.0: return image1 if factor == 1.0: return image2 image1 = image1.astype(np.float32) image2 = image2.astype(np.float32) difference = image2 - image1 scaled = factor * difference # Do addition in float. temp = image1 + scaled # Interpolate if factor > 0.0 and factor < 1.0: # Interpolation means we always stay within 0 and 255. return temp.astype(np.uint8) # Extrapolate: # # We need to clip and then cast. return np.clip(temp, a_min=0, a_max=255).astype(np.uint8) def cutout(image, pad_size, replace=0): """Apply cutout (https://arxiv.org/abs/1708.04552) to image. This operation applies a (2*pad_size x 2*pad_size) mask of zeros to a random location within `img`. The pixel values filled in will be of the value `replace`. The located where the mask will be applied is randomly chosen uniformly over the whole image. Args: image: An image Tensor of type uint8. pad_size: Specifies how big the zero mask that will be generated is that is applied to the image. The mask will be of size (2*pad_size x 2*pad_size). replace: What pixel value to fill in the image in the area that has the cutout mask applied to it. Returns: An image Tensor that is of type uint8. 
Example: img = cv2.imread( "/home/vis/gry/train/img_data/test.jpg", cv2.COLOR_BGR2RGB ) new_img = cutout(img, pad_size=50, replace=0) """ image_height, image_width = image.shape[0], image.shape[1] cutout_center_height = np.random.randint(low=0, high=image_height) cutout_center_width = np.random.randint(low=0, high=image_width) lower_pad = np.maximum(0, cutout_center_height - pad_size) upper_pad = np.maximum(0, image_height - cutout_center_height - pad_size) left_pad = np.maximum(0, cutout_center_width - pad_size) right_pad = np.maximum(0, image_width - cutout_center_width - pad_size) cutout_shape = [ image_height - (lower_pad + upper_pad), image_width - (left_pad + right_pad) ] padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]] mask = np.pad(np.zeros( cutout_shape, dtype=image.dtype), padding_dims, 'constant', constant_values=1) mask = np.expand_dims(mask, -1) mask = np.tile(mask, [1, 1, 3]) image = np.where( np.equal(mask, 0), np.ones_like( image, dtype=image.dtype) * replace, image) return image.astype(np.uint8) def solarize(image, threshold=128): # For each pixel in the image, select the pixel # if the value is less than the threshold. # Otherwise, subtract 255 from the pixel. return np.where(image < threshold, image, 255 - image) def solarize_add(image, addition=0, threshold=128): # For each pixel in the image less than threshold # we add 'addition' amount to it and then clip the # pixel value to be between 0 and 255. The value # of 'addition' is between -128 and 128. added_image = image.astype(np.int64) + addition added_image = np.clip(added_image, a_min=0, a_max=255).astype(np.uint8) return np.where(image < threshold, added_image, image) def color(image, factor): """use cv2 to deal""" gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) degenerate = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR) return blend(degenerate, image, factor) # refer to https://github.com/4uiiurz1/pytorch-auto-augment/blob/024b2eac4140c38df8342f09998e307234cafc80/auto_augment.py#L197 def contrast(img, factor): img = ImageEnhance.Contrast(Image.fromarray(img)).enhance(factor) return np.array(img) def brightness(image, factor): """Equivalent of PIL Brightness.""" degenerate = np.zeros_like(image) return blend(degenerate, image, factor) def posterize(image, bits): """Equivalent of PIL Posterize.""" shift = 8 - bits return np.left_shift(np.right_shift(image, shift), shift) def rotate(image, degrees, replace): """Rotates the image by degrees either clockwise or counterclockwise. Args: image: An image Tensor of type uint8. degrees: Float, a scalar angle in degrees to rotate all images by. If degrees is positive the image will be rotated clockwise otherwise it will be rotated counterclockwise. replace: A one or three value 1D tensor to fill empty pixels caused by the rotate operation. Returns: The rotated version of image. """ image = wrap(image) image = Image.fromarray(image) image = image.rotate(degrees) image = np.array(image, dtype=np.uint8) return unwrap(image, replace) def random_shift_bbox(image, bbox, pixel_scaling, replace, new_min_bbox_coords=None): """Move the bbox and the image content to a slightly new random location. Args: image: 3D uint8 Tensor. bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) of type float that represents the normalized coordinates between 0 and 1. The potential values for the new min corner of the bbox will be between [old_min - pixel_scaling * bbox_height/2, old_min - pixel_scaling * bbox_height/2]. 
pixel_scaling: A float between 0 and 1 that specifies the pixel range that the new bbox location will be sampled from. replace: A one or three value 1D tensor to fill empty pixels. new_min_bbox_coords: If not None, then this is a tuple that specifies the (min_y, min_x) coordinates of the new bbox. Normally this is randomly specified, but this allows it to be manually set. The coordinates are the absolute coordinates between 0 and image height/width and are int32. Returns: The new image that will have the shifted bbox location in it along with the new bbox that contains the new coordinates. """ # Obtains image height and width and create helper clip functions. image_height, image_width = image.shape[0], image.shape[1] image_height = float(image_height) image_width = float(image_width) def clip_y(val): return np.clip(val, a_min=0, a_max=image_height - 1).astype(np.int32) def clip_x(val): return np.clip(val, a_min=0, a_max=image_width - 1).astype(np.int32) # Convert bbox to pixel coordinates. min_y = int(image_height * bbox[0]) min_x = int(image_width * bbox[1]) max_y = clip_y(image_height * bbox[2]) max_x = clip_x(image_width * bbox[3]) bbox_height, bbox_width = (max_y - min_y + 1, max_x - min_x + 1) image_height = int(image_height) image_width = int(image_width) # Select the new min/max bbox ranges that are used for sampling the # new min x/y coordinates of the shifted bbox. minval_y = clip_y(min_y - np.int32(pixel_scaling * float(bbox_height) / 2.0)) maxval_y = clip_y(min_y + np.int32(pixel_scaling * float(bbox_height) / 2.0)) minval_x = clip_x(min_x - np.int32(pixel_scaling * float(bbox_width) / 2.0)) maxval_x = clip_x(min_x + np.int32(pixel_scaling * float(bbox_width) / 2.0)) # Sample and calculate the new unclipped min/max coordinates of the new bbox. if new_min_bbox_coords is None: unclipped_new_min_y = np.random.randint( low=minval_y, high=maxval_y, dtype=np.int32) unclipped_new_min_x = np.random.randint( low=minval_x, high=maxval_x, dtype=np.int32) else: unclipped_new_min_y, unclipped_new_min_x = ( clip_y(new_min_bbox_coords[0]), clip_x(new_min_bbox_coords[1])) unclipped_new_max_y = unclipped_new_min_y + bbox_height - 1 unclipped_new_max_x = unclipped_new_min_x + bbox_width - 1 # Determine if any of the new bbox was shifted outside the current image. # This is used for determining if any of the original bbox content should be # discarded. new_min_y, new_min_x, new_max_y, new_max_x = ( clip_y(unclipped_new_min_y), clip_x(unclipped_new_min_x), clip_y(unclipped_new_max_y), clip_x(unclipped_new_max_x)) shifted_min_y = (new_min_y - unclipped_new_min_y) + min_y shifted_max_y = max_y - (unclipped_new_max_y - new_max_y) shifted_min_x = (new_min_x - unclipped_new_min_x) + min_x shifted_max_x = max_x - (unclipped_new_max_x - new_max_x) # Create the new bbox tensor by converting pixel integer values to floats. new_bbox = np.stack([ float(new_min_y) / float(image_height), float(new_min_x) / float(image_width), float(new_max_y) / float(image_height), float(new_max_x) / float(image_width) ]) # Copy the contents in the bbox and fill the old bbox location # with gray (128). 
bbox_content = image[shifted_min_y:shifted_max_y + 1, shifted_min_x: shifted_max_x + 1, :] def mask_and_add_image(min_y_, min_x_, max_y_, max_x_, mask, content_tensor, image_): """Applies mask to bbox region in image then adds content_tensor to it.""" mask = np.pad(mask, [[min_y_, (image_height - 1) - max_y_], [min_x_, (image_width - 1) - max_x_], [0, 0]], 'constant', constant_values=1) content_tensor = np.pad(content_tensor, [[min_y_, (image_height - 1) - max_y_], [min_x_, (image_width - 1) - max_x_], [0, 0]], 'constant', constant_values=0) return image_ * mask + content_tensor # Zero out original bbox location. mask = np.zeros_like(image)[min_y:max_y + 1, min_x:max_x + 1, :] grey_tensor = np.zeros_like(mask) + replace[0] image = mask_and_add_image(min_y, min_x, max_y, max_x, mask, grey_tensor, image) # Fill in bbox content to new bbox location. mask = np.zeros_like(bbox_content) image = mask_and_add_image(new_min_y, new_min_x, new_max_y, new_max_x, mask, bbox_content, image) return image.astype(np.uint8), new_bbox def _clip_bbox(min_y, min_x, max_y, max_x): """Clip bounding box coordinates between 0 and 1. Args: min_y: Normalized bbox coordinate of type float between 0 and 1. min_x: Normalized bbox coordinate of type float between 0 and 1. max_y: Normalized bbox coordinate of type float between 0 and 1. max_x: Normalized bbox coordinate of type float between 0 and 1. Returns: Clipped coordinate values between 0 and 1. """ min_y = np.clip(min_y, a_min=0, a_max=1.0) min_x = np.clip(min_x, a_min=0, a_max=1.0) max_y = np.clip(max_y, a_min=0, a_max=1.0) max_x = np.clip(max_x, a_min=0, a_max=1.0) return min_y, min_x, max_y, max_x def _check_bbox_area(min_y, min_x, max_y, max_x, delta=0.05): """Adjusts bbox coordinates to make sure the area is > 0. Args: min_y: Normalized bbox coordinate of type float between 0 and 1. min_x: Normalized bbox coordinate of type float between 0 and 1. max_y: Normalized bbox coordinate of type float between 0 and 1. max_x: Normalized bbox coordinate of type float between 0 and 1. delta: Float, this is used to create a gap of size 2 * delta between bbox min/max coordinates that are the same on the boundary. This prevents the bbox from having an area of zero. Returns: Tuple of new bbox coordinates between 0 and 1 that will now have a guaranteed area > 0. """ height = max_y - min_y width = max_x - min_x def _adjust_bbox_boundaries(min_coord, max_coord): # Make sure max is never 0 and min is never 1. max_coord = np.maximum(max_coord, 0.0 + delta) min_coord = np.minimum(min_coord, 1.0 - delta) return min_coord, max_coord if _equal(height, 0): min_y, max_y = _adjust_bbox_boundaries(min_y, max_y) if _equal(width, 0): min_x, max_x = _adjust_bbox_boundaries(min_x, max_x) return min_y, min_x, max_y, max_x def _scale_bbox_only_op_probability(prob): """Reduce the probability of the bbox-only operation. Probability is reduced so that we do not distort the content of too many bounding boxes that are close to each other. The value of 3.0 was a chosen hyper parameter when designing the autoaugment algorithm that we found empirically to work well. Args: prob: Float that is the probability of applying the bbox-only operation. Returns: Reduced probability. """ return prob / 3.0 def _apply_bbox_augmentation(image, bbox, augmentation_func, *args): """Applies augmentation_func to the subsection of image indicated by bbox. Args: image: 3D uint8 Tensor. 
    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
        of type float that represents the normalized coordinates between
        0 and 1.
    augmentation_func: Augmentation function that will be applied to the
        subsection of image.
    *args: Additional parameters that will be passed into augmentation_func
        when it is called.

    Returns:
        A modified version of image, where the bbox location in the image will
        have `augmentation_func` applied to it.
    """
    image_height = image.shape[0]
    image_width = image.shape[1]
    min_y = int(image_height * bbox[0])
    min_x = int(image_width * bbox[1])
    max_y = int(image_height * bbox[2])
    max_x = int(image_width * bbox[3])

    # Clip to be sure the max values do not fall out of range.
    max_y = np.minimum(max_y, image_height - 1)
    max_x = np.minimum(max_x, image_width - 1)

    # Get the sub-tensor that is the image within the bounding box region.
    bbox_content = image[min_y:max_y + 1, min_x:max_x + 1, :]

    # Apply the augmentation function to the bbox portion of the image.
    augmented_bbox_content = augmentation_func(bbox_content, *args)

    # Pad the augmented_bbox_content and the mask to match the shape of
    # the original image.
    augmented_bbox_content = np.pad(
        augmented_bbox_content, [[min_y, (image_height - 1) - max_y],
                                 [min_x, (image_width - 1) - max_x], [0, 0]],
        'constant',
        constant_values=1)

    # Create a mask that will be used to zero out a part of the original image.
    mask_tensor = np.zeros_like(bbox_content)
    mask_tensor = np.pad(mask_tensor,
                         [[min_y, (image_height - 1) - max_y],
                          [min_x, (image_width - 1) - max_x], [0, 0]],
                         'constant',
                         constant_values=1)
    # Replace the old bbox content with the new augmented content.
    image = image * mask_tensor + augmented_bbox_content
    return image.astype(np.uint8)


def _concat_bbox(bbox, bboxes):
    """Helper function that concatenates bbox to bboxes along the first dimension."""
    # Note if all elements in bboxes are -1 (_INVALID_BOX), then this means
    # we discard bboxes and start the bboxes Tensor with the current bbox.
    bboxes_sum_check = np.sum(bboxes)
    bbox = np.expand_dims(bbox, 0)
    # This check will be true when it is an _INVALID_BOX
    if _equal(bboxes_sum_check, -4):
        bboxes = bbox
    else:
        bboxes = np.concatenate([bboxes, bbox], 0)
    return bboxes


def _apply_bbox_augmentation_wrapper(image, bbox, new_bboxes, prob,
                                     augmentation_func, func_changes_bbox,
                                     *args):
    """Applies _apply_bbox_augmentation with probability prob.

    Args:
        image: 3D uint8 Tensor.
        bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
            of type float that represents the normalized coordinates between
            0 and 1.
        new_bboxes: 2D Tensor that is a list of the bboxes in the image after
            they have been altered by aug_func. These will only be changed
            when func_changes_bbox is set to true. Each bbox has 4 elements
            (min_y, min_x, max_y, max_x) of type float that are the normalized
            bbox coordinates between 0 and 1.
        prob: Float that is the probability of applying
            _apply_bbox_augmentation.
        augmentation_func: Augmentation function that will be applied to the
            subsection of image.
        func_changes_bbox: Boolean. Does augmentation_func return bbox in
            addition to image.
        *args: Additional parameters that will be passed into
            augmentation_func when it is called.

    Returns:
        A tuple. First element is a modified version of image, where the bbox
        location in the image will have augmentation_func applied to it if it
        is chosen to be called with probability `prob`. The second element is
        a Tensor of Tensors of length 4 that will contain the altered bbox
        after applying augmentation_func.
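
    Note (editor's addition): the gate used below,
    `np.random.rand() + prob >= 1`, fires with probability exactly `prob`,
    since P(U >= 1 - prob) = prob for U ~ Uniform[0, 1).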
""" should_apply_op = (np.random.rand() + prob >= 1) if func_changes_bbox: if should_apply_op: augmented_image, bbox = augmentation_func(image, bbox, *args) else: augmented_image, bbox = (image, bbox) else: if should_apply_op: augmented_image = _apply_bbox_augmentation(image, bbox, augmentation_func, *args) else: augmented_image = image new_bboxes = _concat_bbox(bbox, new_bboxes) return augmented_image.astype(np.uint8), new_bboxes def _apply_multi_bbox_augmentation(image, bboxes, prob, aug_func, func_changes_bbox, *args): """Applies aug_func to the image for each bbox in bboxes. Args: image: 3D uint8 Tensor. bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox has 4 elements (min_y, min_x, max_y, max_x) of type float. prob: Float that is the probability of applying aug_func to a specific bounding box within the image. aug_func: Augmentation function that will be applied to the subsections of image indicated by the bbox values in bboxes. func_changes_bbox: Boolean. Does augmentation_func return bbox in addition to image. *args: Additional parameters that will be passed into augmentation_func when it is called. Returns: A modified version of image, where each bbox location in the image will have augmentation_func applied to it if it is chosen to be called with probability prob independently across all bboxes. Also the final bboxes are returned that will be unchanged if func_changes_bbox is set to false and if true, the new altered ones will be returned. """ # Will keep track of the new altered bboxes after aug_func is repeatedly # applied. The -1 values are a dummy value and this first Tensor will be # removed upon appending the first real bbox. new_bboxes = np.array(_INVALID_BOX) # If the bboxes are empty, then just give it _INVALID_BOX. The result # will be thrown away. bboxes = np.array((_INVALID_BOX)) if bboxes.size == 0 else bboxes assert bboxes.shape[1] == 4, "bboxes.shape[1] must be 4!!!!" # pylint:disable=g-long-lambda # pylint:disable=line-too-long wrapped_aug_func = lambda _image, bbox, _new_bboxes: _apply_bbox_augmentation_wrapper(_image, bbox, _new_bboxes, prob, aug_func, func_changes_bbox, *args) # pylint:enable=g-long-lambda # pylint:enable=line-too-long # Setup the while_loop. num_bboxes = bboxes.shape[0] # We loop until we go over all bboxes. idx = 0 # Counter for the while loop. # Conditional function when to end the loop once we go over all bboxes # images_and_bboxes contain (_image, _new_bboxes) def cond(_idx, _images_and_bboxes): return _idx < num_bboxes # Shuffle the bboxes so that the augmentation order is not deterministic if # we are not changing the bboxes with aug_func. # if not func_changes_bbox: # print(bboxes) # loop_bboxes = np.take(bboxes,np.random.permutation(bboxes.shape[0]),axis=0) # print(loop_bboxes) # else: # loop_bboxes = bboxes # we can not shuffle the bbox because it does not contain class information here loop_bboxes = deepcopy(bboxes) # Main function of while_loop where we repeatedly apply augmentation on the # bboxes in the image. # pylint:disable=g-long-lambda body = lambda _idx, _images_and_bboxes: [ _idx + 1, wrapped_aug_func(_images_and_bboxes[0], loop_bboxes[_idx], _images_and_bboxes[1])] while (cond(idx, (image, new_bboxes))): idx, (image, new_bboxes) = body(idx, (image, new_bboxes)) # Either return the altered bboxes or the original ones depending on if # we altered them in anyway. 
if func_changes_bbox: final_bboxes = new_bboxes else: final_bboxes = bboxes return image, final_bboxes def _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, aug_func, func_changes_bbox, *args): """Checks to be sure num bboxes > 0 before calling inner function.""" num_bboxes = len(bboxes) new_image = deepcopy(image) new_bboxes = deepcopy(bboxes) if num_bboxes != 0: new_image, new_bboxes = _apply_multi_bbox_augmentation( new_image, new_bboxes, prob, aug_func, func_changes_bbox, *args) return new_image, new_bboxes def rotate_only_bboxes(image, bboxes, prob, degrees, replace): """Apply rotate to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper( image, bboxes, prob, rotate, func_changes_bbox, degrees, replace) def shear_x_only_bboxes(image, bboxes, prob, level, replace): """Apply shear_x to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper( image, bboxes, prob, shear_x, func_changes_bbox, level, replace) def shear_y_only_bboxes(image, bboxes, prob, level, replace): """Apply shear_y to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper( image, bboxes, prob, shear_y, func_changes_bbox, level, replace) def translate_x_only_bboxes(image, bboxes, prob, pixels, replace): """Apply translate_x to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper( image, bboxes, prob, translate_x, func_changes_bbox, pixels, replace) def translate_y_only_bboxes(image, bboxes, prob, pixels, replace): """Apply translate_y to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper( image, bboxes, prob, translate_y, func_changes_bbox, pixels, replace) def flip_only_bboxes(image, bboxes, prob): """Apply flip_lr to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, np.fliplr, func_changes_bbox) def solarize_only_bboxes(image, bboxes, prob, threshold): """Apply solarize to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, solarize, func_changes_bbox, threshold) def equalize_only_bboxes(image, bboxes, prob): """Apply equalize to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, equalize, func_changes_bbox) def cutout_only_bboxes(image, bboxes, prob, pad_size, replace): """Apply cutout to each bbox in the image with probability prob.""" func_changes_bbox = False prob = _scale_bbox_only_op_probability(prob) return _apply_multi_bbox_augmentation_wrapper( image, bboxes, prob, cutout, func_changes_bbox, pad_size, replace) def _rotate_bbox(bbox, image_height, image_width, degrees): """Rotates the bbox coordinates by degrees.
Args: bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) of type float that represents the normalized coordinates between 0 and 1. image_height: Int, height of the image. image_width: Int, width of the image. degrees: Float, a scalar angle in degrees to rotate all images by. If degrees is positive the image will be rotated clockwise otherwise it will be rotated counterclockwise. Returns: A tensor of the same shape as bbox, but now with the rotated coordinates. """ image_height, image_width = (float(image_height), float(image_width)) # Convert from degrees to radians. degrees_to_radians = math.pi / 180.0 radians = degrees * degrees_to_radians # Translate the bbox to the center of the image and turn the normalized 0-1 # coordinates to absolute pixel locations. # Y coordinates are made negative as the y axis of images goes down with # increasing pixel values, so we negate to make sure x axis and y axis points # are in the traditionally positive direction. min_y = -int(image_height * (bbox[0] - 0.5)) min_x = int(image_width * (bbox[1] - 0.5)) max_y = -int(image_height * (bbox[2] - 0.5)) max_x = int(image_width * (bbox[3] - 0.5)) coordinates = np.stack([[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]]).astype(np.float32) # Rotate the coordinates according to the rotation matrix clockwise if # radians is positive, else negative rotation_matrix = np.stack([[math.cos(radians), math.sin(radians)], [-math.sin(radians), math.cos(radians)]]) new_coords = np.matmul(rotation_matrix, np.transpose(coordinates)).astype(np.int32) # Find min/max values and convert them back to normalized 0-1 floats. min_y = -(float(np.max(new_coords[0, :])) / image_height - 0.5) min_x = float(np.min(new_coords[1, :])) / image_width + 0.5 max_y = -(float(np.min(new_coords[0, :])) / image_height - 0.5) max_x = float(np.max(new_coords[1, :])) / image_width + 0.5 # Clip the bboxes to be sure they fall between [0, 1]. min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) return np.stack([min_y, min_x, max_y, max_x]) def rotate_with_bboxes(image, bboxes, degrees, replace): """Rotates the image by degrees and maps each bbox through the same rotation.""" # Rotate the image. image = rotate(image, degrees, replace) # Convert bbox coordinates to pixel values. image_height, image_width = image.shape[:2] # pylint:disable=g-long-lambda wrapped_rotate_bbox = lambda bbox: _rotate_bbox(bbox, image_height, image_width, degrees) # pylint:enable=g-long-lambda new_bboxes = np.zeros_like(bboxes) for idx in range(len(bboxes)): new_bboxes[idx] = wrapped_rotate_bbox(bboxes[idx]) return image, new_bboxes def translate_x(image, pixels, replace): """Equivalent of PIL Translate in X dimension.""" image = Image.fromarray(wrap(image)) image = image.transform(image.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0)) return unwrap(np.array(image), replace) def translate_y(image, pixels, replace): """Equivalent of PIL Translate in Y dimension.""" image = Image.fromarray(wrap(image)) image = image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels)) return unwrap(np.array(image), replace) def _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal): """Shifts the bbox coordinates by pixels. Args: bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) of type float that represents the normalized coordinates between 0 and 1. image_height: Int, height of the image. image_width: Int, width of the image. pixels: An int. How many pixels to shift the bbox. shift_horizontal: Boolean.
If true then shift in X dimension else shift in Y dimension. Returns: A tensor of the same shape as bbox, but now with the shifted coordinates. """ pixels = int(pixels) # Convert bbox to integer pixel locations. min_y = int(float(image_height) * bbox[0]) min_x = int(float(image_width) * bbox[1]) max_y = int(float(image_height) * bbox[2]) max_x = int(float(image_width) * bbox[3]) if shift_horizontal: min_x = np.maximum(0, min_x - pixels) max_x = np.minimum(image_width, max_x - pixels) else: min_y = np.maximum(0, min_y - pixels) max_y = np.minimum(image_height, max_y - pixels) # Convert bbox back to floats. min_y = float(min_y) / float(image_height) min_x = float(min_x) / float(image_width) max_y = float(max_y) / float(image_height) max_x = float(max_x) / float(image_width) # Clip the bboxes to be sure they fall between [0, 1]. min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) return np.stack([min_y, min_x, max_y, max_x]) def translate_bbox(image, bboxes, pixels, replace, shift_horizontal): """Equivalent of PIL Translate in X/Y dimension that shifts image and bbox. Args: image: 3D uint8 Tensor. bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox has 4 elements (min_y, min_x, max_y, max_x) of type float with values between [0, 1]. pixels: An int. How many pixels to shift the image and bboxes. replace: A one or three value 1D tensor to fill empty pixels. shift_horizontal: Boolean. If true then shift in X dimension else shift in Y dimension. Returns: A tuple containing a 3D uint8 Tensor that will be the result of translating image by pixels. The second element of the tuple is bboxes, where now the coordinates will be shifted to reflect the shifted image. """ if shift_horizontal: image = translate_x(image, pixels, replace) else: image = translate_y(image, pixels, replace) # Convert bbox coordinates to pixel values. image_height, image_width = image.shape[0], image.shape[1] # pylint:disable=g-long-lambda wrapped_shift_bbox = lambda bbox: _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal) # pylint:enable=g-long-lambda new_bboxes = deepcopy(bboxes) num_bboxes = len(bboxes) for idx in range(num_bboxes): new_bboxes[idx] = wrapped_shift_bbox(bboxes[idx]) return image.astype(np.uint8), new_bboxes def shear_x(image, level, replace): """Equivalent of PIL Shearing in X dimension.""" # Shear parallel to x axis is a projective transform # with a matrix form of: # [1 level # 0 1]. image = Image.fromarray(wrap(image)) image = image.transform(image.size, Image.AFFINE, (1, level, 0, 0, 1, 0)) return unwrap(np.array(image), replace) def shear_y(image, level, replace): """Equivalent of PIL Shearing in Y dimension.""" # Shear parallel to y axis is a projective transform # with a matrix form of: # [1 0 # level 1]. image = Image.fromarray(wrap(image)) image = image.transform(image.size, Image.AFFINE, (1, 0, 0, level, 1, 0)) return unwrap(np.array(image), replace) def _shear_bbox(bbox, image_height, image_width, level, shear_horizontal): """Shifts the bbox according to how the image was sheared. Args: bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) of type float that represents the normalized coordinates between 0 and 1. image_height: Int, height of the image. image_width: Int, width of the image. level: Float. How much to shear the image. shear_horizontal: If true then shear in X dimension else shear in the Y dimension.
Returns: A tensor of the same shape as bbox, but now with the shifted coordinates. """ image_height, image_width = (float(image_height), float(image_width)) # Change bbox coordinates to be pixels. min_y = int(image_height * bbox[0]) min_x = int(image_width * bbox[1]) max_y = int(image_height * bbox[2]) max_x = int(image_width * bbox[3]) coordinates = np.stack( [[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]]) coordinates = coordinates.astype(np.float32) # Shear the coordinates according to the translation matrix. if shear_horizontal: translation_matrix = np.stack([[1, 0], [-level, 1]]) else: translation_matrix = np.stack([[1, -level], [0, 1]]) translation_matrix = translation_matrix.astype(np.float32) new_coords = np.matmul(translation_matrix, np.transpose(coordinates)).astype(np.int32) # Find min/max values and convert them back to floats. min_y = float(np.min(new_coords[0, :])) / image_height min_x = float(np.min(new_coords[1, :])) / image_width max_y = float(np.max(new_coords[0, :])) / image_height max_x = float(np.max(new_coords[1, :])) / image_width # Clip the bboxes to be sure they fall between [0, 1]. min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) return np.stack([min_y, min_x, max_y, max_x]) def shear_with_bboxes(image, bboxes, level, replace, shear_horizontal): """Applies Shear Transformation to the image and shifts the bboxes. Args: image: 3D uint8 Tensor. bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox has 4 elements (min_y, min_x, max_y, max_x) of type float with values between [0, 1]. level: Float. How much to shear the image. This value will be between -0.3 and 0.3. replace: A one or three value 1D tensor to fill empty pixels. shear_horizontal: Boolean. If true then shear in X dimension else shear in the Y dimension. Returns: A tuple containing a 3D uint8 Tensor that will be the result of shearing image by level. The second element of the tuple is bboxes, where now the coordinates will be shifted to reflect the sheared image. """ if shear_horizontal: image = shear_x(image, level, replace) else: image = shear_y(image, level, replace) # Convert bbox coordinates to pixel values. image_height, image_width = image.shape[:2] # pylint:disable=g-long-lambda wrapped_shear_bbox = lambda bbox: _shear_bbox(bbox, image_height, image_width, level, shear_horizontal) # pylint:enable=g-long-lambda new_bboxes = deepcopy(bboxes) num_bboxes = len(bboxes) for idx in range(num_bboxes): new_bboxes[idx] = wrapped_shear_bbox(bboxes[idx]) return image.astype(np.uint8), new_bboxes def autocontrast(image): """Implements Autocontrast function from PIL. Args: image: A 3D uint8 tensor. Returns: The image after it has had autocontrast applied to it and will be of type uint8. """ def scale_channel(image): """Scale the 2D image using the autocontrast rule.""" # A possibly cheaper version can be done using cumsum/unique_with_counts # over the histogram values, rather than iterating over the entire image # to compute mins and maxes. lo = float(np.min(image)) hi = float(np.max(image)) # Scale the image, making the lowest value 0 and the highest value 255. def scale_values(im): scale = 255.0 / (hi - lo) offset = -lo * scale im = im.astype(np.float32) * scale + offset # Clip and return the rescaled values (the original code discarded the # clipped array and returned the unclipped one). im = np.clip(im, a_min=0, a_max=255.0) return im.astype(np.uint8) result = scale_values(image) if hi > lo else image return result # Assumes RGB for now. Scales each channel independently # and then stacks the result.
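# NOTE (illustrative aside, not part of the original source): scale_channel
# applies the affine map out = (in - lo) * 255 / (hi - lo), i.e. in * scale +
# offset with scale = 255 / (hi - lo) and offset = -lo * scale, so the darkest
# pixel maps to 0 and the brightest to 255. A tiny worked example (helper name
# is ours):
def _sketch_autocontrast_map():
    channel = np.array([[50, 100], [150, 200]], dtype=np.uint8)
    lo, hi = float(channel.min()), float(channel.max())
    scale = 255.0 / (hi - lo)
    offset = -lo * scale
    # Maps 50 -> 0, 100 -> 85, 150 -> 170, 200 -> 255.
    return np.clip(channel.astype(np.float32) * scale + offset, 0,
                   255).astype(np.uint8)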
s1 = scale_channel(image[:, :, 0]) s2 = scale_channel(image[:, :, 1]) s3 = scale_channel(image[:, :, 2]) image = np.stack([s1, s2, s3], 2) return image def sharpness(image, factor): """Implements Sharpness function from PIL.""" orig_image = image image = image.astype(np.float32) # Smooth the image with the PIL SMOOTH kernel; cv2.filter2D operates on the # HWC array directly. kernel = np.array([[1, 1, 1], [1, 5, 1], [1, 1, 1]], dtype=np.float32) / 13. result = cv2.filter2D(image, -1, kernel).astype(np.uint8) # Blend the final result. return blend(result, orig_image, factor) def equalize(image): """Implements Equalize function from PIL using numpy.""" def scale_channel(im, c): """Scale the data in the channel to implement equalize.""" im = im[:, :, c].astype(np.int32) # Compute the histogram of the image channel. histo, _ = np.histogram(im, range=[0, 255], bins=256) # For the purposes of computing the step, keep only the nonzero counts. nonzero = np.where(np.not_equal(histo, 0)) nonzero_histo = np.reshape(np.take(histo, nonzero), [-1]) step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255 def build_lut(histo, step): # Compute the cumulative sum, shifting by step // 2 # and then normalization by step. lut = (np.cumsum(histo) + (step // 2)) // step # Shift lut, prepending with 0. lut = np.concatenate([[0], lut[:-1]], 0) # Clip the counts to be in range. This is done # in the C code for image.point. return np.clip(lut, a_min=0, a_max=255).astype(np.uint8) # If step is zero, return the original image. Otherwise, build # lut from the full histogram and step and then index from it. if step == 0: result = im else: result = np.take(build_lut(histo, step), im) return result.astype(np.uint8) # Assumes RGB for now. Scales each channel independently # and then stacks the result. s1 = scale_channel(image, 0) s2 = scale_channel(image, 1) s3 = scale_channel(image, 2) image = np.stack([s1, s2, s3], 2) return image def wrap(image): """Returns 'image' with an extra channel set to all 255s.""" shape = image.shape extended_channel = 255 * np.ones([shape[0], shape[1], 1], image.dtype) extended = np.concatenate([image, extended_channel], 2).astype(image.dtype) return extended def unwrap(image, replace): """Unwraps an image produced by wrap. Where there is a 0 in the last channel for every spatial position, the rest of the three channels in that spatial dimension are filled with `replace` (128 by convention, a grey value). Operations like translate and shear on a wrapped Tensor will leave 0s in empty locations. Some transformations look at the intensity of values to do preprocessing, and we want these empty pixels to assume the 'average' value, rather than pure black. Args: image: A 3D Image Tensor with 4 channels. replace: A one or three value 1D tensor to fill empty pixels. Returns: image: A 3D image Tensor with 3 channels. """ image_shape = image.shape # Flatten the spatial dimensions. flattened_image = np.reshape(image, [-1, image_shape[2]]) # Find all pixels where the last channel is zero. alpha_channel = flattened_image[:, 3] replace = np.concatenate([replace, np.ones([1], image.dtype)], 0) # Where they are zero, fill them in with 'replace'.
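# NOTE (illustrative aside, not part of the original source): wrap()/unwrap()
# use the appended fourth channel as a validity mask. The geometric ops fill
# vacated pixels with 0 in every channel, so alpha == 0 marks pixels that were
# introduced by the transform, and unwrap() rewrites exactly those pixels with
# `replace`. A sketch of the round trip (helper name is ours):
def _sketch_wrap_unwrap_roundtrip():
    image = np.full((4, 4, 3), 200, dtype=np.uint8)
    wrapped = wrap(image)   # shape (4, 4, 4), alpha channel all 255
    wrapped[0, :, :] = 0    # pretend a shift vacated the first row
    restored = unwrap(wrapped, np.array([128, 128, 128], dtype=np.uint8))
    return restored         # row 0 becomes grey (128); the rest stays 200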
alpha_channel = np.reshape(alpha_channel, (-1, 1)) alpha_channel = np.tile(alpha_channel, reps=(1, flattened_image.shape[1])) flattened_image = np.where( np.equal(alpha_channel, 0), np.ones_like( flattened_image, dtype=image.dtype) * replace, flattened_image) image = np.reshape(flattened_image, image_shape) image = image[:, :, :3] return image.astype(np.uint8) def _cutout_inside_bbox(image, bbox, pad_fraction): """Generates cutout mask and the mean pixel value of the bbox. First a location is randomly chosen within the image as the center where the cutout mask will be applied. Note this can be towards the boundaries of the image, so the full cutout mask may not be applied. Args: image: 3D uint8 Tensor. bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) of type float that represents the normalized coordinates between 0 and 1. pad_fraction: Float that specifies how large the cutout mask should be in reference to the size of the original bbox. If pad_fraction is 0.25, then the cutout mask will be of shape (0.25 * bbox height, 0.25 * bbox width). Returns: A tuple. First element is a tensor of the same shape as image where each element is either a 1 or 0 that is used to determine where the image will have cutout applied. The second element is the mean of the pixels in the image where the bbox is located. mask value: [0,1] """ image_height, image_width = image.shape[0], image.shape[1] # Transform from shape [1, 4] to [4]. bbox = np.squeeze(bbox) min_y = int(float(image_height) * bbox[0]) min_x = int(float(image_width) * bbox[1]) max_y = int(float(image_height) * bbox[2]) max_x = int(float(image_width) * bbox[3]) # Calculate the mean pixel values in the bounding box, which will be used # to fill the cutout region. mean = np.mean(image[min_y:max_y + 1, min_x:max_x + 1], axis=(0, 1)) # Cutout mask will be size pad_size_height * 2 by pad_size_width * 2 if the # region lies entirely within the bbox. box_height = max_y - min_y + 1 box_width = max_x - min_x + 1 pad_size_height = int(pad_fraction * (box_height / 2)) pad_size_width = int(pad_fraction * (box_width / 2)) # Sample the center location in the image where the zero mask will be applied. cutout_center_height = np.random.randint(min_y, max_y + 1, dtype=np.int32) cutout_center_width = np.random.randint(min_x, max_x + 1, dtype=np.int32) lower_pad = np.maximum(0, cutout_center_height - pad_size_height) upper_pad = np.maximum( 0, image_height - cutout_center_height - pad_size_height) left_pad = np.maximum(0, cutout_center_width - pad_size_width) right_pad = np.maximum(0, image_width - cutout_center_width - pad_size_width) cutout_shape = [ image_height - (lower_pad + upper_pad), image_width - (left_pad + right_pad) ] padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]] mask = np.pad(np.zeros( cutout_shape, dtype=image.dtype), padding_dims, 'constant', constant_values=1) mask = np.expand_dims(mask, 2) mask = np.tile(mask, [1, 1, 3]) return mask, mean def bbox_cutout(image, bboxes, pad_fraction, replace_with_mean): """Applies cutout to the image according to bbox information. This is a cutout variant that uses bbox information to make more informed decisions on where to place the cutout mask. Args: image: 3D uint8 Tensor. bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox has 4 elements (min_y, min_x, max_y, max_x) of type float with values between [0, 1]. pad_fraction: Float that specifies how large the cutout mask should be in reference to the size of the original bbox.
If pad_fraction is 0.25, then the cutout mask will be of shape (0.25 * bbox height, 0.25 * bbox width). replace_with_mean: Boolean that specified what value should be filled in where the cutout mask is applied. Since the incoming image will be of uint8 and will not have had any mean normalization applied, by default we set the value to be 128. If replace_with_mean is True then we find the mean pixel values across the channel dimension and use those to fill in where the cutout mask is applied. Returns: A tuple. First element is a tensor of the same shape as image that has cutout applied to it. Second element is the bboxes that were passed in that will be unchanged. """ def apply_bbox_cutout(image, bboxes, pad_fraction): """Applies cutout to a single bounding box within image.""" # Choose a single bounding box to apply cutout to. random_index = np.random.randint(0, bboxes.shape[0], dtype=np.int32) # Select the corresponding bbox and apply cutout. chosen_bbox = np.take(bboxes, random_index, axis=0) mask, mean = _cutout_inside_bbox(image, chosen_bbox, pad_fraction) # When applying cutout we either set the pixel value to 128 or to the mean # value inside the bbox. replace = mean if replace_with_mean else [128] * 3 # Apply the cutout mask to the image. Where the mask is 0 we fill it with # `replace`. image = np.where( np.equal(mask, 0), np.ones_like( image, dtype=image.dtype) * replace, image).astype(image.dtype) return image # Check to see if there are boxes, if so then apply boxcutout. if len(bboxes) != 0: image = apply_bbox_cutout(image, bboxes, pad_fraction) return image, bboxes NAME_TO_FUNC = { 'AutoContrast': autocontrast, 'Equalize': equalize, 'Posterize': posterize, 'Solarize': solarize, 'SolarizeAdd': solarize_add, 'Color': color, 'Contrast': contrast, 'Brightness': brightness, 'Sharpness': sharpness, 'Cutout': cutout, 'BBox_Cutout': bbox_cutout, 'Rotate_BBox': rotate_with_bboxes, # pylint:disable=g-long-lambda 'TranslateX_BBox': lambda image, bboxes, pixels, replace: translate_bbox( image, bboxes, pixels, replace, shift_horizontal=True), 'TranslateY_BBox': lambda image, bboxes, pixels, replace: translate_bbox( image, bboxes, pixels, replace, shift_horizontal=False), 'ShearX_BBox': lambda image, bboxes, level, replace: shear_with_bboxes( image, bboxes, level, replace, shear_horizontal=True), 'ShearY_BBox': lambda image, bboxes, level, replace: shear_with_bboxes( image, bboxes, level, replace, shear_horizontal=False), # pylint:enable=g-long-lambda 'Rotate_Only_BBoxes': rotate_only_bboxes, 'ShearX_Only_BBoxes': shear_x_only_bboxes, 'ShearY_Only_BBoxes': shear_y_only_bboxes, 'TranslateX_Only_BBoxes': translate_x_only_bboxes, 'TranslateY_Only_BBoxes': translate_y_only_bboxes, 'Flip_Only_BBoxes': flip_only_bboxes, 'Solarize_Only_BBoxes': solarize_only_bboxes, 'Equalize_Only_BBoxes': equalize_only_bboxes, 'Cutout_Only_BBoxes': cutout_only_bboxes, } def _randomly_negate_tensor(tensor): """With 50% prob turn the tensor negative.""" should_flip = np.floor(np.random.rand() + 0.5) >= 1 final_tensor = tensor if should_flip else -tensor return final_tensor def _rotate_level_to_arg(level): level = (level / _MAX_LEVEL) * 30. level = _randomly_negate_tensor(level) return (level, ) def _shrink_level_to_arg(level): """Converts level to ratio by which we shrink the image content.""" if level == 0: return (1.0, ) # if level is zero, do not shrink the image # Maximum shrinking ratio is 2.9. level = 2. 
/ (_MAX_LEVEL / level) + 0.9 return (level, ) def _enhance_level_to_arg(level): return ((level / _MAX_LEVEL) * 1.8 + 0.1, ) def _shear_level_to_arg(level): level = (level / _MAX_LEVEL) * 0.3 # Flip level to negative with 50% chance. level = _randomly_negate_tensor(level) return (level, ) def _translate_level_to_arg(level, translate_const): level = (level / _MAX_LEVEL) * float(translate_const) # Flip level to negative with 50% chance. level = _randomly_negate_tensor(level) return (level, ) def _bbox_cutout_level_to_arg(level, hparams): cutout_pad_fraction = (level / _MAX_LEVEL) * 0.75 # hparams.cutout_max_pad_fraction return (cutout_pad_fraction, False) # hparams.cutout_bbox_replace_with_mean def level_to_arg(hparams): return { 'AutoContrast': lambda level: (), 'Equalize': lambda level: (), 'Posterize': lambda level: (int((level / _MAX_LEVEL) * 4), ), 'Solarize': lambda level: (int((level / _MAX_LEVEL) * 256), ), 'SolarizeAdd': lambda level: (int((level / _MAX_LEVEL) * 110), ), 'Color': _enhance_level_to_arg, 'Contrast': _enhance_level_to_arg, 'Brightness': _enhance_level_to_arg, 'Sharpness': _enhance_level_to_arg, 'Cutout': lambda level: (int((level / _MAX_LEVEL) * 100), ), # hparams.cutout_const=100 # pylint:disable=g-long-lambda 'BBox_Cutout': lambda level: _bbox_cutout_level_to_arg(level, hparams), 'TranslateX_BBox': lambda level: _translate_level_to_arg(level, 250), # hparams.translate_const=250 'TranslateY_BBox': lambda level: _translate_level_to_arg(level, 250), # hparams.translate_cons # pylint:enable=g-long-lambda 'ShearX_BBox': _shear_level_to_arg, 'ShearY_BBox': _shear_level_to_arg, 'Rotate_BBox': _rotate_level_to_arg, 'Rotate_Only_BBoxes': _rotate_level_to_arg, 'ShearX_Only_BBoxes': _shear_level_to_arg, 'ShearY_Only_BBoxes': _shear_level_to_arg, # pylint:disable=g-long-lambda 'TranslateX_Only_BBoxes': lambda level: _translate_level_to_arg(level, 120), # hparams.translate_bbox_const 'TranslateY_Only_BBoxes': lambda level: _translate_level_to_arg(level, 120), # hparams.translate_bbox_const # pylint:enable=g-long-lambda 'Flip_Only_BBoxes': lambda level: (), 'Solarize_Only_BBoxes': lambda level: (int((level / _MAX_LEVEL) * 256), ), 'Equalize_Only_BBoxes': lambda level: (), # pylint:disable=g-long-lambda 'Cutout_Only_BBoxes': lambda level: (int((level / _MAX_LEVEL) * 50), ), # hparams.cutout_bbox_const # pylint:enable=g-long-lambda } def bbox_wrapper(func): """Adds a bboxes function argument to func and returns unchanged bboxes.""" def wrapper(images, bboxes, *args, **kwargs): return (func(images, *args, **kwargs), bboxes) return wrapper def _parse_policy_info(name, prob, level, replace_value, augmentation_hparams): """Return the function that corresponds to `name` and update `level` param.""" func = NAME_TO_FUNC[name] args = level_to_arg(augmentation_hparams)[name](level) # Check to see if prob is passed into function. This is used for operations # where we alter bboxes independently. # pytype:disable=wrong-arg-types if 'prob' in inspect.getfullargspec(func)[0]: args = tuple([prob] + list(args)) # pytype:enable=wrong-arg-types # Add in replace arg if it is required for the function that is being called. if 'replace' in inspect.getfullargspec(func)[0]: # Make sure replace is the final argument assert 'replace' == inspect.getfullargspec(func)[0][-1] args = tuple(list(args) + [replace_value]) # Add bboxes as the second positional argument for the function if it does # not already exist. 
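# NOTE (illustrative aside, not part of the original source): a policy entry
# is a ('Name', prob, level) triple, and _parse_policy_info() resolves it to
# (callable, prob, args). For example, assuming the usual _MAX_LEVEL = 10,
# ('TranslateX_BBox', 0.6, 4) yields a pixel shift of (4 / 10) * 250 = 100
# (randomly negated) plus the grey fill value (helper name is ours):
def _sketch_parse_policy_entry():
    func, prob, args = _parse_policy_info(
        'TranslateX_BBox', 0.6, 4, [128, 128, 128], {})
    # func is the TranslateX_BBox lambda, prob == 0.6, and
    # args == (+/-100.0, [128, 128, 128]).
    return func, prob, args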
if 'bboxes' not in inspect.getfullargspec(func)[0]: func = bbox_wrapper(func) return (func, prob, args) def _apply_func_with_prob(func, image, args, prob, bboxes): """Apply `func` to image w/ `args` as input with probability `prob`.""" assert isinstance(args, tuple) assert 'bboxes' == inspect.getfullargspec(func)[0][1] # If prob is a function argument, then this randomness is being handled # inside the function, so make sure it is always called. if 'prob' in inspect.getfullargspec(func)[0]: prob = 1.0 # Apply the function with probability `prob`. Note that `prob` is used here; # the earlier port hard-coded 0.5, which ignored the configured probability. should_apply_op = np.floor(np.random.rand() + prob) >= 1 if should_apply_op: augmented_image, augmented_bboxes = func(image, bboxes, *args) else: augmented_image, augmented_bboxes = (image, bboxes) return augmented_image, augmented_bboxes def select_and_apply_random_policy(policies, image, bboxes): """Select a random policy from `policies` and apply it to `image`.""" policy_to_select = np.random.randint(0, len(policies), dtype=np.int32) # policy_to_select = 6 # for test for (i, policy) in enumerate(policies): if i == policy_to_select: image, bboxes = policy(image, bboxes) return (image, bboxes) def build_and_apply_nas_policy(policies, image, bboxes, augmentation_hparams): """Build a policy from the given policies passed in and apply to image. Args: policies: list of lists of tuples in the form `(func, prob, level)`, `func` is a string name of the augmentation function, `prob` is the probability of applying the `func` operation, `level` is the input argument for `func`. image: numpy array that the resulting policy will be applied to. bboxes: 2D numpy array of normalized bounding boxes for the image. augmentation_hparams: Hparams associated with the NAS learned policy. Returns: A version of image that now has data augmentation applied to it based on the `policies` passed into the function. Additionally, returns bboxes if a value for them is passed in that is not None. """ replace_value = [128, 128, 128] # func is the string name of the augmentation function, prob is the # probability of applying the operation and level is the parameter # associated with the function. # tf_policies are functions that take in an image and return an augmented # image. tf_policies = [] for policy in policies: tf_policy = [] # Link string name to the correct python function and make sure the correct # argument is passed into that function. for policy_info in policy: policy_info = list( policy_info) + [replace_value, augmentation_hparams] tf_policy.append(_parse_policy_info(*policy_info)) # Now build the tf policy that will apply the augmentation procedure # on image. def make_final_policy(tf_policy_): def final_policy(image_, bboxes_): for func, prob, args in tf_policy_: image_, bboxes_ = _apply_func_with_prob(func, image_, args, prob, bboxes_) return image_, bboxes_ return final_policy tf_policies.append(make_final_policy(tf_policy)) augmented_images, augmented_bboxes = select_and_apply_random_policy( tf_policies, image, bboxes) # If no bounding boxes were specified, then just return the images. return (augmented_images, augmented_bboxes) # TODO(barretzoph): Add in ArXiv link once paper is out. def distort_image_with_autoaugment(image, bboxes, augmentation_name): """Applies the AutoAugment policy to `image` and `bboxes`. Args: image: `Tensor` of shape [height, width, 3] representing an image. bboxes: `Tensor` of shape [N, 4] representing ground truth boxes that are normalized between [0, 1]. augmentation_name: The name of the AutoAugment policy to use. The available options are `v0`, `v1`, `v2`, `v3` and `test`.
`v0` is the policy used for all of the results in the paper and was found to achieve the best results on the COCO dataset. `v1`, `v2` and `v3` are additional good policies found on the COCO dataset that have slight variation in what operations were used during the search procedure along with how many operations are applied in parallel to a single image (2 vs 3). Returns: A tuple containing the augmented versions of `image` and `bboxes`. """ available_policies = { 'v0': policy_v0, 'v1': policy_v1, 'v2': policy_v2, 'v3': policy_v3, 'test': policy_vtest } if augmentation_name not in available_policies: raise ValueError('Invalid augmentation_name: {}'.format( augmentation_name)) policy = available_policies[augmentation_name]() augmentation_hparams = {} return build_and_apply_nas_policy(policy, image, bboxes, augmentation_hparams) ================================================ FILE: ppdet/data/transform/batch_operators.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import typing try: from collections.abc import Sequence except Exception: from collections import Sequence import cv2 import copy import math import numpy as np from .operators import register_op, BaseOperator, Resize from .op_helper import jaccard_overlap, gaussian2D, gaussian_radius, draw_umich_gaussian from .atss_assigner import ATSSAssigner from scipy import ndimage from ppdet.modeling import bbox_utils from ppdet.utils.logger import setup_logger from ppdet.modeling.keypoint_utils import get_affine_transform, affine_transform logger = setup_logger(__name__) __all__ = [ 'PadBatch', 'BatchRandomResize', 'Gt2YoloTarget', 'Gt2FCOSTarget', 'Gt2TTFTarget', 'Gt2Solov2Target', 'Gt2SparseTarget', 'PadMaskBatch', 'Gt2GFLTarget', 'Gt2CenterNetTarget', 'Gt2CenterTrackTarget', 'PadGT', 'PadRGT', 'BatchRandomResizeForSSOD' ] @register_op class PadBatch(BaseOperator): """ Pad a batch of samples so they can be divisible by a stride. The layout of each image should be 'CHW'. Args: pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure height and width is divisible by `pad_to_stride`. """ def __init__(self, pad_to_stride=0): super(PadBatch, self).__init__() self.pad_to_stride = pad_to_stride def __call__(self, samples, context=None): """ Args: samples (list): a batch of sample, each is dict. 
""" coarsest_stride = self.pad_to_stride # multi scale input is nested list if isinstance(samples, typing.Sequence) and len(samples) > 0 and isinstance( samples[0], typing.Sequence): inner_samples = samples[0] else: inner_samples = samples max_shape = np.array( [data['image'].shape for data in inner_samples]).max(axis=0) if coarsest_stride > 0: max_shape[1] = int( np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride) max_shape[2] = int( np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride) for data in inner_samples: im = data['image'] im_c, im_h, im_w = im.shape[:] padding_im = np.zeros( (im_c, max_shape[1], max_shape[2]), dtype=np.float32) padding_im[:, :im_h, :im_w] = im data['image'] = padding_im if 'semantic' in data and data['semantic'] is not None: semantic = data['semantic'] padding_sem = np.zeros( (1, max_shape[1], max_shape[2]), dtype=np.float32) padding_sem[:, :im_h, :im_w] = semantic data['semantic'] = padding_sem if 'gt_segm' in data and data['gt_segm'] is not None: gt_segm = data['gt_segm'] padding_segm = np.zeros( (gt_segm.shape[0], max_shape[1], max_shape[2]), dtype=np.uint8) padding_segm[:, :im_h, :im_w] = gt_segm data['gt_segm'] = padding_segm return samples @register_op class BatchRandomResize(BaseOperator): """ Resize image to target size randomly. random target_size and interpolation method Args: target_size (int, list, tuple): image target size, if random size is True, must be list or tuple keep_ratio (bool): whether keep_raio or not, default true interp (int): the interpolation method random_size (bool): whether random select target size of image random_interp (bool): whether random select interpolation method """ def __init__(self, target_size, keep_ratio, interp=cv2.INTER_NEAREST, random_size=True, random_interp=False): super(BatchRandomResize, self).__init__() self.keep_ratio = keep_ratio self.interps = [ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_AREA, cv2.INTER_CUBIC, cv2.INTER_LANCZOS4, ] self.interp = interp assert isinstance(target_size, ( int, Sequence)), "target_size must be int, list or tuple" if random_size and not isinstance(target_size, list): raise TypeError( "Type of target_size is invalid when random_size is True. Must be List, now is {}". format(type(target_size))) self.target_size = target_size self.random_size = random_size self.random_interp = random_interp def __call__(self, samples, context=None): if self.random_size: index = np.random.choice(len(self.target_size)) target_size = self.target_size[index] else: target_size = self.target_size if self.random_interp: interp = np.random.choice(self.interps) else: interp = self.interp resizer = Resize(target_size, keep_ratio=self.keep_ratio, interp=interp) return resizer(samples, context=context) @register_op class Gt2YoloTarget(BaseOperator): __shared__ = ['num_classes'] """ Generate YOLOv3 targets by groud truth data, this operator is only used in fine grained YOLOv3 loss mode """ def __init__(self, anchors, anchor_masks, downsample_ratios, num_classes=80, iou_thresh=1.): super(Gt2YoloTarget, self).__init__() self.anchors = anchors self.anchor_masks = anchor_masks self.downsample_ratios = downsample_ratios self.num_classes = num_classes self.iou_thresh = iou_thresh def __call__(self, samples, context=None): assert len(self.anchor_masks) == len(self.downsample_ratios), \ "anchor_masks', and 'downsample_ratios' should have same length." 
h, w = samples[0]['image'].shape[1:3] an_hw = np.array(self.anchors) / np.array([[w, h]]) for sample in samples: gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] if 'gt_score' not in sample: sample['gt_score'] = np.ones( (gt_bbox.shape[0], 1), dtype=np.float32) gt_score = sample['gt_score'] for i, ( mask, downsample_ratio ) in enumerate(zip(self.anchor_masks, self.downsample_ratios)): grid_h = int(h / downsample_ratio) grid_w = int(w / downsample_ratio) target = np.zeros( (len(mask), 6 + self.num_classes, grid_h, grid_w), dtype=np.float32) for b in range(gt_bbox.shape[0]): gx, gy, gw, gh = gt_bbox[b, :] cls = gt_class[b] score = gt_score[b] if gw <= 0. or gh <= 0. or score <= 0.: continue # find best match anchor index best_iou = 0. best_idx = -1 for an_idx in range(an_hw.shape[0]): iou = jaccard_overlap( [0., 0., gw, gh], [0., 0., an_hw[an_idx, 0], an_hw[an_idx, 1]]) if iou > best_iou: best_iou = iou best_idx = an_idx gi = int(gx * grid_w) gj = int(gy * grid_h) # the gt box should be regressed in this layer if the best # matching anchor index is in the anchor mask of this layer if best_idx in mask: best_n = mask.index(best_idx) # x, y, w, h, scale target[best_n, 0, gj, gi] = gx * grid_w - gi target[best_n, 1, gj, gi] = gy * grid_h - gj target[best_n, 2, gj, gi] = np.log( gw * w / self.anchors[best_idx][0]) target[best_n, 3, gj, gi] = np.log( gh * h / self.anchors[best_idx][1]) target[best_n, 4, gj, gi] = 2.0 - gw * gh # objectness record gt_score target[best_n, 5, gj, gi] = score # classification target[best_n, 6 + cls, gj, gi] = 1. # For non-matched anchors, calculate the target if the iou # between anchor and gt is larger than iou_thresh if self.iou_thresh < 1: for idx, mask_i in enumerate(mask): if mask_i == best_idx: continue iou = jaccard_overlap( [0., 0., gw, gh], [0., 0., an_hw[mask_i, 0], an_hw[mask_i, 1]]) if iou > self.iou_thresh and target[idx, 5, gj, gi] == 0.: # x, y, w, h, scale target[idx, 0, gj, gi] = gx * grid_w - gi target[idx, 1, gj, gi] = gy * grid_h - gj target[idx, 2, gj, gi] = np.log( gw * w / self.anchors[mask_i][0]) target[idx, 3, gj, gi] = np.log( gh * h / self.anchors[mask_i][1]) target[idx, 4, gj, gi] = 2.0 - gw * gh # objectness record gt_score target[idx, 5, gj, gi] = score # classification target[idx, 6 + cls, gj, gi] = 1.
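# NOTE (illustrative aside, not part of the original source): the target
# written above follows the standard YOLOv3 box parameterisation. For a
# normalised gt box (gx, gy, gw, gh) assigned to grid cell (gi, gj):
#   tx = gx * grid_w - gi         offset of the centre within its cell, [0, 1)
#   ty = gy * grid_h - gj
#   tw = log(gw * w / anchor_w)   log-ratio of gt size to anchor size
#   th = log(gh * h / anchor_h)
#   scale = 2 - gw * gh           loss weight that favours small boxes
# A compact restatement (helper name is ours):
def _sketch_yolo_encode(gx, gy, gw, gh, gi, gj, grid_w, grid_h, w, h, anchor):
    return (gx * grid_w - gi, gy * grid_h - gj,
            math.log(gw * w / anchor[0]), math.log(gh * h / anchor[1]),
            2.0 - gw * gh)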
sample['target{}'.format(i)] = target # remove useless gt_class and gt_score after target calculated sample.pop('gt_class') sample.pop('gt_score') return samples @register_op class Gt2FCOSTarget(BaseOperator): """ Generate FCOS targets by groud truth data """ def __init__(self, object_sizes_boundary, center_sampling_radius, downsample_ratios, num_shift=0.5, multiply_strides_reg_targets=False, norm_reg_targets=True): super(Gt2FCOSTarget, self).__init__() self.center_sampling_radius = center_sampling_radius self.downsample_ratios = downsample_ratios self.INF = np.inf self.object_sizes_boundary = [-1] + object_sizes_boundary + [self.INF] object_sizes_of_interest = [] for i in range(len(self.object_sizes_boundary) - 1): object_sizes_of_interest.append([ self.object_sizes_boundary[i], self.object_sizes_boundary[i + 1] ]) self.object_sizes_of_interest = object_sizes_of_interest self.num_shift = num_shift self.multiply_strides_reg_targets = multiply_strides_reg_targets self.norm_reg_targets = norm_reg_targets def _compute_points(self, w, h): """ compute the corresponding points in each feature map :param h: image height :param w: image width :return: points from all feature map """ locations = [] for stride in self.downsample_ratios: shift_x = np.arange(0, w, stride).astype(np.float32) shift_y = np.arange(0, h, stride).astype(np.float32) shift_x, shift_y = np.meshgrid(shift_x, shift_y) shift_x = shift_x.flatten() shift_y = shift_y.flatten() location = np.stack( [shift_x, shift_y], axis=1) + stride * self.num_shift locations.append(location) num_points_each_level = [len(location) for location in locations] locations = np.concatenate(locations, axis=0) return locations, num_points_each_level def _convert_xywh2xyxy(self, gt_bbox, w, h): """ convert the bounding box from style xywh to xyxy :param gt_bbox: bounding boxes normalized into [0, 1] :param w: image width :param h: image height :return: bounding boxes in xyxy style """ bboxes = gt_bbox.copy() bboxes[:, [0, 2]] = bboxes[:, [0, 2]] * w bboxes[:, [1, 3]] = bboxes[:, [1, 3]] * h bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2] bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3] return bboxes def _check_inside_boxes_limited(self, gt_bbox, xs, ys, num_points_each_level): """ check if points is within the clipped boxes :param gt_bbox: bounding boxes :param xs: horizontal coordinate of points :param ys: vertical coordinate of points :return: the mask of points is within gt_box or not """ bboxes = np.reshape( gt_bbox, newshape=[1, gt_bbox.shape[0], gt_bbox.shape[1]]) bboxes = np.tile(bboxes, reps=[xs.shape[0], 1, 1]) ct_x = (bboxes[:, :, 0] + bboxes[:, :, 2]) / 2 ct_y = (bboxes[:, :, 1] + bboxes[:, :, 3]) / 2 beg = 0 clipped_box = bboxes.copy() for lvl, stride in enumerate(self.downsample_ratios): end = beg + num_points_each_level[lvl] stride_exp = self.center_sampling_radius * stride clipped_box[beg:end, :, 0] = np.maximum( bboxes[beg:end, :, 0], ct_x[beg:end, :] - stride_exp) clipped_box[beg:end, :, 1] = np.maximum( bboxes[beg:end, :, 1], ct_y[beg:end, :] - stride_exp) clipped_box[beg:end, :, 2] = np.minimum( bboxes[beg:end, :, 2], ct_x[beg:end, :] + stride_exp) clipped_box[beg:end, :, 3] = np.minimum( bboxes[beg:end, :, 3], ct_y[beg:end, :] + stride_exp) beg = end l_res = xs - clipped_box[:, :, 0] r_res = clipped_box[:, :, 2] - xs t_res = ys - clipped_box[:, :, 1] b_res = clipped_box[:, :, 3] - ys clipped_box_reg_targets = np.stack([l_res, t_res, r_res, b_res], axis=2) inside_gt_box = np.min(clipped_box_reg_targets, axis=2) > 0 return inside_gt_box def 
__call__(self, samples, context=None): assert len(self.object_sizes_of_interest) == len(self.downsample_ratios), \ "object_sizes_of_interest', and 'downsample_ratios' should have same length." for sample in samples: im = sample['image'] bboxes = sample['gt_bbox'] gt_class = sample['gt_class'] # calculate the locations h, w = im.shape[1:3] points, num_points_each_level = self._compute_points(w, h) object_scale_exp = [] for i, num_pts in enumerate(num_points_each_level): object_scale_exp.append( np.tile( np.array([self.object_sizes_of_interest[i]]), reps=[num_pts, 1])) object_scale_exp = np.concatenate(object_scale_exp, axis=0) gt_area = (bboxes[:, 2] - bboxes[:, 0]) * ( bboxes[:, 3] - bboxes[:, 1]) xs, ys = points[:, 0], points[:, 1] xs = np.reshape(xs, newshape=[xs.shape[0], 1]) xs = np.tile(xs, reps=[1, bboxes.shape[0]]) ys = np.reshape(ys, newshape=[ys.shape[0], 1]) ys = np.tile(ys, reps=[1, bboxes.shape[0]]) l_res = xs - bboxes[:, 0] r_res = bboxes[:, 2] - xs t_res = ys - bboxes[:, 1] b_res = bboxes[:, 3] - ys reg_targets = np.stack([l_res, t_res, r_res, b_res], axis=2) if self.center_sampling_radius > 0: is_inside_box = self._check_inside_boxes_limited( bboxes, xs, ys, num_points_each_level) else: is_inside_box = np.min(reg_targets, axis=2) > 0 # check if the targets is inside the corresponding level max_reg_targets = np.max(reg_targets, axis=2) lower_bound = np.tile( np.expand_dims( object_scale_exp[:, 0], axis=1), reps=[1, max_reg_targets.shape[1]]) high_bound = np.tile( np.expand_dims( object_scale_exp[:, 1], axis=1), reps=[1, max_reg_targets.shape[1]]) is_match_current_level = \ (max_reg_targets > lower_bound) & \ (max_reg_targets < high_bound) points2gtarea = np.tile( np.expand_dims( gt_area, axis=0), reps=[xs.shape[0], 1]) points2gtarea[is_inside_box == 0] = self.INF points2gtarea[is_match_current_level == 0] = self.INF points2min_area = points2gtarea.min(axis=1) points2min_area_ind = points2gtarea.argmin(axis=1) labels = gt_class[points2min_area_ind] + 1 labels[points2min_area == self.INF] = 0 reg_targets = reg_targets[range(xs.shape[0]), points2min_area_ind] ctn_targets = np.sqrt((reg_targets[:, [0, 2]].min(axis=1) / \ reg_targets[:, [0, 2]].max(axis=1)) * \ (reg_targets[:, [1, 3]].min(axis=1) / \ reg_targets[:, [1, 3]].max(axis=1))).astype(np.float32) ctn_targets = np.reshape( ctn_targets, newshape=[ctn_targets.shape[0], 1]) ctn_targets[labels <= 0] = 0 pos_ind = np.nonzero(labels != 0) reg_targets_pos = reg_targets[pos_ind[0], :] split_sections = [] beg = 0 for lvl in range(len(num_points_each_level)): end = beg + num_points_each_level[lvl] split_sections.append(end) beg = end labels_by_level = np.split(labels, split_sections, axis=0) reg_targets_by_level = np.split(reg_targets, split_sections, axis=0) ctn_targets_by_level = np.split(ctn_targets, split_sections, axis=0) for lvl in range(len(self.downsample_ratios)): grid_w = int(np.ceil(w / self.downsample_ratios[lvl])) grid_h = int(np.ceil(h / self.downsample_ratios[lvl])) if self.norm_reg_targets: if self.multiply_strides_reg_targets: sample['reg_target{}'.format(lvl)] = np.reshape( reg_targets_by_level[lvl], newshape=[grid_h, grid_w, 4]) else: sample['reg_target{}'.format(lvl)] = \ np.reshape( reg_targets_by_level[lvl] / \ self.downsample_ratios[lvl], newshape=[grid_h, grid_w, 4]) else: sample['reg_target{}'.format(lvl)] = np.reshape( reg_targets_by_level[lvl], newshape=[grid_h, grid_w, 4]) sample['labels{}'.format(lvl)] = np.reshape( labels_by_level[lvl], newshape=[grid_h, grid_w, 1]) sample['centerness{}'.format(lvl)] = 
np.reshape( ctn_targets_by_level[lvl], newshape=[grid_h, grid_w, 1]) sample.pop('is_crowd', None) sample.pop('difficult', None) sample.pop('gt_class', None) sample.pop('gt_bbox', None) return samples @register_op class Gt2GFLTarget(BaseOperator): __shared__ = ['num_classes'] """ Generate GFocal loss targets by ground truth data """ def __init__(self, num_classes=80, downsample_ratios=[8, 16, 32, 64, 128], grid_cell_scale=4, cell_offset=0, compute_vlr_region=False): super(Gt2GFLTarget, self).__init__() self.num_classes = num_classes self.downsample_ratios = downsample_ratios self.grid_cell_scale = grid_cell_scale self.cell_offset = cell_offset self.compute_vlr_region = compute_vlr_region self.assigner = ATSSAssigner() def get_grid_cells(self, featmap_size, scale, stride, offset=0): """ Generate grid cells of a feature map for target assignment. Args: featmap_size: Size of a single level feature map. scale: Grid cell scale. stride: Down sample stride of the feature map. offset: Offset of grid cells. return: Grid_cells xyxy position. Size should be [feat_w * feat_h, 4] """ cell_size = stride * scale h, w = featmap_size x_range = (np.arange(w, dtype=np.float32) + offset) * stride y_range = (np.arange(h, dtype=np.float32) + offset) * stride x, y = np.meshgrid(x_range, y_range) y = y.flatten() x = x.flatten() grid_cells = np.stack( [ x - 0.5 * cell_size, y - 0.5 * cell_size, x + 0.5 * cell_size, y + 0.5 * cell_size ], axis=-1) return grid_cells def get_sample(self, assign_gt_inds, gt_bboxes): pos_inds = np.unique(np.nonzero(assign_gt_inds > 0)[0]) neg_inds = np.unique(np.nonzero(assign_gt_inds == 0)[0]) pos_assigned_gt_inds = assign_gt_inds[pos_inds] - 1 if gt_bboxes.size == 0: # hack for index error case assert pos_assigned_gt_inds.size == 0 pos_gt_bboxes = np.empty_like(gt_bboxes).reshape(-1, 4) else: if len(gt_bboxes.shape) < 2: # use reshape, not ndarray.resize: resize mutates in place and # returns None, so assigning its result would lose the array gt_bboxes = gt_bboxes.reshape(-1, 4) pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :] return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds def __call__(self, samples, context=None): assert len(samples) > 0 batch_size = len(samples) # get grid cells of image h, w = samples[0]['image'].shape[1:3] multi_level_grid_cells = [] for stride in self.downsample_ratios: featmap_size = (int(math.ceil(h / stride)), int(math.ceil(w / stride))) multi_level_grid_cells.append( self.get_grid_cells(featmap_size, self.grid_cell_scale, stride, self.cell_offset)) mlvl_grid_cells_list = [ multi_level_grid_cells for i in range(batch_size) ] # pixel cell number of multi-level feature maps num_level_cells = [ grid_cells.shape[0] for grid_cells in mlvl_grid_cells_list[0] ] num_level_cells_list = [num_level_cells] * batch_size # concat all level cells into a single array for i in range(batch_size): mlvl_grid_cells_list[i] = np.concatenate(mlvl_grid_cells_list[i]) # target assign on all images for sample, grid_cells, num_level_cells in zip( samples, mlvl_grid_cells_list, num_level_cells_list): gt_bboxes = sample['gt_bbox'] gt_labels = sample['gt_class'].squeeze() if gt_labels.size == 1: gt_labels = np.array([gt_labels]).astype(np.int32) gt_bboxes_ignore = None assign_gt_inds, _ = self.assigner(grid_cells, num_level_cells, gt_bboxes, gt_bboxes_ignore, gt_labels) if self.compute_vlr_region: vlr_region = self.assigner.get_vlr_region( grid_cells, num_level_cells, gt_bboxes, gt_bboxes_ignore, gt_labels) sample['vlr_regions'] = vlr_region pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds = self.get_sample( assign_gt_inds, gt_bboxes) num_cells = grid_cells.shape[0] bbox_targets =
np.zeros_like(grid_cells) bbox_weights = np.zeros_like(grid_cells) labels = np.ones([num_cells], dtype=np.int64) * self.num_classes label_weights = np.zeros([num_cells], dtype=np.float32) if len(pos_inds) > 0: pos_bbox_targets = pos_gt_bboxes bbox_targets[pos_inds, :] = pos_bbox_targets bbox_weights[pos_inds, :] = 1.0 if not np.any(gt_labels): labels[pos_inds] = 0 else: labels[pos_inds] = gt_labels[pos_assigned_gt_inds] label_weights[pos_inds] = 1.0 if len(neg_inds) > 0: label_weights[neg_inds] = 1.0 sample['grid_cells'] = grid_cells sample['labels'] = labels sample['label_weights'] = label_weights sample['bbox_targets'] = bbox_targets sample['pos_num'] = max(pos_inds.size, 1) sample.pop('is_crowd', None) sample.pop('difficult', None) sample.pop('gt_class', None) sample.pop('gt_bbox', None) sample.pop('gt_score', None) return samples @register_op class Gt2TTFTarget(BaseOperator): __shared__ = ['num_classes'] """ Gt2TTFTarget Generate TTFNet targets by ground truth data Args: num_classes(int): the number of classes. down_ratio(int): the down ratio from images to heatmap, 4 by default. alpha(float): the alpha parameter to generate gaussian target. 0.54 by default. """ def __init__(self, num_classes=80, down_ratio=4, alpha=0.54): super(Gt2TTFTarget, self).__init__() self.down_ratio = down_ratio self.num_classes = num_classes self.alpha = alpha def __call__(self, samples, context=None): output_size = samples[0]['image'].shape[1] feat_size = output_size // self.down_ratio for sample in samples: heatmap = np.zeros( (self.num_classes, feat_size, feat_size), dtype='float32') box_target = np.ones( (4, feat_size, feat_size), dtype='float32') * -1 reg_weight = np.zeros((1, feat_size, feat_size), dtype='float32') gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] bbox_w = gt_bbox[:, 2] - gt_bbox[:, 0] + 1 bbox_h = gt_bbox[:, 3] - gt_bbox[:, 1] + 1 area = bbox_w * bbox_h boxes_areas_log = np.log(area) boxes_ind = np.argsort(boxes_areas_log, axis=0)[::-1] boxes_area_topk_log = boxes_areas_log[boxes_ind] gt_bbox = gt_bbox[boxes_ind] gt_class = gt_class[boxes_ind] feat_gt_bbox = gt_bbox / self.down_ratio feat_gt_bbox = np.clip(feat_gt_bbox, 0, feat_size - 1) feat_hs, feat_ws = (feat_gt_bbox[:, 3] - feat_gt_bbox[:, 1], feat_gt_bbox[:, 2] - feat_gt_bbox[:, 0]) ct_inds = np.stack( [(gt_bbox[:, 0] + gt_bbox[:, 2]) / 2, (gt_bbox[:, 1] + gt_bbox[:, 3]) / 2], axis=1) / self.down_ratio h_radiuses_alpha = (feat_hs / 2. * self.alpha).astype('int32') w_radiuses_alpha = (feat_ws / 2. 
* self.alpha).astype('int32') for k in range(len(gt_bbox)): cls_id = gt_class[k] fake_heatmap = np.zeros((feat_size, feat_size), dtype='float32') self.draw_truncate_gaussian(fake_heatmap, ct_inds[k], h_radiuses_alpha[k], w_radiuses_alpha[k]) heatmap[cls_id] = np.maximum(heatmap[cls_id], fake_heatmap) box_target_inds = fake_heatmap > 0 box_target[:, box_target_inds] = gt_bbox[k][:, None] local_heatmap = fake_heatmap[box_target_inds] ct_div = np.sum(local_heatmap) local_heatmap *= boxes_area_topk_log[k] reg_weight[0, box_target_inds] = local_heatmap / ct_div sample['ttf_heatmap'] = heatmap sample['ttf_box_target'] = box_target sample['ttf_reg_weight'] = reg_weight sample.pop('is_crowd', None) sample.pop('difficult', None) sample.pop('gt_class', None) sample.pop('gt_bbox', None) sample.pop('gt_score', None) return samples def draw_truncate_gaussian(self, heatmap, center, h_radius, w_radius): h, w = 2 * h_radius + 1, 2 * w_radius + 1 sigma_x = w / 6 sigma_y = h / 6 gaussian = gaussian2D((h, w), sigma_x, sigma_y) x, y = int(center[0]), int(center[1]) height, width = heatmap.shape[0:2] left, right = min(x, w_radius), min(width - x, w_radius + 1) top, bottom = min(y, h_radius), min(height - y, h_radius + 1) masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] masked_gaussian = gaussian[h_radius - top:h_radius + bottom, w_radius - left:w_radius + right] if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: heatmap[y - top:y + bottom, x - left:x + right] = np.maximum( masked_heatmap, masked_gaussian) return heatmap @register_op class Gt2Solov2Target(BaseOperator): """Assign mask target and labels in SOLOv2 network. The code of this function is based on: https://github.com/WXinlong/SOLO/blob/master/mmdet/models/anchor_heads/solov2_head.py#L271 Args: num_grids (list): The list of feature map grids size. scale_ranges (list): The list of mask boundary range. coord_sigma (float): The coefficient of coordinate area length. sampling_ratio (float): The ratio of down sampling. 
""" def __init__(self, num_grids=[40, 36, 24, 16, 12], scale_ranges=[[1, 96], [48, 192], [96, 384], [192, 768], [384, 2048]], coord_sigma=0.2, sampling_ratio=4.0): super(Gt2Solov2Target, self).__init__() self.num_grids = num_grids self.scale_ranges = scale_ranges self.coord_sigma = coord_sigma self.sampling_ratio = sampling_ratio def _scale_size(self, im, scale): h, w = im.shape[:2] new_size = (int(w * float(scale) + 0.5), int(h * float(scale) + 0.5)) resized_img = cv2.resize( im, None, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR) return resized_img def __call__(self, samples, context=None): sample_id = 0 max_ins_num = [0] * len(self.num_grids) for sample in samples: gt_bboxes_raw = sample['gt_bbox'] gt_labels_raw = sample['gt_class'] + 1 im_c, im_h, im_w = sample['image'].shape[:] gt_masks_raw = sample['gt_segm'].astype(np.uint8) mask_feat_size = [ int(im_h / self.sampling_ratio), int(im_w / self.sampling_ratio) ] gt_areas = np.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) * (gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1])) ins_ind_label_list = [] idx = 0 for (lower_bound, upper_bound), num_grid \ in zip(self.scale_ranges, self.num_grids): hit_indices = ((gt_areas >= lower_bound) & (gt_areas <= upper_bound)).nonzero()[0] num_ins = len(hit_indices) ins_label = [] grid_order = [] cate_label = np.zeros([num_grid, num_grid], dtype=np.int64) ins_ind_label = np.zeros([num_grid**2], dtype=np.bool_) if num_ins == 0: ins_label = np.zeros( [1, mask_feat_size[0], mask_feat_size[1]], dtype=np.uint8) ins_ind_label_list.append(ins_ind_label) sample['cate_label{}'.format(idx)] = cate_label.flatten() sample['ins_label{}'.format(idx)] = ins_label sample['grid_order{}'.format(idx)] = np.asarray( [sample_id * num_grid * num_grid + 0], dtype=np.int32) idx += 1 continue gt_bboxes = gt_bboxes_raw[hit_indices] gt_labels = gt_labels_raw[hit_indices] gt_masks = gt_masks_raw[hit_indices, ...] half_ws = 0.5 * ( gt_bboxes[:, 2] - gt_bboxes[:, 0]) * self.coord_sigma half_hs = 0.5 * ( gt_bboxes[:, 3] - gt_bboxes[:, 1]) * self.coord_sigma for seg_mask, gt_label, half_h, half_w in zip( gt_masks, gt_labels, half_hs, half_ws): if seg_mask.sum() == 0: continue # mass center upsampled_size = (mask_feat_size[0] * 4, mask_feat_size[1] * 4) center_h, center_w = ndimage.measurements.center_of_mass( seg_mask) coord_w = int( (center_w / upsampled_size[1]) // (1. / num_grid)) coord_h = int( (center_h / upsampled_size[0]) // (1. / num_grid)) # left, top, right, down top_box = max(0, int(((center_h - half_h) / upsampled_size[0]) // (1. / num_grid))) down_box = min(num_grid - 1, int(((center_h + half_h) / upsampled_size[0]) // (1. / num_grid))) left_box = max(0, int(((center_w - half_w) / upsampled_size[1]) // (1. / num_grid))) right_box = min(num_grid - 1, int(((center_w + half_w) / upsampled_size[1]) // (1. / num_grid))) top = max(top_box, coord_h - 1) down = min(down_box, coord_h + 1) left = max(coord_w - 1, left_box) right = min(right_box, coord_w + 1) cate_label[top:(down + 1), left:(right + 1)] = gt_label seg_mask = self._scale_size( seg_mask, scale=1. 
/ self.sampling_ratio) for i in range(top, down + 1): for j in range(left, right + 1): label = int(i * num_grid + j) cur_ins_label = np.zeros( [mask_feat_size[0], mask_feat_size[1]], dtype=np.uint8) cur_ins_label[:seg_mask.shape[0], :seg_mask.shape[ 1]] = seg_mask ins_label.append(cur_ins_label) ins_ind_label[label] = True grid_order.append(sample_id * num_grid * num_grid + label) if ins_label == []: ins_label = np.zeros( [1, mask_feat_size[0], mask_feat_size[1]], dtype=np.uint8) ins_ind_label_list.append(ins_ind_label) sample['cate_label{}'.format(idx)] = cate_label.flatten() sample['ins_label{}'.format(idx)] = ins_label sample['grid_order{}'.format(idx)] = np.asarray( [sample_id * num_grid * num_grid + 0], dtype=np.int32) else: ins_label = np.stack(ins_label, axis=0) ins_ind_label_list.append(ins_ind_label) sample['cate_label{}'.format(idx)] = cate_label.flatten() sample['ins_label{}'.format(idx)] = ins_label sample['grid_order{}'.format(idx)] = np.asarray( grid_order, dtype=np.int32) assert len(grid_order) > 0 max_ins_num[idx] = max( max_ins_num[idx], sample['ins_label{}'.format(idx)].shape[0]) idx += 1 ins_ind_labels = np.concatenate([ ins_ind_labels_level_img for ins_ind_labels_level_img in ins_ind_label_list ]) fg_num = np.sum(ins_ind_labels) sample['fg_num'] = fg_num sample_id += 1 sample.pop('is_crowd') sample.pop('gt_class') sample.pop('gt_bbox') sample.pop('gt_poly') sample.pop('gt_segm') # padding batch for data in samples: for idx in range(len(self.num_grids)): gt_ins_data = np.zeros( [ max_ins_num[idx], data['ins_label{}'.format(idx)].shape[1], data['ins_label{}'.format(idx)].shape[2] ], dtype=np.uint8) gt_ins_data[0:data['ins_label{}'.format(idx)].shape[ 0], :, :] = data['ins_label{}'.format(idx)] gt_grid_order = np.zeros([max_ins_num[idx]], dtype=np.int32) gt_grid_order[0:data['grid_order{}'.format(idx)].shape[ 0]] = data['grid_order{}'.format(idx)] data['ins_label{}'.format(idx)] = gt_ins_data data['grid_order{}'.format(idx)] = gt_grid_order return samples @register_op class Gt2SparseTarget(BaseOperator): def __init__(self, use_padding_shape=False): super(Gt2SparseTarget, self).__init__() self.use_padding_shape = use_padding_shape def __call__(self, samples, context=None): for sample in samples: ori_h, ori_w = sample['h'], sample['w'] if self.use_padding_shape: h, w = sample["image"].shape[1:3] if "scale_factor" in sample: sf_w, sf_h = sample["scale_factor"][1], sample[ "scale_factor"][0] sample["scale_factor_whwh"] = np.array( [sf_w, sf_h, sf_w, sf_h], dtype=np.float32) else: sample["scale_factor_whwh"] = np.array( [1.0, 1.0, 1.0, 1.0], dtype=np.float32) else: h, w = round(sample['im_shape'][0]), round(sample['im_shape'][ 1]) sample["scale_factor_whwh"] = np.array( [w / ori_w, h / ori_h, w / ori_w, h / ori_h], dtype=np.float32) sample["img_whwh"] = np.array([w, h, w, h], dtype=np.float32) sample["ori_shape"] = np.array([ori_h, ori_w], dtype=np.int32) return samples @register_op class PadMaskBatch(BaseOperator): """ Pad a batch of samples so that they can be divisible by a stride. The layout of each image should be 'CHW'. Args: pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure height and width is divisible by `pad_to_stride`. return_pad_mask (bool): If `return_pad_mask = True`, return `pad_mask` for transformer. 
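        Example (illustrative): with `pad_to_stride=32`, CHW images of shapes
        (3, 500, 600) and (3, 480, 640) in one batch are zero-padded to
        (3, 512, 640), and each sample gains a 'pad_mask' that is 1.0 over
        its valid (unpadded) region.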
""" def __init__(self, pad_to_stride=0, return_pad_mask=True): super(PadMaskBatch, self).__init__() self.pad_to_stride = pad_to_stride self.return_pad_mask = return_pad_mask def __call__(self, samples, context=None): """ Args: samples (list): a batch of sample, each is dict. """ coarsest_stride = self.pad_to_stride max_shape = np.array([data['image'].shape for data in samples]).max( axis=0) if coarsest_stride > 0: max_shape[1] = int( np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride) max_shape[2] = int( np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride) for data in samples: im = data['image'] im_c, im_h, im_w = im.shape[:] padding_im = np.zeros( (im_c, max_shape[1], max_shape[2]), dtype=np.float32) padding_im[:, :im_h, :im_w] = im.astype(np.float32) data['image'] = padding_im if 'semantic' in data and data['semantic'] is not None: semantic = data['semantic'] padding_sem = np.zeros( (1, max_shape[1], max_shape[2]), dtype=np.float32) padding_sem[:, :im_h, :im_w] = semantic data['semantic'] = padding_sem if 'gt_segm' in data and data['gt_segm'] is not None: gt_segm = data['gt_segm'] padding_segm = np.zeros( (gt_segm.shape[0], max_shape[1], max_shape[2]), dtype=np.uint8) padding_segm[:, :im_h, :im_w] = gt_segm data['gt_segm'] = padding_segm if self.return_pad_mask: padding_mask = np.zeros( (max_shape[1], max_shape[2]), dtype=np.float32) padding_mask[:im_h, :im_w] = 1. data['pad_mask'] = padding_mask return samples @register_op class Gt2CenterNetTarget(BaseOperator): __shared__ = ['num_classes'] """Gt2CenterNetTarget Genterate CenterNet targets by ground-truth Args: down_ratio (int): The down sample ratio between output feature and input image. num_classes (int): The number of classes, 80 by default. max_objs (int): The maximum objects detected, 128 by default. """ def __init__(self, num_classes=80, down_ratio=4, max_objs=128): super(Gt2CenterNetTarget, self).__init__() self.nc = num_classes self.down_ratio = down_ratio self.max_objs = max_objs def __call__(self, sample, context=None): input_h, input_w = sample['image'].shape[1:] output_h = input_h // self.down_ratio output_w = input_w // self.down_ratio gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] hm = np.zeros((self.nc, output_h, output_w), dtype=np.float32) wh = np.zeros((self.max_objs, 2), dtype=np.float32) reg = np.zeros((self.max_objs, 2), dtype=np.float32) ind = np.zeros((self.max_objs), dtype=np.int64) reg_mask = np.zeros((self.max_objs), dtype=np.int32) cat_spec_wh = np.zeros((self.max_objs, self.nc * 2), dtype=np.float32) cat_spec_mask = np.zeros((self.max_objs, self.nc * 2), dtype=np.int32) trans_output = get_affine_transform( center=sample['center'], input_size=[sample['scale'], sample['scale']], rot=0, output_size=[output_w, output_h]) gt_det = [] for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)): cls = int(cls) bbox[:2] = affine_transform(bbox[:2], trans_output) bbox[2:] = affine_transform(bbox[2:], trans_output) bbox_amodal = copy.deepcopy(bbox) bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1) bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1) h, w = bbox[3] - bbox[1], bbox[2] - bbox[0] if h > 0 and w > 0: radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7) radius = max(0, int(radius)) ct = np.array( [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32) ct_int = ct.astype(np.int32) # get hm,wh,reg,ind,ind_mask draw_umich_gaussian(hm[cls], ct_int, radius) wh[i] = 1. * w, 1. 
* h reg[i] = ct - ct_int ind[i] = ct_int[1] * output_w + ct_int[0] reg_mask[i] = 1 cat_spec_wh[i, cls * 2:cls * 2 + 2] = wh[i] cat_spec_mask[i, cls * 2:cls * 2 + 2] = 1 gt_det.append([ ct[0] - w / 2, ct[1] - h / 2, ct[0] + w / 2, ct[1] + h / 2, 1, cls ]) sample.pop('gt_bbox', None) sample.pop('gt_class', None) sample.pop('center', None) sample.pop('scale', None) sample.pop('is_crowd', None) sample.pop('difficult', None) sample['index'] = ind sample['index_mask'] = reg_mask sample['heatmap'] = hm sample['size'] = wh sample['offset'] = reg return sample @register_op class PadGT(BaseOperator): """ Pad 0 to `gt_class`, `gt_bbox`, `gt_score`... The num_max_boxes is the largest for batch. Args: return_gt_mask (bool): If true, return `pad_gt_mask`, 1 means bbox, 0 means no bbox. """ def __init__(self, return_gt_mask=True, pad_img=False, minimum_gtnum=0, only_origin_box=False): super(PadGT, self).__init__() self.return_gt_mask = return_gt_mask self.pad_img = pad_img self.minimum_gtnum = minimum_gtnum self.only_origin_box = only_origin_box def _impad(self, img: np.ndarray, *, shape=None, padding=None, pad_val=0, padding_mode='constant') -> np.ndarray: """Pad the given image to a certain shape or pad on all sides with specified padding mode and padding value. Args: img (ndarray): Image to be padded. shape (tuple[int]): Expected padding shape (h, w). Default: None. padding (int or tuple[int]): Padding on each border. If a single int is provided this is used to pad all borders. If tuple of length 2 is provided this is the padding on left/right and top/bottom respectively. If a tuple of length 4 is provided this is the padding for the left, top, right and bottom borders respectively. Default: None. Note that `shape` and `padding` can not be both set. pad_val (Number | Sequence[Number]): Values to be filled in padding areas when padding_mode is 'constant'. Default: 0. padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. Default: constant. - constant: pads with a constant value, this value is specified with pad_val. - edge: pads with the last value at the edge of the image. - reflect: pads with reflection of image without repeating the last value on the edge. For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode will result in [3, 2, 1, 2, 3, 4, 3, 2]. - symmetric: pads with reflection of image repeating the last value on the edge. For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode will result in [2, 1, 1, 2, 3, 4, 4, 3] Returns: ndarray: The padded image. """ assert (shape is not None) ^ (padding is not None) if shape is not None: width = max(shape[1] - img.shape[1], 0) height = max(shape[0] - img.shape[0], 0) padding = (0, 0, int(width), int(height)) # check pad_val import numbers if isinstance(pad_val, tuple): assert len(pad_val) == img.shape[-1] elif not isinstance(pad_val, numbers.Number): raise TypeError('pad_val must be a int or a tuple. ' f'But received {type(pad_val)}') # check padding if isinstance(padding, tuple) and len(padding) in [2, 4]: if len(padding) == 2: padding = (padding[0], padding[1], padding[0], padding[1]) elif isinstance(padding, numbers.Number): padding = (padding, padding, padding, padding) else: raise ValueError('Padding must be a int or a 2, or 4 element tuple.' 
f'But received {padding}') # check padding mode assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'] border_type = { 'constant': cv2.BORDER_CONSTANT, 'edge': cv2.BORDER_REPLICATE, 'reflect': cv2.BORDER_REFLECT_101, 'symmetric': cv2.BORDER_REFLECT } img = cv2.copyMakeBorder( img, padding[1], padding[3], padding[0], padding[2], border_type[padding_mode], value=pad_val) return img def checkmaxshape(self, samples): maxh, maxw = 0, 0 for sample in samples: h, w = sample['im_shape'] if h > maxh: maxh = h if w > maxw: maxw = w return (maxh, maxw) def __call__(self, samples, context=None): num_max_boxes = max([len(s['gt_bbox']) for s in samples]) num_max_boxes = max(self.minimum_gtnum, num_max_boxes) if self.pad_img: maxshape = self.checkmaxshape(samples) if self.only_origin_box: for sample in samples: if self.pad_img: img = sample['image'] padimg = self._impad(img, shape=maxshape) sample['image'] = padimg if self.return_gt_mask: sample['pad_origin_gt_mask'] = np.zeros( (num_max_boxes, 1), dtype=np.float32) if num_max_boxes == 0: continue num_gt = len(sample['origin_gt_bbox']) pad_origin_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32) pad_origin_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32) if num_gt > 0: pad_origin_gt_class[:num_gt] = sample['origin_gt_class'] pad_origin_gt_bbox[:num_gt] = sample['origin_gt_bbox'] sample['origin_gt_class'] = pad_origin_gt_class sample['origin_gt_bbox'] = pad_origin_gt_bbox if 'pad_origin_gt_mask' in sample: sample['pad_origin_gt_mask'][:num_gt] = 1 else: for sample in samples: if self.pad_img: img = sample['image'] padimg = self._impad(img, shape=maxshape) sample['image'] = padimg if self.return_gt_mask: sample['pad_gt_mask'] = np.zeros( (num_max_boxes, 1), dtype=np.float32) if num_max_boxes == 0: continue num_gt = len(sample['gt_bbox']) pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32) pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32) if num_gt > 0: pad_gt_class[:num_gt] = sample['gt_class'] pad_gt_bbox[:num_gt] = sample['gt_bbox'] sample['gt_class'] = pad_gt_class sample['gt_bbox'] = pad_gt_bbox # pad_gt_mask if 'pad_gt_mask' in sample: sample['pad_gt_mask'][:num_gt] = 1 # gt_score if 'gt_score' in sample: pad_gt_score = np.zeros((num_max_boxes, 1), dtype=np.float32) if num_gt > 0: pad_gt_score[:num_gt] = sample['gt_score'] sample['gt_score'] = pad_gt_score if 'is_crowd' in sample: pad_is_crowd = np.zeros((num_max_boxes, 1), dtype=np.int32) if num_gt > 0: pad_is_crowd[:num_gt] = sample['is_crowd'] sample['is_crowd'] = pad_is_crowd if 'difficult' in sample: pad_diff = np.zeros((num_max_boxes, 1), dtype=np.int32) if num_gt > 0: pad_diff[:num_gt] = sample['difficult'] sample['difficult'] = pad_diff if 'gt_joints' in sample: num_joints = sample['gt_joints'].shape[1] pad_gt_joints = np.zeros( (num_max_boxes, num_joints, 3), dtype=np.float32) if num_gt > 0: pad_gt_joints[:num_gt] = sample['gt_joints'] sample['gt_joints'] = pad_gt_joints if 'gt_areas' in sample: pad_gt_areas = np.zeros((num_max_boxes, 1), dtype=np.float32) if num_gt > 0: pad_gt_areas[:num_gt, 0] = sample['gt_areas'] sample['gt_areas'] = pad_gt_areas # gt_segm if 'gt_segm' in sample: pad_gt_segm = np.zeros( (num_max_boxes, *sample['gt_segm'].shape[-2:]), dtype=np.uint8) if num_gt > 0: pad_gt_segm[:num_gt] = sample['gt_segm'] sample['gt_segm'] = pad_gt_segm.astype(np.float32) return samples @register_op class PadRGT(BaseOperator): """ Pad 0 to `gt_class`, `gt_bbox`, `gt_score`... The num_max_boxes is the largest for batch. 
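    Compared with PadGT, this operator additionally pads the rotated-box
    related fields ('gt_poly' and 'gt_rbox') through `pad_field`, as the
    names/dims lists in `__call__` below show.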
    Args:
        return_gt_mask (bool): If true, return `pad_gt_mask`,
                               1 means bbox, 0 means no bbox.
    """

    def __init__(self, return_gt_mask=True):
        super(PadRGT, self).__init__()
        self.return_gt_mask = return_gt_mask

    def pad_field(self, sample, field, num_gt):
        name, shape, dtype = field
        if name in sample:
            pad_v = np.zeros(shape, dtype=dtype)
            if num_gt > 0:
                pad_v[:num_gt] = sample[name]
            sample[name] = pad_v

    def __call__(self, samples, context=None):
        num_max_boxes = max([len(s['gt_bbox']) for s in samples])
        for sample in samples:
            if self.return_gt_mask:
                sample['pad_gt_mask'] = np.zeros(
                    (num_max_boxes, 1), dtype=np.float32)
            if num_max_boxes == 0:
                continue

            num_gt = len(sample['gt_bbox'])
            pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32)
            pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32)
            if num_gt > 0:
                pad_gt_class[:num_gt] = sample['gt_class']
                pad_gt_bbox[:num_gt] = sample['gt_bbox']
            sample['gt_class'] = pad_gt_class
            sample['gt_bbox'] = pad_gt_bbox
            # pad_gt_mask
            if 'pad_gt_mask' in sample:
                sample['pad_gt_mask'][:num_gt] = 1
            # gt_score
            names = ['gt_score', 'is_crowd', 'difficult', 'gt_poly', 'gt_rbox']
            dims = [1, 1, 1, 8, 5]
            dtypes = [np.float32, np.int32, np.int32, np.float32, np.float32]
            for name, dim, dtype in zip(names, dims, dtypes):
                self.pad_field(sample, [name, (num_max_boxes, dim), dtype],
                               num_gt)

        return samples


@register_op
class Gt2CenterTrackTarget(BaseOperator):
    __shared__ = ['num_classes']
    """Gt2CenterTrackTarget
    Generate CenterTrack targets from ground-truth data.

    Args:
        num_classes (int): The number of classes, 1 by default.
        down_ratio (int): The down sample ratio between output feature and
                          input image.
        max_objs (int): The maximum objects detected, 256 by default.
    """

    def __init__(self,
                 num_classes=1,
                 down_ratio=4,
                 max_objs=256,
                 hm_disturb=0.05,
                 lost_disturb=0.4,
                 fp_disturb=0.1,
                 pre_hm=True,
                 add_tracking=True,
                 add_ltrb_amodal=True):
        super(Gt2CenterTrackTarget, self).__init__()
        self.nc = num_classes
        self.down_ratio = down_ratio
        self.max_objs = max_objs

        self.hm_disturb = hm_disturb
        self.lost_disturb = lost_disturb
        self.fp_disturb = fp_disturb
        self.pre_hm = pre_hm
        self.add_tracking = add_tracking
        self.add_ltrb_amodal = add_ltrb_amodal

    def _get_pre_dets(self, input_h, input_w, trans_input_pre, gt_bbox_pre,
                      gt_class_pre, gt_track_id_pre):
        hm_h, hm_w = input_h, input_w
        return_hm = self.pre_hm
        pre_hm = np.zeros(
            (1, hm_h, hm_w), dtype=np.float32) if return_hm else None
        pre_cts, track_ids = [], []
        for i, (
                bbox, cls, track_id
        ) in enumerate(zip(gt_bbox_pre, gt_class_pre, gt_track_id_pre)):
            cls = int(cls)
            bbox[:2] = affine_transform(bbox[:2], trans_input_pre)
            bbox[2:] = affine_transform(bbox[2:], trans_input_pre)
            bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, hm_w - 1)
            bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, hm_h - 1)
            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
            max_rad = 1
            if (h > 0 and w > 0):
                radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)
                radius = max(0, int(radius))
                max_rad = max(max_rad, radius)
                ct = np.array(
                    [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
                    dtype=np.float32)
                ct0 = ct.copy()
                conf = 1

                ct[0] = ct[0] + np.random.randn() * self.hm_disturb * w
                ct[1] = ct[1] + np.random.randn() * self.hm_disturb * h
                conf = 1 if np.random.rand() > self.lost_disturb else 0

                ct_int = ct.astype(np.int32)
                if conf == 0:
                    pre_cts.append(ct / self.down_ratio)
                else:
                    pre_cts.append(ct0 / self.down_ratio)

                track_ids.append(track_id)
                if return_hm:
                    draw_umich_gaussian(pre_hm[0], ct_int, radius, k=conf)

                if np.random.rand() < self.fp_disturb and return_hm:
                    ct2 = ct0.copy()  # Hard code heatmap
disturb ratio, haven't tried other numbers. ct2[0] = ct2[0] + np.random.randn() * 0.05 * w ct2[1] = ct2[1] + np.random.randn() * 0.05 * h ct2_int = ct2.astype(np.int32) draw_umich_gaussian(pre_hm[0], ct2_int, radius, k=conf) return pre_hm, pre_cts, track_ids def __call__(self, sample, context=None): input_h, input_w = sample['image'].shape[1:] output_h = input_h // self.down_ratio output_w = input_w // self.down_ratio gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] # init hm = np.zeros((self.nc, output_h, output_w), dtype=np.float32) wh = np.zeros((self.max_objs, 2), dtype=np.float32) reg = np.zeros((self.max_objs, 2), dtype=np.float32) ind = np.zeros((self.max_objs), dtype=np.int64) reg_mask = np.zeros((self.max_objs), dtype=np.int32) if self.add_tracking: tr = np.zeros((self.max_objs, 2), dtype=np.float32) if self.add_ltrb_amodal: ltrb_amodal = np.zeros((self.max_objs, 4), dtype=np.float32) trans_output = get_affine_transform( center=sample['center'], input_size=[sample['scale'], sample['scale']], rot=0, output_size=[output_w, output_h]) pre_hm, pre_cts, track_ids = self._get_pre_dets( input_h, input_w, sample['trans_input'], sample['pre_gt_bbox'], sample['pre_gt_class'], sample['pre_gt_track_id']) for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)): cls = int(cls) rect = np.array( [[bbox[0], bbox[1]], [bbox[0], bbox[3]], [bbox[2], bbox[3]], [bbox[2], bbox[1]]], dtype=np.float32) for t in range(4): rect[t] = affine_transform(rect[t], trans_output) bbox[:2] = rect[:, 0].min(), rect[:, 1].min() bbox[2:] = rect[:, 0].max(), rect[:, 1].max() bbox_amodal = copy.deepcopy(bbox) bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1) bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1) h, w = bbox[3] - bbox[1], bbox[2] - bbox[0] if h > 0 and w > 0: radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7) radius = max(0, int(radius)) ct = np.array( [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32) ct_int = ct.astype(np.int32) # get hm,wh,reg,ind,ind_mask draw_umich_gaussian(hm[cls], ct_int, radius) wh[i] = 1. * w, 1. * h reg[i] = ct - ct_int ind[i] = ct_int[1] * output_w + ct_int[0] reg_mask[i] = 1 if self.add_tracking: if sample['gt_track_id'][i] in track_ids: pre_ct = pre_cts[track_ids.index(sample['gt_track_id'][ i])] tr[i] = pre_ct - ct_int if self.add_ltrb_amodal: ltrb_amodal[i] = \ bbox_amodal[0] - ct_int[0], bbox_amodal[1] - ct_int[1], \ bbox_amodal[2] - ct_int[0], bbox_amodal[3] - ct_int[1] new_sample = {'image': sample['image']} new_sample['index'] = ind new_sample['index_mask'] = reg_mask new_sample['heatmap'] = hm new_sample['size'] = wh new_sample['offset'] = reg if self.add_tracking: new_sample['tracking'] = tr if self.add_ltrb_amodal: new_sample['ltrb_amodal'] = ltrb_amodal new_sample['pre_image'] = sample['pre_image'] new_sample['pre_hm'] = pre_hm del sample return new_sample @register_op class BatchRandomResizeForSSOD(BaseOperator): """ Resize image to target size randomly. 
    The target size and the interpolation method can both be sampled at
    random.

    Args:
        target_size (int, list, tuple): image target size, if random size is
            True, must be list or tuple
        keep_ratio (bool): whether to keep the aspect ratio or not,
            default True
        interp (int): the interpolation method
        random_size (bool): whether to randomly select a target size of image
        random_interp (bool): whether to randomly select an interpolation
            method
    """

    def __init__(self,
                 target_size,
                 keep_ratio,
                 interp=cv2.INTER_NEAREST,
                 random_size=True,
                 random_interp=False):
        super(BatchRandomResizeForSSOD, self).__init__()
        self.keep_ratio = keep_ratio
        self.interps = [
            cv2.INTER_NEAREST,
            cv2.INTER_LINEAR,
            cv2.INTER_AREA,
            cv2.INTER_CUBIC,
            cv2.INTER_LANCZOS4,
        ]
        self.interp = interp
        assert isinstance(target_size, (
            int, Sequence)), "target_size must be int, list or tuple"
        if random_size and not isinstance(target_size, list):
            raise TypeError(
                "Type of target_size is invalid when random_size is True. Must be List, now is {}".
                format(type(target_size)))
        self.target_size = target_size
        self.random_size = random_size
        self.random_interp = random_interp

    def __call__(self, samples, context=None):
        # `index` records which candidate size was picked so that a paired
        # batch can be resized to the same size by passing it back in as
        # `context`; it is None when a fixed target_size is used.
        if self.random_size:
            index = np.random.choice(len(self.target_size))
            target_size = self.target_size[index]
        else:
            index = None
            target_size = self.target_size
        if context is not None:
            index = context
            target_size = self.target_size[context]

        if self.random_interp:
            interp = np.random.choice(self.interps)
        else:
            interp = self.interp
        resizer = Resize(target_size, keep_ratio=self.keep_ratio, interp=interp)
        return [resizer(samples, context=context), index]


================================================
FILE: ppdet/data/transform/culane_operators.py
================================================
import numpy as np
import imgaug.augmenters as iaa

from .operators import BaseOperator, register_op
from ppdet.utils.logger import setup_logger
from ppdet.data.culane_utils import linestrings_to_lanes, transform_annotation

logger = setup_logger(__name__)

__all__ = [
    "CULaneTrainProcess", "CULaneDataProcess", "HorizontalFlip",
    "ChannelShuffle", "CULaneAffine", "CULaneResize", "OneOfBlur",
    "MultiplyAndAddToBrightness", "AddToHueAndSaturation"
]


def trainTransforms(img_h, img_w):
    transforms = [{
        'name': 'Resize',
        'parameters': dict(size=dict(
            height=img_h, width=img_w)),
        'p': 1.0
    }, {
        'name': 'HorizontalFlip',
        'parameters': dict(p=1.0),
        'p': 0.5
    }, {
        'name': 'ChannelShuffle',
        'parameters': dict(p=1.0),
        'p': 0.1
    }, {
        'name': 'MultiplyAndAddToBrightness',
        'parameters': dict(
            mul=(0.85, 1.15), add=(-10, 10)),
        'p': 0.6
    }, {
        'name': 'AddToHueAndSaturation',
        'parameters': dict(value=(-10, 10)),
        'p': 0.7
    }, {
        'name': 'OneOf',
        'transforms': [
            dict(
                name='MotionBlur', parameters=dict(k=(3, 5))),
            dict(
                name='MedianBlur', parameters=dict(k=(3, 5)))
        ],
        'p': 0.2
    }, {
        'name': 'Affine',
        'parameters': dict(
            translate_percent=dict(
                x=(-0.1, 0.1), y=(-0.1, 0.1)),
            rotate=(-10, 10),
            scale=(0.8, 1.2)),
        'p': 0.7
    }, {
        'name': 'Resize',
        'parameters': dict(size=dict(
            height=img_h, width=img_w)),
        'p': 1.0
    }]
    return transforms


@register_op
class CULaneTrainProcess(BaseOperator):
    def __init__(self, img_w, img_h):
        super(CULaneTrainProcess, self).__init__()
        self.img_w = img_w
        self.img_h = img_h
        self.transforms = trainTransforms(self.img_h, self.img_w)

        if self.transforms is not None:
            img_transforms = []
            for aug in self.transforms:
                p = aug['p']
                if aug['name'] != 'OneOf':
                    img_transforms.append(
                        iaa.Sometimes(
                            p=p,
                            then_list=getattr(iaa, aug['name'])(**aug[
                                'parameters'])))
                else:
                    img_transforms.append(
                        iaa.Sometimes(
                            p=p,
                            then_list=iaa.OneOf([
                                getattr(iaa,
aug_['name'])(**aug_['parameters']) for aug_ in aug['transforms'] ]))) else: img_transforms = [] self.iaa_transform = iaa.Sequential(img_transforms) def apply(self, sample, context=None): img, line_strings, seg = self.iaa_transform( image=sample['image'], line_strings=sample['lanes'], segmentation_maps=sample['mask']) sample['image'] = img sample['lanes'] = line_strings sample['mask'] = seg return sample @register_op class CULaneDataProcess(BaseOperator): def __init__(self, img_w, img_h, num_points, max_lanes): super(CULaneDataProcess, self).__init__() self.img_w = img_w self.img_h = img_h self.num_points = num_points self.n_offsets = num_points self.n_strips = num_points - 1 self.strip_size = self.img_h / self.n_strips self.max_lanes = max_lanes self.offsets_ys = np.arange(self.img_h, -1, -self.strip_size) def apply(self, sample, context=None): data = {} line_strings = sample['lanes'] line_strings.clip_out_of_image_() new_anno = {'lanes': linestrings_to_lanes(line_strings)} for i in range(30): try: annos = transform_annotation( self.img_w, self.img_h, self.max_lanes, self.n_offsets, self.offsets_ys, self.n_strips, self.strip_size, new_anno) label = annos['label'] lane_endpoints = annos['lane_endpoints'] break except: if (i + 1) == 30: logger.critical('Transform annotation failed 30 times :(') exit() sample['image'] = sample['image'].astype(np.float32) / 255. data['image'] = sample['image'].transpose(2, 0, 1) data['lane_line'] = label data['seg'] = sample['seg'] data['full_img_path'] = sample['full_img_path'] data['img_name'] = sample['img_name'] data['im_id'] = sample['im_id'] if 'mask' in sample.keys(): data['seg'] = sample['mask'].get_arr() data['im_shape'] = np.array([self.img_w, self.img_h], dtype=np.float32) data['scale_factor'] = np.array([1., 1.], dtype=np.float32) return data @register_op class CULaneResize(BaseOperator): def __init__(self, img_h, img_w, prob=0.5): super(CULaneResize, self).__init__() self.img_h = img_h self.img_w = img_w self.prob = prob def apply(self, sample, context=None): transform = iaa.Sometimes(self.prob, iaa.Resize({ "height": self.img_h, "width": self.img_w })) if 'mask' in sample.keys(): img, line_strings, seg = transform( image=sample['image'], line_strings=sample['lanes'], segmentation_maps=sample['mask']) sample['image'] = img sample['lanes'] = line_strings sample['mask'] = seg else: img, line_strings = transform( image=sample['image'].copy().astype(np.uint8), line_strings=sample['lanes']) sample['image'] = img sample['lanes'] = line_strings return sample @register_op class HorizontalFlip(BaseOperator): def __init__(self, prob=0.5): super(HorizontalFlip, self).__init__() self.prob = prob def apply(self, sample, context=None): transform = iaa.Sometimes(self.prob, iaa.HorizontalFlip(1.0)) if 'mask' in sample.keys(): img, line_strings, seg = transform( image=sample['image'], line_strings=sample['lanes'], segmentation_maps=sample['mask']) sample['image'] = img sample['lanes'] = line_strings sample['mask'] = seg else: img, line_strings = transform( image=sample['image'], line_strings=sample['lanes']) sample['image'] = img sample['lanes'] = line_strings return sample @register_op class ChannelShuffle(BaseOperator): def __init__(self, prob=0.1): super(ChannelShuffle, self).__init__() self.prob = prob def apply(self, sample, context=None): transform = iaa.Sometimes(self.prob, iaa.ChannelShuffle(1.0)) if 'mask' in sample.keys(): img, line_strings, seg = transform( image=sample['image'], line_strings=sample['lanes'], segmentation_maps=sample['mask']) 
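            # imgaug returns transformed copies rather than mutating the
            # inputs in place, so write the results back into the sample dict.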
sample['image'] = img sample['lanes'] = line_strings sample['mask'] = seg else: img, line_strings = transform( image=sample['image'], line_strings=sample['lanes']) sample['image'] = img sample['lanes'] = line_strings return sample @register_op class MultiplyAndAddToBrightness(BaseOperator): def __init__(self, mul=(0.85, 1.15), add=(-10, 10), prob=0.5): super(MultiplyAndAddToBrightness, self).__init__() self.mul = tuple(mul) self.add = tuple(add) self.prob = prob def apply(self, sample, context=None): transform = iaa.Sometimes( self.prob, iaa.MultiplyAndAddToBrightness( mul=self.mul, add=self.add)) if 'mask' in sample.keys(): img, line_strings, seg = transform( image=sample['image'], line_strings=sample['lanes'], segmentation_maps=sample['mask']) sample['image'] = img sample['lanes'] = line_strings sample['mask'] = seg else: img, line_strings = transform( image=sample['image'], line_strings=sample['lanes']) sample['image'] = img sample['lanes'] = line_strings return sample @register_op class AddToHueAndSaturation(BaseOperator): def __init__(self, value=(-10, 10), prob=0.5): super(AddToHueAndSaturation, self).__init__() self.value = tuple(value) self.prob = prob def apply(self, sample, context=None): transform = iaa.Sometimes( self.prob, iaa.AddToHueAndSaturation(value=self.value)) if 'mask' in sample.keys(): img, line_strings, seg = transform( image=sample['image'], line_strings=sample['lanes'], segmentation_maps=sample['mask']) sample['image'] = img sample['lanes'] = line_strings sample['mask'] = seg else: img, line_strings = transform( image=sample['image'], line_strings=sample['lanes']) sample['image'] = img sample['lanes'] = line_strings return sample @register_op class OneOfBlur(BaseOperator): def __init__(self, MotionBlur_k=(3, 5), MedianBlur_k=(3, 5), prob=0.5): super(OneOfBlur, self).__init__() self.MotionBlur_k = tuple(MotionBlur_k) self.MedianBlur_k = tuple(MedianBlur_k) self.prob = prob def apply(self, sample, context=None): transform = iaa.Sometimes( self.prob, iaa.OneOf([ iaa.MotionBlur(k=self.MotionBlur_k), iaa.MedianBlur(k=self.MedianBlur_k) ])) if 'mask' in sample.keys(): img, line_strings, seg = transform( image=sample['image'], line_strings=sample['lanes'], segmentation_maps=sample['mask']) sample['image'] = img sample['lanes'] = line_strings sample['mask'] = seg else: img, line_strings = transform( image=sample['image'], line_strings=sample['lanes']) sample['image'] = img sample['lanes'] = line_strings return sample @register_op class CULaneAffine(BaseOperator): def __init__(self, translate_percent_x=(-0.1, 0.1), translate_percent_y=(-0.1, 0.1), rotate=(3, 5), scale=(0.8, 1.2), prob=0.5): super(CULaneAffine, self).__init__() self.translate_percent = { 'x': tuple(translate_percent_x), 'y': tuple(translate_percent_y) } self.rotate = tuple(rotate) self.scale = tuple(scale) self.prob = prob def apply(self, sample, context=None): transform = iaa.Sometimes( self.prob, iaa.Affine( translate_percent=self.translate_percent, rotate=self.rotate, scale=self.scale)) if 'mask' in sample.keys(): img, line_strings, seg = transform( image=sample['image'], line_strings=sample['lanes'], segmentation_maps=sample['mask']) sample['image'] = img sample['lanes'] = line_strings sample['mask'] = seg else: img, line_strings = transform( image=sample['image'], line_strings=sample['lanes']) sample['image'] = img sample['lanes'] = line_strings return sample ================================================ FILE: ppdet/data/transform/gridmask_utils.py ================================================ # 
Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The code is based on: # https://github.com/dvlab-research/GridMask/blob/master/detection_grid/maskrcnn_benchmark/data/transforms/grid.py from __future__ import absolute_import from __future__ import print_function from __future__ import division import numpy as np from PIL import Image class Gridmask(object): def __init__(self, use_h=True, use_w=True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7, upper_iter=360000): super(Gridmask, self).__init__() self.use_h = use_h self.use_w = use_w self.rotate = rotate self.offset = offset self.ratio = ratio self.mode = mode self.prob = prob self.st_prob = prob self.upper_iter = upper_iter def __call__(self, x, curr_iter): self.prob = self.st_prob * min(1, 1.0 * curr_iter / self.upper_iter) if np.random.rand() > self.prob: return x h, w, _ = x.shape hh = int(1.5 * h) ww = int(1.5 * w) d = np.random.randint(2, h) self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1) mask = np.ones((hh, ww), np.float32) st_h = np.random.randint(d) st_w = np.random.randint(d) if self.use_h: for i in range(hh // d): s = d * i + st_h t = min(s + self.l, hh) mask[s:t, :] *= 0 if self.use_w: for i in range(ww // d): s = d * i + st_w t = min(s + self.l, ww) mask[:, s:t] *= 0 r = np.random.randint(self.rotate) mask = Image.fromarray(np.uint8(mask)) mask = mask.rotate(r) mask = np.asarray(mask) mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) // 2 + w].astype(np.float32) if self.mode == 1: mask = 1 - mask mask = np.expand_dims(mask, axis=-1) if self.offset: offset = (2 * (np.random.rand(h, w) - 0.5)).astype(np.float32) x = (x * mask + offset * (1 - mask)).astype(x.dtype) else: x = (x * mask).astype(x.dtype) return x ================================================ FILE: ppdet/data/transform/keypoint_operators.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# function:
#    operators to process sample,
#    eg: decode/resize/crop image

from __future__ import absolute_import

try:
    from collections.abc import Sequence
except Exception:
    from collections import Sequence

import cv2
import numpy as np
import math
import copy

from ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints, get_affine_transform, affine_transform, get_warp_matrix
from ppdet.core.workspace import serializable
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)

registered_ops = []

__all__ = [
    'RandomAffine', 'KeyPointFlip', 'TagGenerate', 'ToHeatmaps',
    'NormalizePermute', 'EvalAffine', 'RandomFlipHalfBodyTransform',
    'TopDownRandomFlip', 'TopDownRandomShiftBboxCenter',
    'TopDownGetRandomScaleRotation', 'TopDownAffine', 'ToHeatmapsTopDown',
    'ToHeatmapsTopDown_DARK', 'ToHeatmapsTopDown_UDP', 'TopDownEvalAffine',
    'AugmentationbyInformantionDropping', 'SinglePoseAffine', 'NoiseJitter',
    'FlipPose', 'PETR_Resize'
]


def register_keypointop(cls):
    return serializable(cls)


@register_keypointop
class KeyPointFlip(object):
    """Flip the image horizontally with probability `flip_prob` and flip the
    keypoint coordinates accordingly. Left and right keypoints must be
    exchanged during the flip, because a right keypoint becomes a left
    keypoint once the image is mirrored.

    Args:
        flip_permutation (list[17]): the left-right exchange order list
            corresponding to [0,1,2,...,16]
        hmsize (list[2]): output heatmap's shape list of different scale
            outputs of higherhrnet
        flip_prob (float): the probability of flipping the image
        records(dict): the dict contained the image, mask and coords

    Returns:
        records(dict): contain the image, mask and coords after transformed
    """

    def __init__(self, flip_permutation, hmsize=None, flip_prob=0.5):
        super(KeyPointFlip, self).__init__()
        assert isinstance(flip_permutation, Sequence)
        self.flip_permutation = flip_permutation
        self.flip_prob = flip_prob
        self.hmsize = hmsize

    def _flipjoints(self, records, sizelst):
        '''
        records['gt_joints'] is Sequence in higherhrnet
        '''
        if not ('gt_joints' in records and len(records['gt_joints']) > 0):
            return records

        kpts_lst = records['gt_joints']
        if isinstance(kpts_lst, Sequence):
            for idx, hmsize in enumerate(sizelst):
                if kpts_lst[idx].ndim == 3:
                    kpts_lst[idx] = kpts_lst[idx][:, self.flip_permutation]
                else:
                    kpts_lst[idx] = kpts_lst[idx][self.flip_permutation]
                kpts_lst[idx][..., 0] = hmsize - kpts_lst[idx][..., 0]
        else:
            hmsize = sizelst[0]
            if kpts_lst.ndim == 3:
                kpts_lst = kpts_lst[:, self.flip_permutation]
            else:
                kpts_lst = kpts_lst[self.flip_permutation]
            kpts_lst[..., 0] = hmsize - kpts_lst[..., 0]

        records['gt_joints'] = kpts_lst
        return records

    def _flipmask(self, records, sizelst):
        if not 'mask' in records:
            return records

        mask_lst = records['mask']
        for idx, hmsize in enumerate(sizelst):
            if len(mask_lst) > idx:
                mask_lst[idx] = mask_lst[idx][:, ::-1]
        records['mask'] = mask_lst
        return records

    def _flipbbox(self, records, sizelst):
        if not 'gt_bbox' in records:
            return records

        bboxes = records['gt_bbox']
        hmsize = sizelst[0]
        bboxes[:, 0::2] = hmsize - bboxes[:, 0::2][:, ::-1]
        bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, hmsize)
        records['gt_bbox'] = bboxes
        return records

    def __call__(self, records):
        flip = np.random.random() < self.flip_prob
        if flip:
            image = records['image']
            image = image[:, ::-1]
            records['image'] = image
            if self.hmsize is None:
                sizelst = [image.shape[1]]
            else:
                sizelst = self.hmsize
            self._flipjoints(records, sizelst)
            self._flipmask(records, sizelst)
            self._flipbbox(records, sizelst)
        return records


@register_keypointop
class RandomAffine(object):
    """apply affine transform to image, mask and coords to achieve the
    rotate, scale and shift effect for the training image

    Args:
        max_degree (float): the max absolute rotate degree to apply,
            transform range is [-max_degree, max_degree]
        scale (list[2]): the scale range to apply, transform range is
            [min, max]
        max_shift (float): the max absolute shift ratio to apply, transform
            range is [-max_shift*imagesize, max_shift*imagesize]
        hmsize (list[2]): output heatmap's shape list of different scale
            outputs of higherhrnet
        trainsize (list[2]): the standard length used to train, the
            'scale_type' side of [h, w] will be resized to trainsize for
            standard
        scale_type (str): the side of [h, w] used for trainsize, chosen
            from 'short', 'long' and 'wh'
        boldervalue (list[3]): the border fill value used when warping the
            image
        records(dict): the dict contained the image, mask and coords

    Returns:
        records(dict): contain the image, mask and coords after transformed
    """

    def __init__(self,
                 max_degree=30,
                 scale=[0.75, 1.5],
                 max_shift=0.2,
                 hmsize=None,
                 trainsize=[512, 512],
                 scale_type='short',
                 boldervalue=[114, 114, 114]):
        super(RandomAffine, self).__init__()
        self.max_degree = max_degree
        self.min_scale = scale[0]
        self.max_scale = scale[1]
        self.max_shift = max_shift
        self.hmsize = hmsize
        self.trainsize = trainsize
        self.scale_type = scale_type
        self.boldervalue = boldervalue

    def _get_affine_matrix_old(self, center, scale, res, rot=0):
        """Generate transformation matrix."""
        h = scale
        t = np.zeros((3, 3), dtype=np.float32)
        t[0, 0] = float(res[1]) / h
        t[1, 1] = float(res[0]) / h
        t[0, 2] = res[1] * (-float(center[0]) / h + .5)
        t[1, 2] = res[0] * (-float(center[1]) / h + .5)
        t[2, 2] = 1
        if rot != 0:
            rot = -rot  # To match direction of rotation from cropping
            rot_mat = np.zeros((3, 3), dtype=np.float32)
            rot_rad = rot * np.pi / 180
            sn, cs = np.sin(rot_rad), np.cos(rot_rad)
            rot_mat[0, :2] = [cs, -sn]
            rot_mat[1, :2] = [sn, cs]
            rot_mat[2, 2] = 1
            # Need to rotate around center
            t_mat = np.eye(3)
            t_mat[0, 2] = -res[1] / 2
            t_mat[1, 2] = -res[0] / 2
            t_inv = t_mat.copy()
            t_inv[:2, 2] *= -1
            t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t)))
        return t

    def _get_affine_matrix(self, center, scale, res, rot=0):
        """Generate transformation matrix."""
        w, h = scale
        t = np.zeros((3, 3), dtype=np.float32)
        t[0, 0] = float(res[0]) / w
        t[1, 1] = float(res[1]) / h
        t[0, 2] = res[0] * (-float(center[0]) / w + .5)
        t[1, 2] = res[1] * (-float(center[1]) / h + .5)
        t[2, 2] = 1
        if rot != 0:
            rot = -rot  # To match direction of rotation from cropping
            rot_mat = np.zeros((3, 3), dtype=np.float32)
            rot_rad = rot * np.pi / 180
            sn, cs = np.sin(rot_rad), np.cos(rot_rad)
            rot_mat[0, :2] = [cs, -sn]
            rot_mat[1, :2] = [sn, cs]
            rot_mat[2, 2] = 1
            # Need to rotate around center
            t_mat = np.eye(3)
            t_mat[0, 2] = -res[0] / 2
            t_mat[1, 2] = -res[1] / 2
            t_inv = t_mat.copy()
            t_inv[:2, 2] *= -1
            t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t)))
        return t

    def _affine_joints_mask(self,
                            degree,
                            center,
                            roi_size,
                            dsize,
                            keypoints=None,
                            heatmap_mask=None,
                            gt_bbox=None):
        kpts = None
        mask = None
        bbox = None
        mask_affine_mat = self._get_affine_matrix(center, roi_size, dsize,
                                                  degree)[:2]
        if heatmap_mask is not None:
            mask = cv2.warpAffine(heatmap_mask, mask_affine_mat, dsize)
            mask = ((mask / 255) > 0.5).astype(np.float32)
        if keypoints is not None:
            kpts = copy.deepcopy(keypoints)
            kpts[..., 0:2] = warp_affine_joints(kpts[..., 0:2].copy(),
                                                mask_affine_mat)
            kpts[(kpts[..., 0]) > dsize[0], :] = 0
            kpts[(kpts[..., 1]) > dsize[1], :] = 0
            kpts[(kpts[..., 0]) < 0, :] = 0
            kpts[(kpts[..., 1]) < 0, :] = 0
        if gt_bbox is not None:
            temp_bbox = gt_bbox[:, [0, 3, 2, 1]]
            cat_bbox = np.concatenate((gt_bbox, temp_bbox), axis=-1)
            gt_bbox_warped = warp_affine_joints(cat_bbox, mask_affine_mat)
            bbox = np.zeros_like(gt_bbox)
            bbox[:, 0] = gt_bbox_warped[:, 0::2].min(1).clip(0, dsize[0])
            bbox[:, 2] = gt_bbox_warped[:, 0::2].max(1).clip(0, dsize[0])
            bbox[:, 1] = gt_bbox_warped[:, 1::2].min(1).clip(0, dsize[1])
            bbox[:, 3] = gt_bbox_warped[:, 1::2].max(1).clip(0, dsize[1])
        return kpts, mask, bbox

    def __call__(self, records):
        image = records['image']
        shape = np.array(image.shape[:2][::-1])
        keypoints = None
        heatmap_mask = None
        gt_bbox = None
        if 'gt_joints' in records:
            keypoints = records['gt_joints']
        if 'mask' in records:
            heatmap_mask = records['mask']
            heatmap_mask *= 255
        if 'gt_bbox' in records:
            gt_bbox = records['gt_bbox']

        degree = (np.random.random() * 2 - 1) * self.max_degree
        center = np.array(shape) / 2
        aug_scale = np.random.random() * (self.max_scale - self.min_scale
                                          ) + self.min_scale
        if self.scale_type == 'long':
            scale = np.array([max(shape[0], shape[1]) / 1.0] * 2)
        elif self.scale_type == 'short':
            scale = np.array([min(shape[0], shape[1]) / 1.0] * 2)
        elif self.scale_type == 'wh':
            scale = shape
        else:
            raise ValueError('Unknown scale type: {}'.format(self.scale_type))
        roi_size = aug_scale * scale
        dx = int(0)
        dy = int(0)
        if self.max_shift > 0:
            dx = np.random.randint(-self.max_shift * roi_size[0],
                                   self.max_shift * roi_size[0])
            dy = np.random.randint(-self.max_shift * roi_size[1],
                                   self.max_shift * roi_size[1])
        center += np.array([dx, dy])
        input_size = 2 * center
        if self.trainsize != -1:
            dsize = self.trainsize
            imgshape = (dsize)
        else:
            dsize = scale
            imgshape = (shape.tolist())

        image_affine_mat = self._get_affine_matrix(center, roi_size, dsize,
                                                   degree)[:2]
        image = cv2.warpAffine(
            image,
            image_affine_mat,
            imgshape,
            flags=cv2.INTER_LINEAR,
            borderValue=self.boldervalue)

        if self.hmsize is None:
            kpts, mask, gt_bbox = self._affine_joints_mask(
                degree, center, roi_size, dsize, keypoints, heatmap_mask,
                gt_bbox)
            records['image'] = image
            if kpts is not None:
                records['gt_joints'] = kpts
            if mask is not None:
                records['mask'] = mask
            if gt_bbox is not None:
                records['gt_bbox'] = gt_bbox
            return records

        kpts_lst = []
        mask_lst = []
        for hmsize in self.hmsize:
            kpts, mask, gt_bbox = self._affine_joints_mask(
                degree, center, roi_size, [hmsize, hmsize], keypoints,
                heatmap_mask, gt_bbox)
            kpts_lst.append(kpts)
            mask_lst.append(mask)
        records['image'] = image
        if 'gt_joints' in records:
            records['gt_joints'] = kpts_lst
        if 'mask' in records:
            records['mask'] = mask_lst
        if 'gt_bbox' in records:
            records['gt_bbox'] = gt_bbox
        return records


@register_keypointop
class EvalAffine(object):
    """apply affine transform to the image, resizing the shorter side of
    [h, w] to the standard size for evaluation

    Args:
        size (int): the standard length used to train, the shorter side of
            [h, w] will be resized to this size
        records(dict): the dict contained the image, mask and coords

    Returns:
        records(dict): contain the image, mask and coords after transformed
    """

    def __init__(self, size, stride=64):
        super(EvalAffine, self).__init__()
        self.size = size
        self.stride = stride

    def __call__(self, records):
        image = records['image']
        mask = records['mask'] if 'mask' in records else None
        s = self.size
        h, w, _ = image.shape
        trans, size_resized = get_affine_mat_kernel(h, w, s, inv=False)
        image_resized = cv2.warpAffine(image, trans, size_resized)
        if mask is not None:
            mask = cv2.warpAffine(mask, trans, size_resized)
            records['mask'] = mask
        if 'gt_joints' in records:
            del records['gt_joints']
        records['image'] = image_resized
        records['scale_factor'] = self.size / min(h, w)
        return records
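# Illustrative usage of the eval-time affine resize above (a sketch, not part
# of the original file); `img` is assumed to be an HWC uint8 array:
#
#     op = EvalAffine(size=512)
#     records = op({'image': img})
#     # records['image'] is warped so its shorter side matches 512, and
#     # records['scale_factor'] == 512 / min(h, w)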
@register_keypointop
class NormalizePermute(object):
    def __init__(self,
                 mean=[123.675, 116.28, 103.53],
                 std=[58.395, 57.120, 57.375],
                 is_scale=True):
        super(NormalizePermute, self).__init__()
        self.mean = mean
        self.std = std
        self.is_scale = is_scale

    def __call__(self, records):
        image = records['image']
        image = image.astype(np.float32)
        if self.is_scale:
            image /= 255.
        image = image.transpose((2, 0, 1))
        mean = np.array(self.mean, dtype=np.float32)
        std = np.array(self.std, dtype=np.float32)
        invstd = 1. / std
        # normalize each channel in place: (channel - mean) * (1 / std)
        for v, m, s in zip(image, mean, invstd):
            v.__isub__(m).__imul__(s)
        records['image'] = image
        return records


@register_keypointop
class TagGenerate(object):
    """record gt coords for aeloss to sample coords value in tagmaps

    Args:
        num_joints (int): the keypoint numbers of dataset to train
        max_people (int): the maximum number of people supported when
            sampling for the associative-embedding loss
        records(dict): the dict contained the image, mask and coords

    Returns:
        records(dict): contain the gt coords used in tagmap
    """

    def __init__(self, num_joints, max_people=30):
        super(TagGenerate, self).__init__()
        self.max_people = max_people
        self.num_joints = num_joints

    def __call__(self, records):
        kpts_lst = records['gt_joints']
        kpts = kpts_lst[0]
        tagmap = np.zeros(
            (self.max_people, self.num_joints, 4), dtype=np.int64)
        inds = np.where(kpts[..., 2] > 0)
        p, j = inds[0], inds[1]
        visible = kpts[inds]
        # tagmap is [p, j, 4], where the last dim is (j, y, x, valid)
        tagmap[p, j, 0] = j
        tagmap[p, j, 1] = visible[..., 1]  # y
        tagmap[p, j, 2] = visible[..., 0]  # x
        tagmap[p, j, 3] = 1
        records['tagmap'] = tagmap
        del records['gt_joints']
        return records


@register_keypointop
class ToHeatmaps(object):
    """generate the gaussian heatmaps of keypoints for the heatmap loss

    Args:
        num_joints (int): the keypoint numbers of dataset to train
        hmsize (list[2]): output heatmap's shape list of different scale
            outputs of higherhrnet
        sigma (float): the std of the generated gaussian kernel
        records(dict): the dict contained the image, mask and coords

    Returns:
        records(dict): contain the heatmaps used for the heatmap loss
    """

    def __init__(self, num_joints, hmsize, sigma=None):
        super(ToHeatmaps, self).__init__()
        self.num_joints = num_joints
        self.hmsize = np.array(hmsize)
        if sigma is None:
            sigma = hmsize[0] // 64
        self.sigma = sigma

        r = 6 * sigma + 3
        x = np.arange(0, r, 1, np.float32)
        y = x[:, None]
        x0, y0 = 3 * sigma + 1, 3 * sigma + 1
        self.gaussian = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2))

    def __call__(self, records):
        kpts_lst = records['gt_joints']
        mask_lst = records['mask']
        for idx, hmsize in enumerate(self.hmsize):
            mask = mask_lst[idx]
            kpts = kpts_lst[idx]
            heatmaps = np.zeros((self.num_joints, hmsize, hmsize))
            inds = np.where(kpts[..., 2] > 0)
            visible = kpts[inds].astype(np.int64)[..., :2]
            ul = np.round(visible - 3 * self.sigma - 1)
            br = np.round(visible + 3 * self.sigma + 2)
            sul = np.maximum(0, -ul)
            sbr = np.minimum(hmsize, br) - ul
            dul = np.clip(ul, 0, hmsize - 1)
            dbr = np.clip(br, 0, hmsize)
            for i in range(len(visible)):
                if visible[i][0] < 0 or visible[i][1] < 0 or \
                        visible[i][0] >= hmsize or visible[i][1] >= hmsize:
                    continue
                dx1, dy1 = dul[i]
                dx2, dy2 = dbr[i]
                sx1, sy1 = sul[i]
                sx2, sy2 = sbr[i]
                heatmaps[inds[1][i], dy1:dy2, dx1:dx2] = np.maximum(
                    self.gaussian[sy1:sy2, sx1:sx2],
                    heatmaps[inds[1][i], dy1:dy2, dx1:dx2])
            records['heatmap_gt{}x'.format(idx + 1)] = heatmaps
            records['mask_{}x'.format(idx + 1)] = mask
        del records['mask']
        return records


@register_keypointop
class RandomFlipHalfBodyTransform(object):
    """apply data augment to image and coords to achieve the flip, scale,
    rotate and half body
transform effect for training image Args: trainsize (list):[w, h], Image target size upper_body_ids (list): The upper body joint ids flip_pairs (list): The left-right joints exchange order list pixel_std (int): The pixel std of the scale scale (float): The scale factor to transform the image rot (int): The rotate factor to transform the image num_joints_half_body (int): The joints threshold of the half body transform prob_half_body (float): The threshold of the half body transform flip (bool): Whether to flip the image Returns: records(dict): contain the image and coords after tranformed """ def __init__(self, trainsize, upper_body_ids, flip_pairs, pixel_std, scale=0.35, rot=40, num_joints_half_body=8, prob_half_body=0.3, flip=True, rot_prob=0.6): super(RandomFlipHalfBodyTransform, self).__init__() self.trainsize = trainsize self.upper_body_ids = upper_body_ids self.flip_pairs = flip_pairs self.pixel_std = pixel_std self.scale = scale self.rot = rot self.num_joints_half_body = num_joints_half_body self.prob_half_body = prob_half_body self.flip = flip self.aspect_ratio = trainsize[0] * 1.0 / trainsize[1] self.rot_prob = rot_prob def halfbody_transform(self, joints, joints_vis): upper_joints = [] lower_joints = [] for joint_id in range(joints.shape[0]): if joints_vis[joint_id][0] > 0: if joint_id in self.upper_body_ids: upper_joints.append(joints[joint_id]) else: lower_joints.append(joints[joint_id]) if np.random.randn() < 0.5 and len(upper_joints) > 2: selected_joints = upper_joints else: selected_joints = lower_joints if len( lower_joints) > 2 else upper_joints if len(selected_joints) < 2: return None, None selected_joints = np.array(selected_joints, dtype=np.float32) center = selected_joints.mean(axis=0)[:2] left_top = np.amin(selected_joints, axis=0) right_bottom = np.amax(selected_joints, axis=0) w = right_bottom[0] - left_top[0] h = right_bottom[1] - left_top[1] if w > self.aspect_ratio * h: h = w * 1.0 / self.aspect_ratio elif w < self.aspect_ratio * h: w = h * self.aspect_ratio scale = np.array( [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std], dtype=np.float32) scale = scale * 1.5 return center, scale def flip_joints(self, joints, joints_vis, width, matched_parts): joints[:, 0] = width - joints[:, 0] - 1 for pair in matched_parts: joints[pair[0], :], joints[pair[1], :] = \ joints[pair[1], :], joints[pair[0], :].copy() joints_vis[pair[0], :], joints_vis[pair[1], :] = \ joints_vis[pair[1], :], joints_vis[pair[0], :].copy() return joints * joints_vis, joints_vis def __call__(self, records): image = records['image'] joints = records['gt_joints'] joints_vis = records['joints_vis'] c = records['center'] s = records['scale'] r = 0 if (np.sum(joints_vis[:, 0]) > self.num_joints_half_body and np.random.rand() < self.prob_half_body): c_half_body, s_half_body = self.halfbody_transform(joints, joints_vis) if c_half_body is not None and s_half_body is not None: c, s = c_half_body, s_half_body sf = self.scale rf = self.rot s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) r = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) if np.random.random() <= self.rot_prob else 0 if self.flip and np.random.random() <= 0.5: image = image[:, ::-1, :] joints, joints_vis = self.flip_joints( joints, joints_vis, image.shape[1], self.flip_pairs) c[0] = image.shape[1] - c[0] - 1 records['image'] = image records['gt_joints'] = joints records['joints_vis'] = joints_vis records['center'] = c records['scale'] = s records['rotate'] = r return records @register_keypointop class 
AugmentationbyInformantionDropping(object):
    """AID: Augmentation by Information Dropping.
    Please refer to https://arxiv.org/abs/2008.07139

    Args:
        prob_cutout (float): The probability of the Cutout augmentation.
        offset_factor (float): Offset factor of cutout center.
        num_patch (int): Number of patches to be cutout.
        records(dict): the dict contained the image and coords

    Returns:
        records (dict): contain the image and coords after transformed
    """

    def __init__(self,
                 trainsize,
                 prob_cutout=0.0,
                 offset_factor=0.2,
                 num_patch=1):
        self.prob_cutout = prob_cutout
        self.offset_factor = offset_factor
        self.num_patch = num_patch
        self.trainsize = trainsize

    def _cutout(self, img, joints, joints_vis):
        height, width, _ = img.shape
        img = img.reshape((height * width, -1))
        feat_x_int = np.arange(0, width)
        feat_y_int = np.arange(0, height)
        feat_x_int, feat_y_int = np.meshgrid(feat_x_int, feat_y_int)
        feat_x_int = feat_x_int.reshape((-1, ))
        feat_y_int = feat_y_int.reshape((-1, ))
        for _ in range(self.num_patch):
            vis_idx, _ = np.where(joints_vis > 0)
            occlusion_joint_id = np.random.choice(vis_idx)
            center = joints[occlusion_joint_id, 0:2]
            offset = np.random.randn(
                2) * self.trainsize[0] * self.offset_factor
            center = center + offset
            radius = np.random.uniform(0.1, 0.2) * self.trainsize[0]
            x_offset = (center[0] - feat_x_int) / radius
            y_offset = (center[1] - feat_y_int) / radius
            dis = x_offset**2 + y_offset**2
            keep_pos = np.where((dis <= 1) & (dis >= 0))[0]
            img[keep_pos, :] = 0
        img = img.reshape((height, width, -1))
        return img

    def __call__(self, records):
        img = records['image']
        joints = records['gt_joints']
        joints_vis = records['joints_vis']
        if np.random.rand() < self.prob_cutout:
            img = self._cutout(img, joints, joints_vis)
        records['image'] = img
        return records


@register_keypointop
class TopDownRandomFlip(object):
    """Data augmentation with random image flip.

    Args:
        flip_perm: (list[tuple]): Pairs of keypoints which are mirrored
            (for example, left ear and right ear).
        flip_prob (float): Probability of flip.
    """

    def __init__(self, flip_perm=[], flip_prob=0.5):
        self.flip_perm = flip_perm
        self.flip_prob = flip_prob

    def flip_joints(self, joints_3d, joints_3d_visible, img_width,
                    flip_pairs):
        assert len(joints_3d) == len(joints_3d_visible)
        assert img_width > 0

        joints_3d_flipped = joints_3d.copy()
        joints_3d_visible_flipped = joints_3d_visible.copy()

        # Swap left-right parts
        for left, right in flip_pairs:
            joints_3d_flipped[left, :] = joints_3d[right, :]
            joints_3d_flipped[right, :] = joints_3d[left, :]
            joints_3d_visible_flipped[left, :] = joints_3d_visible[right, :]
            joints_3d_visible_flipped[right, :] = joints_3d_visible[left, :]

        # Flip horizontally
        joints_3d_flipped[:, 0] = img_width - 1 - joints_3d_flipped[:, 0]
        joints_3d_flipped = joints_3d_flipped * (joints_3d_visible_flipped > 0)

        return joints_3d_flipped, joints_3d_visible_flipped

    def __call__(self, results):
        """Perform data augmentation with random image flip."""
        # flip with probability `flip_prob`, as documented above; otherwise
        # return the results unchanged
        if np.random.rand() > self.flip_prob:
            return results

        img = results['image']
        joints_3d = results['gt_joints']
        joints_3d_visible = results['joints_vis']
        center = results['center']

        # A flag indicating whether the image is flipped,
        # which can be used by child class.
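        # `image` may be a single HWC array or a list of frames; each is
        # flipped along the horizontal (width) axis and the stored center
        # is mirrored to match.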
if not isinstance(img, list): img = img[:, ::-1, :] else: img = [i[:, ::-1, :] for i in img] if not isinstance(img, list): joints_3d, joints_3d_visible = self.flip_joints( joints_3d, joints_3d_visible, img.shape[1], self.flip_perm) center[0] = img.shape[1] - center[0] - 1 else: joints_3d, joints_3d_visible = self.flip_joints( joints_3d, joints_3d_visible, img[0].shape[1], self.flip_perm) center[0] = img[0].shape[1] - center[0] - 1 results['image'] = img results['gt_joints'] = joints_3d results['joints_vis'] = joints_3d_visible results['center'] = center return results @register_keypointop class TopDownRandomShiftBboxCenter(object): """Random shift the bbox center. Args: shift_factor (float): The factor to control the shift range, which is scale*pixel_std*scale_factor. Default: 0.16 shift_prob (float): Probability of applying random shift. Default: 0.3 """ def __init__(self, shift_factor=0.16, shift_prob=0.3): self.shift_factor = shift_factor self.shift_prob = shift_prob def __call__(self, results): center = results['center'] scale = results['scale'] if np.random.rand() < self.shift_prob: center += np.random.uniform( -1, 1, 2) * self.shift_factor * scale * 200.0 results['center'] = center return results @register_keypointop class TopDownGetRandomScaleRotation(object): """Data augmentation with random scaling & rotating. Args: rot_factor (int): Rotating to ``[-2*rot_factor, 2*rot_factor]``. scale_factor (float): Scaling to ``[1-scale_factor, 1+scale_factor]``. rot_prob (float): Probability of random rotation. """ def __init__(self, rot_factor=40, scale_factor=0.5, rot_prob=0.6): self.rot_factor = rot_factor self.scale_factor = scale_factor self.rot_prob = rot_prob def __call__(self, results): """Perform data augmentation with random scaling & rotating.""" s = results['scale'] sf = self.scale_factor rf = self.rot_factor s_factor = np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) s = s * s_factor r_factor = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) r = r_factor if np.random.rand() <= self.rot_prob else 0 results['scale'] = s results['rotate'] = r return results @register_keypointop class TopDownAffine(object): """apply affine transform to image and coords Args: trainsize (list): [w, h], the standard size used to train use_udp (bool): whether to use Unbiased Data Processing. 
    records(dict): the dict containing the image and coords
    Returns:
        records(dict): contain the image and coords after being transformed
    """

    def __init__(self, trainsize, use_udp=False):
        self.trainsize = trainsize
        self.use_udp = use_udp

    def __call__(self, records):
        image = records['image']
        joints = records['gt_joints']
        joints_vis = records['joints_vis']
        rot = records['rotate'] if "rotate" in records else 0
        if self.use_udp:
            trans = get_warp_matrix(
                rot, records['center'] * 2.0,
                [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0],
                records['scale'] * 200.0)
            image = cv2.warpAffine(
                image,
                trans, (int(self.trainsize[0]), int(self.trainsize[1])),
                flags=cv2.INTER_LINEAR)
            joints[:, 0:2] = warp_affine_joints(joints[:, 0:2].copy(), trans)
        else:
            trans = get_affine_transform(records['center'],
                                         records['scale'] * 200, rot,
                                         self.trainsize)
            image = cv2.warpAffine(
                image,
                trans, (int(self.trainsize[0]), int(self.trainsize[1])),
                flags=cv2.INTER_LINEAR)
            # only visible joints are warped; occluded ones keep their values
            for i in range(joints.shape[0]):
                if joints_vis[i, 0] > 0.0:
                    joints[i, 0:2] = affine_transform(joints[i, 0:2], trans)
        records['image'] = image
        records['gt_joints'] = joints
        return records


@register_keypointop
class SinglePoseAffine(object):
    """apply affine transform to image and coords

    Args:
        trainsize (list): [w, h], the standard size used to train
        rotate (list): [prob, range], probability and degree range of the random rotation
        scale (list): [prob, ratio], probability and ratio range of the random scaling
        use_udp (bool): whether to use Unbiased Data Processing.
        records(dict): the dict containing the image and coords

    Returns:
        records(dict): contain the image and coords after being transformed
    """

    def __init__(self,
                 trainsize,
                 rotate=[1.0, 30],
                 scale=[1.0, 0.25],
                 use_udp=False):
        self.trainsize = trainsize
        self.use_udp = use_udp
        self.rot_prob = rotate[0]
        self.rot_range = rotate[1]
        self.scale_prob = scale[0]
        self.scale_ratio = scale[1]

    def __call__(self, records):
        image = records['image']
        if 'joints_2d' in records:
            # the redundant "if 'joints_2d' in records" ternary from the
            # original is dropped: this branch already guarantees the key
            joints = records['joints_2d']
            joints_vis = records['joints_vis'] if 'joints_vis' in records \
                else np.ones((len(joints), 1))
        rot = 0
        s = 1.
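        # --- Added illustrative note (not part of the original method): the
        # two branches below sample rotation and scale from clipped normal
        # distributions, each applied with its own probability. A standalone
        # sketch, assuming the defaults rot_range=30 and scale_ratio=0.25
        # from __init__ above:
        #   >>> import numpy as np
        #   >>> rot_range, scale_ratio = 30, 0.25
        #   >>> r = float(np.clip(np.random.randn() * rot_range,
        #   ...                   -rot_range * 2, rot_range * 2))
        #   >>> s = float(np.clip(np.random.randn() * scale_ratio + 1,
        #   ...                   1 - scale_ratio, 1 + scale_ratio))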
if np.random.random() < self.rot_prob: rot = np.clip(np.random.randn() * self.rot_range, -self.rot_range * 2, self.rot_range * 2) if np.random.random() < self.scale_prob: s = np.clip(np.random.randn() * self.scale_ratio + 1, 1 - self.scale_ratio, 1 + self.scale_ratio) if self.use_udp: trans = get_warp_matrix( rot, np.array(records['bbox_center']) * 2.0, [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], records['bbox_scale'] * 200.0 * s) image = cv2.warpAffine( image, trans, (int(self.trainsize[0]), int(self.trainsize[1])), flags=cv2.INTER_LINEAR) if 'joints_2d' in records: joints[:, 0:2] = warp_affine_joints(joints[:, 0:2].copy(), trans) else: trans = get_affine_transform( np.array(records['bbox_center']), records['bbox_scale'] * s * 200, rot, self.trainsize) image = cv2.warpAffine( image, trans, (int(self.trainsize[0]), int(self.trainsize[1])), flags=cv2.INTER_LINEAR) if 'joints_2d' in records: for i in range(len(joints)): if joints_vis[i, 0] > 0.0: joints[i, 0:2] = affine_transform(joints[i, 0:2], trans) if 'joints_3d' in records: pose3d = records['joints_3d'] if not rot == 0: trans_3djoints = np.eye(3) rot_rad = -rot * np.pi / 180 sn, cs = np.sin(rot_rad), np.cos(rot_rad) trans_3djoints[0, :2] = [cs, -sn] trans_3djoints[1, :2] = [sn, cs] pose3d[:, :3] = np.einsum('ij,kj->ki', trans_3djoints, pose3d[:, :3]) records['joints_3d'] = pose3d records['image'] = image if 'joints_2d' in records: records['joints_2d'] = joints return records @register_keypointop class NoiseJitter(object): """apply NoiseJitter to image Args: noise_factor (float): the noise factor ratio used to generate the jitter Returns: records (dict): contain the image and coords after tranformed """ def __init__(self, noise_factor=0.4): self.noise_factor = noise_factor def __call__(self, records): self.pn = np.random.uniform(1 - self.noise_factor, 1 + self.noise_factor, 3) rgb_img = records['image'] rgb_img[:, :, 0] = np.minimum( 255.0, np.maximum(0.0, rgb_img[:, :, 0] * self.pn[0])) rgb_img[:, :, 1] = np.minimum( 255.0, np.maximum(0.0, rgb_img[:, :, 1] * self.pn[1])) rgb_img[:, :, 2] = np.minimum( 255.0, np.maximum(0.0, rgb_img[:, :, 2] * self.pn[2])) records['image'] = rgb_img return records @register_keypointop class FlipPose(object): """random apply flip to image Args: noise_factor (float): the noise factor ratio used to generate the jitter Returns: records (dict): contain the image and coords after tranformed """ def __init__(self, flip_prob=0.5, img_res=224, num_joints=14): self.flip_pob = flip_prob self.img_res = img_res if num_joints == 24: self.perm = [ 5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13, 14, 15, 16, 17, 18, 19, 21, 20, 23, 22 ] elif num_joints == 14: self.perm = [5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13] else: print("error num_joints in flip :{}".format(num_joints)) def __call__(self, records): if np.random.random() < self.flip_pob: img = records['image'] img = np.fliplr(img) if 'joints_2d' in records: joints_2d = records['joints_2d'] joints_2d = joints_2d[self.perm] joints_2d[:, 0] = self.img_res - joints_2d[:, 0] records['joints_2d'] = joints_2d if 'joints_3d' in records: joints_3d = records['joints_3d'] joints_3d = joints_3d[self.perm] joints_3d[:, 0] = -joints_3d[:, 0] records['joints_3d'] = joints_3d records['image'] = img return records @register_keypointop class TopDownEvalAffine(object): """apply affine transform to image and coords Args: trainsize (list): [w, h], the standard size used to train use_udp (bool): whether to use Unbiased Data Processing. 
records(dict): the dict contained the image and coords Returns: records (dict): contain the image and coords after tranformed """ def __init__(self, trainsize, use_udp=False): self.trainsize = trainsize self.use_udp = use_udp def __call__(self, records): image = records['image'] rot = 0 imshape = records['im_shape'][::-1] center = imshape / 2. scale = imshape if self.use_udp: trans = get_warp_matrix( rot, center * 2.0, [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], scale) image = cv2.warpAffine( image, trans, (int(self.trainsize[0]), int(self.trainsize[1])), flags=cv2.INTER_LINEAR) else: trans = get_affine_transform(center, scale, rot, self.trainsize) image = cv2.warpAffine( image, trans, (int(self.trainsize[0]), int(self.trainsize[1])), flags=cv2.INTER_LINEAR) records['image'] = image return records @register_keypointop class ToHeatmapsTopDown(object): """to generate the gaussin heatmaps of keypoint for heatmap loss Args: hmsize (list): [w, h] output heatmap's size sigma (float): the std of gaussin kernel genereted records(dict): the dict contained the image and coords Returns: records (dict): contain the heatmaps used to heatmaploss """ def __init__(self, hmsize, sigma): super(ToHeatmapsTopDown, self).__init__() self.hmsize = np.array(hmsize) self.sigma = sigma def __call__(self, records): """refer to https://github.com/leoxiaobin/deep-high-resolution-net.pytorch Copyright (c) Microsoft, under the MIT License. """ joints = records['gt_joints'] joints_vis = records['joints_vis'] num_joints = joints.shape[0] image_size = np.array( [records['image'].shape[1], records['image'].shape[0]]) target_weight = np.ones((num_joints, 1), dtype=np.float32) target_weight[:, 0] = joints_vis[:, 0] target = np.zeros( (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32) tmp_size = self.sigma * 3 feat_stride = image_size / self.hmsize for joint_id in range(num_joints): mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5) mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5) # Check that any part of the gaussian is in-bounds ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[ 0] < 0 or br[1] < 0: # If not, just return the image as is target_weight[joint_id] = 0 continue # # Generate gaussian size = 2 * tmp_size + 1 x = np.arange(0, size, 1, np.float32) y = x[:, np.newaxis] x0 = y0 = size // 2 # The gaussian is not normalized, we want the center value to equal 1 g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * self.sigma**2)) # Usable gaussian range g_x = max(0, -ul[0]), min(br[0], self.hmsize[0]) - ul[0] g_y = max(0, -ul[1]), min(br[1], self.hmsize[1]) - ul[1] # Image range img_x = max(0, ul[0]), min(br[0], self.hmsize[0]) img_y = max(0, ul[1]), min(br[1], self.hmsize[1]) v = target_weight[joint_id] if v > 0.5: target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[ 0]:g_y[1], g_x[0]:g_x[1]] records['target'] = target records['target_weight'] = target_weight del records['gt_joints'], records['joints_vis'] return records @register_keypointop class ToHeatmapsTopDown_DARK(object): """to generate the gaussin heatmaps of keypoint for heatmap loss Args: hmsize (list): [w, h] output heatmap's size sigma (float): the std of gaussin kernel genereted records(dict): the dict contained the image and coords Returns: records (dict): contain the heatmaps used to heatmaploss """ def __init__(self, hmsize, sigma): super(ToHeatmapsTopDown_DARK, self).__init__() self.hmsize = 
np.array(hmsize) self.sigma = sigma def __call__(self, records): joints = records['gt_joints'] joints_vis = records['joints_vis'] num_joints = joints.shape[0] image_size = np.array( [records['image'].shape[1], records['image'].shape[0]]) target_weight = np.ones((num_joints, 1), dtype=np.float32) target_weight[:, 0] = joints_vis[:, 0] target = np.zeros( (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32) tmp_size = self.sigma * 3 feat_stride = image_size / self.hmsize for joint_id in range(num_joints): mu_x = joints[joint_id][0] / feat_stride[0] mu_y = joints[joint_id][1] / feat_stride[1] # Check that any part of the gaussian is in-bounds ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[ 0] < 0 or br[1] < 0: # If not, just return the image as is target_weight[joint_id] = 0 continue x = np.arange(0, self.hmsize[0], 1, np.float32) y = np.arange(0, self.hmsize[1], 1, np.float32) y = y[:, np.newaxis] v = target_weight[joint_id] if v > 0.5: target[joint_id] = np.exp(-( (x - mu_x)**2 + (y - mu_y)**2) / (2 * self.sigma**2)) records['target'] = target records['target_weight'] = target_weight del records['gt_joints'], records['joints_vis'] return records @register_keypointop class ToHeatmapsTopDown_UDP(object): """This code is based on: https://github.com/HuangJunJie2017/UDP-Pose/blob/master/deep-high-resolution-net.pytorch/lib/dataset/JointsDataset.py to generate the gaussian heatmaps of keypoint for heatmap loss. ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020). Args: hmsize (list): [w, h] output heatmap's size sigma (float): the std of gaussin kernel genereted records(dict): the dict contained the image and coords Returns: records (dict): contain the heatmaps used to heatmaploss """ def __init__(self, hmsize, sigma): super(ToHeatmapsTopDown_UDP, self).__init__() self.hmsize = np.array(hmsize) self.sigma = sigma def __call__(self, records): joints = records['gt_joints'] joints_vis = records['joints_vis'] num_joints = joints.shape[0] image_size = np.array( [records['image'].shape[1], records['image'].shape[0]]) target_weight = np.ones((num_joints, 1), dtype=np.float32) target_weight[:, 0] = joints_vis[:, 0] target = np.zeros( (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32) tmp_size = self.sigma * 3 size = 2 * tmp_size + 1 x = np.arange(0, size, 1, np.float32) y = x[:, None] feat_stride = (image_size - 1.0) / (self.hmsize - 1.0) for joint_id in range(num_joints): mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5) mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5) # Check that any part of the gaussian is in-bounds ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[ 0] < 0 or br[1] < 0: # If not, just return the image as is target_weight[joint_id] = 0 continue mu_x_ac = joints[joint_id][0] / feat_stride[0] mu_y_ac = joints[joint_id][1] / feat_stride[1] x0 = y0 = size // 2 x0 += mu_x_ac - mu_x y0 += mu_y_ac - mu_y g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * self.sigma**2)) # Usable gaussian range g_x = max(0, -ul[0]), min(br[0], self.hmsize[0]) - ul[0] g_y = max(0, -ul[1]), min(br[1], self.hmsize[1]) - ul[1] # Image range img_x = max(0, ul[0]), min(br[0], self.hmsize[0]) img_y = max(0, ul[1]), min(br[1], self.hmsize[1]) v = target_weight[joint_id] if v > 0.5: 
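                # --- Added illustrative note (not part of the original
                # method): unlike the integer-centered kernel in
                # ToHeatmapsTopDown, the UDP variant above offsets the kernel
                # center x0/y0 by the sub-pixel residual (mu_x_ac - mu_x,
                # mu_y_ac - mu_y), so the heatmap peak encodes the continuous
                # keypoint location rather than the rounded one. Sketch:
                #   >>> mu_x_ac = 7.3              # continuous grid coordinate
                #   >>> mu_x = int(mu_x_ac + 0.5)  # rounded -> 7
                #   >>> round(mu_x_ac - mu_x, 3)   # residual folded into x0
                #   0.3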
target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[ 0]:g_y[1], g_x[0]:g_x[1]] records['target'] = target records['target_weight'] = target_weight del records['gt_joints'], records['joints_vis'] return records from typing import Optional, Tuple, Union, List import numbers def _scale_size( size: Tuple[int, int], scale: Union[float, int, tuple], ) -> Tuple[int, int]: """Rescale a size by a ratio. Args: size (tuple[int]): (w, h). scale (float | tuple(float)): Scaling factor. Returns: tuple[int]: scaled size. """ if isinstance(scale, (float, int)): scale = (scale, scale) w, h = size return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5) def rescale_size(old_size: tuple, scale: Union[float, int, tuple], return_scale: bool=False) -> tuple: """Calculate the new size to be rescaled to. Args: old_size (tuple[int]): The old size (w, h) of image. scale (float | tuple[int]): The scaling factor or maximum size. If it is a float number, then the image will be rescaled by this factor, else if it is a tuple of 2 integers, then the image will be rescaled as large as possible within the scale. return_scale (bool): Whether to return the scaling factor besides the rescaled image size. Returns: tuple[int]: The new rescaled image size. """ w, h = old_size if isinstance(scale, (float, int)): if scale <= 0: raise ValueError(f'Invalid scale {scale}, must be positive.') scale_factor = scale elif isinstance(scale, list): max_long_edge = max(scale) max_short_edge = min(scale) scale_factor = min(max_long_edge / max(h, w), max_short_edge / min(h, w)) else: raise TypeError( f'Scale must be a number or tuple of int, but got {type(scale)}') new_size = _scale_size((w, h), scale_factor) if return_scale: return new_size, scale_factor else: return new_size def imrescale(img: np.ndarray, scale: Union[float, Tuple[int, int]], return_scale: bool=False, interpolation: str='bilinear', backend: Optional[str]=None) -> Union[np.ndarray, Tuple[ np.ndarray, float]]: """Resize image while keeping the aspect ratio. Args: img (ndarray): The input image. scale (float | tuple[int]): The scaling factor or maximum size. If it is a float number, then the image will be rescaled by this factor, else if it is a tuple of 2 integers, then the image will be rescaled as large as possible within the scale. return_scale (bool): Whether to return the scaling factor besides the rescaled image. interpolation (str): Same as :func:`resize`. backend (str | None): Same as :func:`resize`. Returns: ndarray: The rescaled image. """ h, w = img.shape[:2] new_size, scale_factor = rescale_size((w, h), scale, return_scale=True) rescaled_img = imresize( img, new_size, interpolation=interpolation, backend=backend) if return_scale: return rescaled_img, scale_factor else: return rescaled_img def imresize( img: np.ndarray, size: Tuple[int, int], return_scale: bool=False, interpolation: str='bilinear', out: Optional[np.ndarray]=None, backend: Optional[str]=None, interp=cv2.INTER_LINEAR, ) -> Union[Tuple[np.ndarray, float, float], np.ndarray]: """Resize image to a given size. Args: img (ndarray): The input image. size (tuple[int]): Target size (w, h). return_scale (bool): Whether to return `w_scale` and `h_scale`. interpolation (str): Interpolation method, accepted values are "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' backend, "nearest", "bilinear" for 'pillow' backend. out (ndarray): The output destination. backend (str | None): The image resize backend type. Options are `cv2`, `pillow`, `None`. 
If backend is None, the global imread_backend specified by ``mmcv.use_backend()`` will be used. Default: None. Returns: tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or `resized_img`. """ h, w = img.shape[:2] if backend is None: backend = imread_backend if backend not in ['cv2', 'pillow']: raise ValueError(f'backend: {backend} is not supported for resize.' f"Supported backends are 'cv2', 'pillow'") if backend == 'pillow': assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' pil_image = Image.fromarray(img) pil_image = pil_image.resize(size, pillow_interp_codes[interpolation]) resized_img = np.array(pil_image) else: resized_img = cv2.resize(img, size, dst=out, interpolation=interp) if not return_scale: return resized_img else: w_scale = size[0] / w h_scale = size[1] / h return resized_img, w_scale, h_scale class PETR_Resize: """Resize images & bbox & mask. This transform resizes the input image to some scale. Bboxes and masks are then resized with the same scale factor. If the input dict contains the key "scale", then the scale in the input dict is used, otherwise the specified scale in the init method is used. If the input dict contains the key "scale_factor" (if MultiScaleFlipAug does not give img_scale but scale_factor), the actual scale will be computed by image shape and scale_factor. `img_scale` can either be a tuple (single-scale) or a list of tuple (multi-scale). There are 3 multiscale modes: - ``ratio_range is not None``: randomly sample a ratio from the ratio \ range and multiply it with the image scale. - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \ sample a scale from the multiscale range. - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \ sample a scale from multiple scales. Args: img_scale (tuple or list[tuple]): Images scales for resizing. multiscale_mode (str): Either "range" or "value". ratio_range (tuple[float]): (min_ratio, max_ratio) keep_ratio (bool): Whether to keep the aspect ratio when resizing the image. bbox_clip_border (bool, optional): Whether to clip the objects outside the border of the image. In some dataset like MOT17, the gt bboxes are allowed to cross the border of images. Therefore, we don't need to clip the gt bboxes in these cases. Defaults to True. backend (str): Image resize backend, choices are 'cv2' and 'pillow'. These two backends generates slightly different results. Defaults to 'cv2'. interpolation (str): Interpolation method, accepted values are "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' backend, "nearest", "bilinear" for 'pillow' backend. override (bool, optional): Whether to override `scale` and `scale_factor` so as to call resize twice. Default False. If True, after the first resizing, the existed `scale` and `scale_factor` will be ignored so the second resizing can be allowed. This option is a work-around for multiple times of resize in DETR. Defaults to False. 
""" def __init__(self, img_scale=None, multiscale_mode='range', ratio_range=None, keep_ratio=True, bbox_clip_border=True, backend='cv2', interpolation='bilinear', override=False, keypoint_clip_border=True): if img_scale is None: self.img_scale = None else: if isinstance(img_scale, list): self.img_scale = img_scale else: self.img_scale = [img_scale] assert isinstance(self.img_scale, list) if ratio_range is not None: # mode 1: given a scale and a range of image ratio assert len(self.img_scale) == 1 else: # mode 2: given multiple scales or a range of scales assert multiscale_mode in ['value', 'range'] self.backend = backend self.multiscale_mode = multiscale_mode self.ratio_range = ratio_range self.keep_ratio = keep_ratio # TODO: refactor the override option in Resize self.interpolation = interpolation self.override = override self.bbox_clip_border = bbox_clip_border self.keypoint_clip_border = keypoint_clip_border @staticmethod def random_select(img_scales): """Randomly select an img_scale from given candidates. Args: img_scales (list[tuple]): Images scales for selection. Returns: (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, \ where ``img_scale`` is the selected image scale and \ ``scale_idx`` is the selected index in the given candidates. """ assert isinstance(img_scales, list) scale_idx = np.random.randint(len(img_scales)) img_scale = img_scales[scale_idx] return img_scale, scale_idx @staticmethod def random_sample(img_scales): """Randomly sample an img_scale when ``multiscale_mode=='range'``. Args: img_scales (list[tuple]): Images scale range for sampling. There must be two tuples in img_scales, which specify the lower and upper bound of image scales. Returns: (tuple, None): Returns a tuple ``(img_scale, None)``, where \ ``img_scale`` is sampled scale and None is just a placeholder \ to be consistent with :func:`random_select`. """ assert isinstance(img_scales, list) and len(img_scales) == 2 img_scale_long = [max(s) for s in img_scales] img_scale_short = [min(s) for s in img_scales] long_edge = np.random.randint( min(img_scale_long), max(img_scale_long) + 1) short_edge = np.random.randint( min(img_scale_short), max(img_scale_short) + 1) img_scale = (long_edge, short_edge) return img_scale, None @staticmethod def random_sample_ratio(img_scale, ratio_range): """Randomly sample an img_scale when ``ratio_range`` is specified. A ratio will be randomly sampled from the range specified by ``ratio_range``. Then it would be multiplied with ``img_scale`` to generate sampled scale. Args: img_scale (list): Images scale base to multiply with ratio. ratio_range (tuple[float]): The minimum and maximum ratio to scale the ``img_scale``. Returns: (tuple, None): Returns a tuple ``(scale, None)``, where \ ``scale`` is sampled ratio multiplied with ``img_scale`` and \ None is just a placeholder to be consistent with \ :func:`random_select`. """ assert isinstance(img_scale, list) and len(img_scale) == 2 min_ratio, max_ratio = ratio_range assert min_ratio <= max_ratio ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) return scale, None def _random_scale(self, results): """Randomly sample an img_scale according to ``ratio_range`` and ``multiscale_mode``. If ``ratio_range`` is specified, a ratio will be sampled and be multiplied with ``img_scale``. If multiple scales are specified by ``img_scale``, a scale will be sampled according to ``multiscale_mode``. Otherwise, single scale will be used. 
Args: results (dict): Result dict from :obj:`dataset`. Returns: dict: Two new keys 'scale` and 'scale_idx` are added into \ ``results``, which would be used by subsequent pipelines. """ if self.ratio_range is not None: scale, scale_idx = self.random_sample_ratio(self.img_scale[0], self.ratio_range) elif len(self.img_scale) == 1: scale, scale_idx = self.img_scale[0], 0 elif self.multiscale_mode == 'range': scale, scale_idx = self.random_sample(self.img_scale) elif self.multiscale_mode == 'value': scale, scale_idx = self.random_select(self.img_scale) else: raise NotImplementedError results['scale'] = scale results['scale_idx'] = scale_idx def _resize_img(self, results): """Resize images with ``results['scale']``.""" for key in ['image'] if 'image' in results else []: if self.keep_ratio: img, scale_factor = imrescale( results[key], results['scale'], return_scale=True, interpolation=self.interpolation, backend=self.backend) # the w_scale and h_scale has minor difference # a real fix should be done in the imrescale in the future new_h, new_w = img.shape[:2] h, w = results[key].shape[:2] w_scale = new_w / w h_scale = new_h / h else: img, w_scale, h_scale = imresize( results[key], results['scale'], return_scale=True, interpolation=self.interpolation, backend=self.backend) scale_factor = np.array( [w_scale, h_scale, w_scale, h_scale], dtype=np.float32) results['im_shape'] = np.array(img.shape) # in case that there is no padding results['pad_shape'] = img.shape results['scale_factor'] = scale_factor results['keep_ratio'] = self.keep_ratio # img_pad = self.impad(img, shape=results['scale']) results[key] = img def _resize_bboxes(self, results): """Resize bounding boxes with ``results['scale_factor']``.""" for key in ['gt_bbox'] if 'gt_bbox' in results else []: bboxes = results[key] * results['scale_factor'] if self.bbox_clip_border: img_shape = results['im_shape'] bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) results[key] = bboxes def _resize_masks(self, results): """Resize masks with ``results['scale']``""" for key in ['mask'] if 'mask' in results else []: if results[key] is None: continue if self.keep_ratio: results[key] = results[key].rescale(results['scale']) else: results[key] = results[key].resize(results['im_shape'][:2]) def _resize_seg(self, results): """Resize semantic segmentation map with ``results['scale']``.""" for key in ['seg'] if 'seg' in results else []: if self.keep_ratio: gt_seg = imrescale( results[key], results['scale'], interpolation='nearest', backend=self.backend) else: gt_seg = imresize( results[key], results['scale'], interpolation='nearest', backend=self.backend) results[key] = gt_seg def _resize_keypoints(self, results): """Resize keypoints with ``results['scale_factor']``.""" for key in ['gt_joints'] if 'gt_joints' in results else []: keypoints = results[key].copy() keypoints[..., 0] = keypoints[..., 0] * results['scale_factor'][0] keypoints[..., 1] = keypoints[..., 1] * results['scale_factor'][1] if self.keypoint_clip_border: img_shape = results['im_shape'] keypoints[..., 0] = np.clip(keypoints[..., 0], 0, img_shape[1]) keypoints[..., 1] = np.clip(keypoints[..., 1], 0, img_shape[0]) results[key] = keypoints def _resize_areas(self, results): """Resize mask areas with ``results['scale_factor']``.""" for key in ['gt_areas'] if 'gt_areas' in results else []: areas = results[key].copy() areas = areas * results['scale_factor'][0] * results[ 'scale_factor'][1] results[key] = areas def __call__(self, 
results):
        """Call function to resize images, bounding boxes, masks, semantic
        segmentation map.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Resized results; 'im_shape', 'pad_shape', 'scale_factor'
                and 'keep_ratio' keys are added into the result dict.
        """
        if 'scale' not in results:
            if 'scale_factor' in results:
                img_shape = results['image'].shape[:2]
                scale_factor = results['scale_factor'][0]
                # assert isinstance(scale_factor, float)
                results['scale'] = [int(x * scale_factor)
                                    for x in img_shape][::-1]
            else:
                self._random_scale(results)
        else:
            if not self.override:
                assert 'scale_factor' not in results, (
                    "'scale' and 'scale_factor' cannot both be set.")
            else:
                results.pop('scale')
                if 'scale_factor' in results:
                    results.pop('scale_factor')
                self._random_scale(results)

        self._resize_img(results)
        self._resize_bboxes(results)
        self._resize_masks(results)
        self._resize_seg(results)
        self._resize_keypoints(results)
        self._resize_areas(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(img_scale={self.img_scale}, '
        repr_str += f'multiscale_mode={self.multiscale_mode}, '
        repr_str += f'ratio_range={self.ratio_range}, '
        repr_str += f'keep_ratio={self.keep_ratio}, '
        # keep the fields comma-separated and close the parenthesis only once
        repr_str += f'bbox_clip_border={self.bbox_clip_border}, '
        repr_str += f'keypoint_clip_border={self.keypoint_clip_border})'
        return repr_str


================================================
FILE: ppdet/data/transform/keypoints_3d_operators.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import

try:
    from collections.abc import Sequence
except Exception:
    from collections import Sequence

import cv2
import numpy as np
import math
import copy
import random
import uuid
from numbers import Number, Integral

from ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints, get_affine_transform, affine_transform, get_warp_matrix
from ppdet.core.workspace import serializable
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)

registered_ops = []

__all__ = [
    'CropAndFlipImages', 'PermuteImages', 'RandomFlipHalfBody3DTransformImages'
]

import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
from mpl_toolkits.mplot3d import Axes3D


def register_keypointop(cls):
    return serializable(cls)


def register_op(cls):
    registered_ops.append(cls.__name__)
    if not hasattr(BaseOperator, cls.__name__):
        setattr(BaseOperator, cls.__name__, cls)
    else:
        raise KeyError("The {} class has been registered.".format(
            cls.__name__))
    return serializable(cls)


class BaseOperator(object):
    def __init__(self, name=None):
        if name is None:
            name = self.__class__.__name__
        self._id = name + '_' + str(uuid.uuid4())[-6:]

    def apply(self, sample, context=None):
        """ Process a sample.
Args: sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} context (dict): info about this sample processing Returns: result (dict): a processed sample """ return sample def __call__(self, sample, context=None): """ Process a sample. Args: sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} context (dict): info about this sample processing Returns: result (dict): a processed sample """ if isinstance(sample, Sequence): # for batch_size for i in range(len(sample)): sample[i] = self.apply(sample[i], context) else: # image.shape changed sample = self.apply(sample, context) return sample def __str__(self): return str(self._id) @register_keypointop class CropAndFlipImages(object): """Crop all images""" def __init__(self, crop_range, flip_pairs=None): super(CropAndFlipImages, self).__init__() self.crop_range = crop_range self.flip_pairs = flip_pairs def __call__(self, records): # tuple images = records["image"] images = images[:, :, ::-1, :] images = images[:, :, self.crop_range[0]:self.crop_range[1]] records["image"] = images if "kps2d" in records.keys(): kps2d = records["kps2d"] width, height = images.shape[2], images.shape[1] kps2d = np.array(kps2d) kps2d[:, :, 0] = kps2d[:, :, 0] - self.crop_range[0] for pair in self.flip_pairs: kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \ kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy() records["kps2d"] = kps2d return records @register_op class PermuteImages(BaseOperator): def __init__(self): """ Change the channel to be (batch_size, C, H, W) #(6, 3, 1080, 1920) """ super(PermuteImages, self).__init__() def apply(self, sample, context=None): images = sample["image"] images = images.transpose((0, 3, 1, 2)) sample["image"] = images return sample @register_keypointop class RandomFlipHalfBody3DTransformImages(object): """apply data augment to images and coords to achieve the flip, scale, rotate and half body transform effect for training image Args: trainsize (list):[w, h], Image target size upper_body_ids (list): The upper body joint ids flip_pairs (list): The left-right joints exchange order list pixel_std (int): The pixel std of the scale scale (float): The scale factor to transform the image rot (int): The rotate factor to transform the image num_joints_half_body (int): The joints threshold of the half body transform prob_half_body (float): The threshold of the half body transform flip (bool): Whether to flip the image Returns: records(dict): contain the image and coords after tranformed """ def __init__(self, trainsize, upper_body_ids, flip_pairs, pixel_std, scale=0.35, rot=40, num_joints_half_body=8, prob_half_body=0.3, flip=True, rot_prob=0.6, do_occlusion=False): super(RandomFlipHalfBody3DTransformImages, self).__init__() self.trainsize = trainsize self.upper_body_ids = upper_body_ids self.flip_pairs = flip_pairs self.pixel_std = pixel_std self.scale = scale self.rot = rot self.num_joints_half_body = num_joints_half_body self.prob_half_body = prob_half_body self.flip = flip self.aspect_ratio = trainsize[0] * 1.0 / trainsize[1] self.rot_prob = rot_prob self.do_occlusion = do_occlusion def halfbody_transform(self, joints, joints_vis): upper_joints = [] lower_joints = [] for joint_id in range(joints.shape[0]): if joints_vis[joint_id][0] > 0: if joint_id in self.upper_body_ids: upper_joints.append(joints[joint_id]) else: lower_joints.append(joints[joint_id]) if np.random.randn() < 0.5 and len(upper_joints) > 2: selected_joints = upper_joints else: selected_joints = lower_joints if len( lower_joints) > 2 else upper_joints if 
len(selected_joints) < 2: return None, None selected_joints = np.array(selected_joints, dtype=np.float32) center = selected_joints.mean(axis=0)[:2] left_top = np.amin(selected_joints, axis=0) right_bottom = np.amax(selected_joints, axis=0) w = right_bottom[0] - left_top[0] h = right_bottom[1] - left_top[1] if w > self.aspect_ratio * h: h = w * 1.0 / self.aspect_ratio elif w < self.aspect_ratio * h: w = h * self.aspect_ratio scale = np.array( [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std], dtype=np.float32) scale = scale * 1.5 return center, scale def flip_joints(self, joints, joints_vis, width, matched_parts, kps2d=None): # joints: (6, 24, 3),(num_frames, num_joints, 3) joints[:, :, 0] = width - joints[:, :, 0] - 1 # x if kps2d is not None: kps2d[:, :, 0] = width - kps2d[:, :, 0] - 1 for pair in matched_parts: joints[:, pair[0], :], joints[:,pair[1], :] = \ joints[:,pair[1], :], joints[:,pair[0], :].copy() joints_vis[:,pair[0], :], joints_vis[:,pair[1], :] = \ joints_vis[:,pair[1], :], joints_vis[:,pair[0], :].copy() if kps2d is not None: kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \ kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy() # move to zero joints -= joints[:, [0], :] # (batch_size, 24, 3),numpy.ndarray return joints, joints_vis, kps2d def __call__(self, records): images = records[ 'image'] #kps3d, kps3d_vis, images. images.shape(num_frames, width, height, 3) joints = records['kps3d'] joints_vis = records['kps3d_vis'] kps2d = None if 'kps2d' in records.keys(): kps2d = records['kps2d'] if self.flip and np.random.random() <= 0.5: images = images[:, :, ::-1, :] # 图像水平翻转 (6, 1080, 810, 3) joints, joints_vis, kps2d = self.flip_joints( joints, joints_vis, images.shape[2], self.flip_pairs, kps2d) # 关键点左右对称翻转 occlusion = False if self.do_occlusion and random.random() <= 0.5: # 随机遮挡 height = images[0].shape[0] width = images[0].shape[1] occlusion = True while True: area_min = 0.0 area_max = 0.2 synth_area = (random.random() * (area_max - area_min) + area_min) * width * height ratio_min = 0.3 ratio_max = 1 / 0.3 synth_ratio = (random.random() * (ratio_max - ratio_min) + ratio_min) synth_h = math.sqrt(synth_area * synth_ratio) synth_w = math.sqrt(synth_area / synth_ratio) synth_xmin = random.random() * (width - synth_w - 1) synth_ymin = random.random() * (height - synth_h - 1) if synth_xmin >= 0 and synth_ymin >= 0 and synth_xmin + synth_w < width and synth_ymin + synth_h < height: xmin = int(synth_xmin) ymin = int(synth_ymin) w = int(synth_w) h = int(synth_h) mask = np.random.rand(h, w, 3) * 255 images[:, ymin:ymin + h, xmin:xmin + w, :] = mask[ None, :, :, :] break records['image'] = images records['kps3d'] = joints records['kps3d_vis'] = joints_vis if kps2d is not None: records['kps2d'] = kps2d return records ================================================ FILE: ppdet/data/transform/mot_operators.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import absolute_import from __future__ import division from __future__ import print_function try: from collections.abc import Sequence except Exception: from collections import Sequence from numbers import Integral import cv2 import copy import numpy as np import random import math from .operators import BaseOperator, register_op from .batch_operators import Gt2TTFTarget from ppdet.modeling.bbox_utils import bbox_iou_np_expand from ppdet.utils.logger import setup_logger from .op_helper import gaussian_radius logger = setup_logger(__name__) __all__ = [ 'RGBReverse', 'LetterBoxResize', 'MOTRandomAffine', 'Gt2JDETargetThres', 'Gt2JDETargetMax', 'Gt2FairMOTTarget' ] @register_op class RGBReverse(BaseOperator): """RGB to BGR, or BGR to RGB, sensitive to MOTRandomAffine """ def __init__(self): super(RGBReverse, self).__init__() def apply(self, sample, context=None): im = sample['image'] sample['image'] = np.ascontiguousarray(im[:, :, ::-1]) return sample @register_op class LetterBoxResize(BaseOperator): def __init__(self, target_size): """ Resize image to target size, convert normalized xywh to pixel xyxy format ([x_center, y_center, width, height] -> [x0, y0, x1, y1]). Args: target_size (int|list): image target size. """ super(LetterBoxResize, self).__init__() if not isinstance(target_size, (Integral, Sequence)): raise TypeError( "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". format(type(target_size))) if isinstance(target_size, Integral): target_size = [target_size, target_size] self.target_size = target_size def apply_image(self, img, height, width, color=(127.5, 127.5, 127.5)): # letterbox: resize a rectangular image to a padded rectangular shape = img.shape[:2] # [height, width] ratio_h = float(height) / shape[0] ratio_w = float(width) / shape[1] ratio = min(ratio_h, ratio_w) new_shape = (round(shape[1] * ratio), round(shape[0] * ratio)) # [width, height] padw = (width - new_shape[0]) / 2 padh = (height - new_shape[1]) / 2 top, bottom = round(padh - 0.1), round(padh + 0.1) left, right = round(padw - 0.1), round(padw + 0.1) img = cv2.resize( img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border img = cv2.copyMakeBorder( img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded rectangular return img, ratio, padw, padh def apply_bbox(self, bbox0, h, w, ratio, padw, padh): bboxes = bbox0.copy() bboxes[:, 0] = ratio * w * (bbox0[:, 0] - bbox0[:, 2] / 2) + padw bboxes[:, 1] = ratio * h * (bbox0[:, 1] - bbox0[:, 3] / 2) + padh bboxes[:, 2] = ratio * w * (bbox0[:, 0] + bbox0[:, 2] / 2) + padw bboxes[:, 3] = ratio * h * (bbox0[:, 1] + bbox0[:, 3] / 2) + padh return bboxes def apply(self, sample, context=None): """ Resize the image numpy. 
""" im = sample['image'] h, w = sample['im_shape'] if not isinstance(im, np.ndarray): raise TypeError("{}: image type is not numpy.".format(self)) if len(im.shape) != 3: from PIL import UnidentifiedImageError raise UnidentifiedImageError( '{}: image is not 3-dimensional.'.format(self)) # apply image height, width = self.target_size img, ratio, padw, padh = self.apply_image( im, height=height, width=width) sample['image'] = img new_shape = (round(h * ratio), round(w * ratio)) sample['im_shape'] = np.asarray(new_shape, dtype=np.float32) sample['scale_factor'] = np.asarray([ratio, ratio], dtype=np.float32) # apply bbox if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], h, w, ratio, padw, padh) return sample @register_op class MOTRandomAffine(BaseOperator): """ Affine transform to image and coords to achieve the rotate, scale and shift effect for training image. Args: degrees (list[2]): the rotate range to apply, transform range is [min, max] translate (list[2]): the translate range to apply, transform range is [min, max] scale (list[2]): the scale range to apply, transform range is [min, max] shear (list[2]): the shear range to apply, transform range is [min, max] borderValue (list[3]): value used in case of a constant border when appling the perspective transformation reject_outside (bool): reject warped bounding bboxes outside of image Returns: records(dict): contain the image and coords after tranformed """ def __init__(self, degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.50, 1.20), shear=(-2, 2), borderValue=(127.5, 127.5, 127.5), reject_outside=True): super(MOTRandomAffine, self).__init__() self.degrees = degrees self.translate = translate self.scale = scale self.shear = shear self.borderValue = borderValue self.reject_outside = reject_outside def apply(self, sample, context=None): # https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4 border = 0 # width of added border (optional) img = sample['image'] height, width = img.shape[0], img.shape[1] # Rotation and Scale R = np.eye(3) a = random.random() * (self.degrees[1] - self.degrees[0] ) + self.degrees[0] s = random.random() * (self.scale[1] - self.scale[0]) + self.scale[0] R[:2] = cv2.getRotationMatrix2D( angle=a, center=(width / 2, height / 2), scale=s) # Translation T = np.eye(3) T[0, 2] = ( random.random() * 2 - 1 ) * self.translate[0] * height + border # x translation (pixels) T[1, 2] = ( random.random() * 2 - 1 ) * self.translate[1] * width + border # y translation (pixels) # Shear S = np.eye(3) S[0, 1] = math.tan((random.random() * (self.shear[1] - self.shear[0]) + self.shear[0]) * math.pi / 180) # x shear (deg) S[1, 0] = math.tan((random.random() * (self.shear[1] - self.shear[0]) + self.shear[0]) * math.pi / 180) # y shear (deg) M = S @T @R # Combined rotation matrix. ORDER IS IMPORTANT HERE!! 
imw = cv2.warpPerspective( img, M, dsize=(width, height), flags=cv2.INTER_LINEAR, borderValue=self.borderValue) # BGR order borderValue if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: targets = sample['gt_bbox'] n = targets.shape[0] points = targets.copy() area0 = (points[:, 2] - points[:, 0]) * ( points[:, 3] - points[:, 1]) # warp points xy = np.ones((n * 4, 3)) xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( n * 4, 2) # x1y1, x2y2, x1y2, x2y1 xy = (xy @M.T)[:, :2].reshape(n, 8) # create new boxes x = xy[:, [0, 2, 4, 6]] y = xy[:, [1, 3, 5, 7]] xy = np.concatenate( (x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T # apply angle-based reduction radians = a * math.pi / 180 reduction = max(abs(math.sin(radians)), abs(math.cos(radians)))**0.5 x = (xy[:, 2] + xy[:, 0]) / 2 y = (xy[:, 3] + xy[:, 1]) / 2 w = (xy[:, 2] - xy[:, 0]) * reduction h = (xy[:, 3] - xy[:, 1]) * reduction xy = np.concatenate( (x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T # reject warped points outside of image if self.reject_outside: np.clip(xy[:, 0], 0, width, out=xy[:, 0]) np.clip(xy[:, 2], 0, width, out=xy[:, 2]) np.clip(xy[:, 1], 0, height, out=xy[:, 1]) np.clip(xy[:, 3], 0, height, out=xy[:, 3]) w = xy[:, 2] - xy[:, 0] h = xy[:, 3] - xy[:, 1] area = w * h ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16)) i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10) if sum(i) > 0: sample['gt_bbox'] = xy[i].astype(sample['gt_bbox'].dtype) sample['gt_class'] = sample['gt_class'][i] if 'difficult' in sample: sample['difficult'] = sample['difficult'][i] if 'gt_ide' in sample: sample['gt_ide'] = sample['gt_ide'][i] if 'is_crowd' in sample: sample['is_crowd'] = sample['is_crowd'][i] sample['image'] = imw return sample else: return sample @register_op class Gt2JDETargetThres(BaseOperator): __shared__ = ['num_classes'] """ Generate JDE targets by groud truth data when training Args: anchors (list): anchors of JDE model anchor_masks (list): anchor_masks of JDE model downsample_ratios (list): downsample ratios of JDE model ide_thresh (float): thresh of identity, higher is groud truth fg_thresh (float): thresh of foreground, higher is foreground bg_thresh (float): thresh of background, lower is background num_classes (int): number of classes """ def __init__(self, anchors, anchor_masks, downsample_ratios, ide_thresh=0.5, fg_thresh=0.5, bg_thresh=0.4, num_classes=1): super(Gt2JDETargetThres, self).__init__() self.anchors = anchors self.anchor_masks = anchor_masks self.downsample_ratios = downsample_ratios self.ide_thresh = ide_thresh self.fg_thresh = fg_thresh self.bg_thresh = bg_thresh self.num_classes = num_classes def generate_anchor(self, nGh, nGw, anchor_hw): nA = len(anchor_hw) yy, xx = np.meshgrid(np.arange(nGh), np.arange(nGw)) mesh = np.stack([xx.T, yy.T], axis=0) # [2, nGh, nGw] mesh = np.repeat(mesh[None, :], nA, axis=0) # [nA, 2, nGh, nGw] anchor_offset_mesh = anchor_hw[:, :, None][:, :, :, None] anchor_offset_mesh = np.repeat(anchor_offset_mesh, nGh, axis=-2) anchor_offset_mesh = np.repeat(anchor_offset_mesh, nGw, axis=-1) anchor_mesh = np.concatenate( [mesh, anchor_offset_mesh], axis=1) # [nA, 4, nGh, nGw] return anchor_mesh def encode_delta(self, gt_box_list, fg_anchor_list): px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ fg_anchor_list[:, 2], fg_anchor_list[:,3] gx, gy, gw, gh = gt_box_list[:, 0], gt_box_list[:, 1], \ gt_box_list[:, 2], gt_box_list[:, 3] dx = (gx - px) / pw dy = (gy - py) / ph dw = np.log(gw / pw) dh = np.log(gh / ph) return 
np.stack([dx, dy, dw, dh], axis=1) def pad_box(self, sample, num_max): assert 'gt_bbox' in sample bbox = sample['gt_bbox'] gt_num = len(bbox) pad_bbox = np.zeros((num_max, 4), dtype=np.float32) if gt_num > 0: pad_bbox[:gt_num, :] = bbox[:gt_num, :] sample['gt_bbox'] = pad_bbox if 'gt_score' in sample: pad_score = np.zeros((num_max, ), dtype=np.float32) if gt_num > 0: pad_score[:gt_num] = sample['gt_score'][:gt_num, 0] sample['gt_score'] = pad_score if 'difficult' in sample: pad_diff = np.zeros((num_max, ), dtype=np.int32) if gt_num > 0: pad_diff[:gt_num] = sample['difficult'][:gt_num, 0] sample['difficult'] = pad_diff if 'is_crowd' in sample: pad_crowd = np.zeros((num_max, ), dtype=np.int32) if gt_num > 0: pad_crowd[:gt_num] = sample['is_crowd'][:gt_num, 0] sample['is_crowd'] = pad_crowd if 'gt_ide' in sample: pad_ide = np.zeros((num_max, ), dtype=np.int32) if gt_num > 0: pad_ide[:gt_num] = sample['gt_ide'][:gt_num, 0] sample['gt_ide'] = pad_ide return sample def __call__(self, samples, context=None): assert len(self.anchor_masks) == len(self.downsample_ratios), \ "anchor_masks', and 'downsample_ratios' should have same length." h, w = samples[0]['image'].shape[1:3] num_max = 0 for sample in samples: num_max = max(num_max, len(sample['gt_bbox'])) for sample in samples: gt_bbox = sample['gt_bbox'] gt_ide = sample['gt_ide'] for i, (anchor_hw, downsample_ratio ) in enumerate(zip(self.anchors, self.downsample_ratios)): anchor_hw = np.array( anchor_hw, dtype=np.float32) / downsample_ratio nA = len(anchor_hw) nGh, nGw = int(h / downsample_ratio), int(w / downsample_ratio) tbox = np.zeros((nA, nGh, nGw, 4), dtype=np.float32) tconf = np.zeros((nA, nGh, nGw), dtype=np.float32) tid = -np.ones((nA, nGh, nGw, 1), dtype=np.float32) gxy, gwh = gt_bbox[:, 0:2].copy(), gt_bbox[:, 2:4].copy() gxy[:, 0] = gxy[:, 0] * nGw gxy[:, 1] = gxy[:, 1] * nGh gwh[:, 0] = gwh[:, 0] * nGw gwh[:, 1] = gwh[:, 1] * nGh gxy[:, 0] = np.clip(gxy[:, 0], 0, nGw - 1) gxy[:, 1] = np.clip(gxy[:, 1], 0, nGh - 1) tboxes = np.concatenate([gxy, gwh], axis=1) anchor_mesh = self.generate_anchor(nGh, nGw, anchor_hw) anchor_list = np.transpose(anchor_mesh, (0, 2, 3, 1)).reshape(-1, 4) iou_pdist = bbox_iou_np_expand( anchor_list, tboxes, x1y1x2y2=False) iou_max = np.max(iou_pdist, axis=1) max_gt_index = np.argmax(iou_pdist, axis=1) iou_map = iou_max.reshape(nA, nGh, nGw) gt_index_map = max_gt_index.reshape(nA, nGh, nGw) id_index = iou_map > self.ide_thresh fg_index = iou_map > self.fg_thresh bg_index = iou_map < self.bg_thresh ign_index = (iou_map < self.fg_thresh) * ( iou_map > self.bg_thresh) tconf[fg_index] = 1 tconf[bg_index] = 0 tconf[ign_index] = -1 gt_index = gt_index_map[fg_index] gt_box_list = tboxes[gt_index] gt_id_list = gt_ide[gt_index_map[id_index]] if np.sum(fg_index) > 0: tid[id_index] = gt_id_list fg_anchor_list = anchor_list.reshape(nA, nGh, nGw, 4)[fg_index] delta_target = self.encode_delta(gt_box_list, fg_anchor_list) tbox[fg_index] = delta_target sample['tbox{}'.format(i)] = tbox sample['tconf{}'.format(i)] = tconf sample['tide{}'.format(i)] = tid sample.pop('gt_class') sample = self.pad_box(sample, num_max) return samples @register_op class Gt2JDETargetMax(BaseOperator): __shared__ = ['num_classes'] """ Generate JDE targets by groud truth data when evaluating Args: anchors (list): anchors of JDE model anchor_masks (list): anchor_masks of JDE model downsample_ratios (list): downsample ratios of JDE model max_iou_thresh (float): iou thresh for high quality anchor num_classes (int): number of classes """ def 
__init__(self, anchors, anchor_masks, downsample_ratios, max_iou_thresh=0.60, num_classes=1): super(Gt2JDETargetMax, self).__init__() self.anchors = anchors self.anchor_masks = anchor_masks self.downsample_ratios = downsample_ratios self.max_iou_thresh = max_iou_thresh self.num_classes = num_classes def __call__(self, samples, context=None): assert len(self.anchor_masks) == len(self.downsample_ratios), \ "anchor_masks', and 'downsample_ratios' should have same length." h, w = samples[0]['image'].shape[1:3] for sample in samples: gt_bbox = sample['gt_bbox'] gt_ide = sample['gt_ide'] for i, (anchor_hw, downsample_ratio ) in enumerate(zip(self.anchors, self.downsample_ratios)): anchor_hw = np.array( anchor_hw, dtype=np.float32) / downsample_ratio nA = len(anchor_hw) nGh, nGw = int(h / downsample_ratio), int(w / downsample_ratio) tbox = np.zeros((nA, nGh, nGw, 4), dtype=np.float32) tconf = np.zeros((nA, nGh, nGw), dtype=np.float32) tid = -np.ones((nA, nGh, nGw, 1), dtype=np.float32) gxy, gwh = gt_bbox[:, 0:2].copy(), gt_bbox[:, 2:4].copy() gxy[:, 0] = gxy[:, 0] * nGw gxy[:, 1] = gxy[:, 1] * nGh gwh[:, 0] = gwh[:, 0] * nGw gwh[:, 1] = gwh[:, 1] * nGh gi = np.clip(gxy[:, 0], 0, nGw - 1).astype(int) gj = np.clip(gxy[:, 1], 0, nGh - 1).astype(int) # iou of targets-anchors (using wh only) box1 = gwh box2 = anchor_hw[:, None, :] inter_area = np.minimum(box1, box2).prod(2) iou = inter_area / ( box1.prod(1) + box2.prod(2) - inter_area + 1e-16) # Select best iou_pred and anchor iou_best = iou.max(0) # best anchor [0-2] for each target a = np.argmax(iou, axis=0) # Select best unique target-anchor combinations iou_order = np.argsort(-iou_best) # best to worst # Unique anchor selection u = np.stack((gi, gj, a), 0)[:, iou_order] _, first_unique = np.unique(u, axis=1, return_index=True) mask = iou_order[first_unique] # best anchor must share significant commonality (iou) with target # TODO: examine arbitrary threshold idx = mask[iou_best[mask] > self.max_iou_thresh] if len(idx) > 0: a_i, gj_i, gi_i = a[idx], gj[idx], gi[idx] t_box = gt_bbox[idx] t_id = gt_ide[idx] if len(t_box.shape) == 1: t_box = t_box.reshape(1, 4) gxy, gwh = t_box[:, 0:2].copy(), t_box[:, 2:4].copy() gxy[:, 0] = gxy[:, 0] * nGw gxy[:, 1] = gxy[:, 1] * nGh gwh[:, 0] = gwh[:, 0] * nGw gwh[:, 1] = gwh[:, 1] * nGh # XY coordinates tbox[:, :, :, 0:2][a_i, gj_i, gi_i] = gxy - gxy.astype(int) # Width and height in yolo method tbox[:, :, :, 2:4][a_i, gj_i, gi_i] = np.log(gwh / anchor_hw[a_i]) tconf[a_i, gj_i, gi_i] = 1 tid[a_i, gj_i, gi_i] = t_id sample['tbox{}'.format(i)] = tbox sample['tconf{}'.format(i)] = tconf sample['tide{}'.format(i)] = tid class Gt2FairMOTTarget(Gt2TTFTarget): __shared__ = ['num_classes'] """ Generate FairMOT targets by ground truth data. Difference between Gt2FairMOTTarget and Gt2TTFTarget are: 1. the gaussian kernal radius to generate a heatmap. 2. the targets needed during training. Args: num_classes(int): the number of classes. down_ratio(int): the down ratio from images to heatmap, 4 by default. max_objs(int): the maximum number of ground truth objects in a image, 500 by default. 
""" def __init__(self, num_classes=1, down_ratio=4, max_objs=500): super(Gt2TTFTarget, self).__init__() self.down_ratio = down_ratio self.num_classes = num_classes self.max_objs = max_objs def __call__(self, samples, context=None): for b_id, sample in enumerate(samples): output_h = sample['image'].shape[1] // self.down_ratio output_w = sample['image'].shape[2] // self.down_ratio heatmap = np.zeros( (self.num_classes, output_h, output_w), dtype='float32') bbox_size = np.zeros((self.max_objs, 4), dtype=np.float32) center_offset = np.zeros((self.max_objs, 2), dtype=np.float32) index = np.zeros((self.max_objs, ), dtype=np.int64) index_mask = np.zeros((self.max_objs, ), dtype=np.int32) reid = np.zeros((self.max_objs, ), dtype=np.int64) bbox_xys = np.zeros((self.max_objs, 4), dtype=np.float32) if self.num_classes > 1: # each category corresponds to a set of track ids cls_tr_ids = np.zeros( (self.num_classes, output_h, output_w), dtype=np.int64) cls_id_map = np.full((output_h, output_w), -1, dtype=np.int64) gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] gt_ide = sample['gt_ide'] for k in range(len(gt_bbox)): cls_id = gt_class[k][0] bbox = gt_bbox[k] ide = gt_ide[k][0] bbox[[0, 2]] = bbox[[0, 2]] * output_w bbox[[1, 3]] = bbox[[1, 3]] * output_h bbox_amodal = copy.deepcopy(bbox) bbox_amodal[0] = bbox_amodal[0] - bbox_amodal[2] / 2. bbox_amodal[1] = bbox_amodal[1] - bbox_amodal[3] / 2. bbox_amodal[2] = bbox_amodal[0] + bbox_amodal[2] bbox_amodal[3] = bbox_amodal[1] + bbox_amodal[3] bbox[0] = np.clip(bbox[0], 0, output_w - 1) bbox[1] = np.clip(bbox[1], 0, output_h - 1) h = bbox[3] w = bbox[2] bbox_xy = copy.deepcopy(bbox) bbox_xy[0] = bbox_xy[0] - bbox_xy[2] / 2 bbox_xy[1] = bbox_xy[1] - bbox_xy[3] / 2 bbox_xy[2] = bbox_xy[0] + bbox_xy[2] bbox_xy[3] = bbox_xy[1] + bbox_xy[3] if h > 0 and w > 0: radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7) radius = max(0, int(radius)) ct = np.array([bbox[0], bbox[1]], dtype=np.float32) ct_int = ct.astype(np.int32) self.draw_truncate_gaussian(heatmap[cls_id], ct_int, radius, radius) bbox_size[k] = ct[0] - bbox_amodal[0], ct[1] - bbox_amodal[1], \ bbox_amodal[2] - ct[0], bbox_amodal[3] - ct[1] index[k] = ct_int[1] * output_w + ct_int[0] center_offset[k] = ct - ct_int index_mask[k] = 1 reid[k] = ide bbox_xys[k] = bbox_xy if self.num_classes > 1: cls_id_map[ct_int[1], ct_int[0]] = cls_id cls_tr_ids[cls_id][ct_int[1]][ct_int[0]] = ide - 1 # track id start from 0 sample['heatmap'] = heatmap sample['index'] = index sample['offset'] = center_offset sample['size'] = bbox_size sample['index_mask'] = index_mask sample['reid'] = reid if self.num_classes > 1: sample['cls_id_map'] = cls_id_map sample['cls_tr_ids'] = cls_tr_ids sample['bbox_xys'] = bbox_xys sample.pop('is_crowd', None) sample.pop('difficult', None) sample.pop('gt_class', None) sample.pop('gt_bbox', None) sample.pop('gt_score', None) sample.pop('gt_ide', None) return samples ================================================ FILE: ppdet/data/transform/op_helper.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # this file contains helper methods for BBOX processing from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import random import math import cv2 def meet_emit_constraint(src_bbox, sample_bbox): center_x = (src_bbox[2] + src_bbox[0]) / 2 center_y = (src_bbox[3] + src_bbox[1]) / 2 if center_x >= sample_bbox[0] and \ center_x <= sample_bbox[2] and \ center_y >= sample_bbox[1] and \ center_y <= sample_bbox[3]: return True return False def clip_bbox(src_bbox): src_bbox[0] = max(min(src_bbox[0], 1.0), 0.0) src_bbox[1] = max(min(src_bbox[1], 1.0), 0.0) src_bbox[2] = max(min(src_bbox[2], 1.0), 0.0) src_bbox[3] = max(min(src_bbox[3], 1.0), 0.0) return src_bbox def bbox_area(src_bbox): if src_bbox[2] < src_bbox[0] or src_bbox[3] < src_bbox[1]: return 0. else: width = src_bbox[2] - src_bbox[0] height = src_bbox[3] - src_bbox[1] return width * height def is_overlap(object_bbox, sample_bbox): if object_bbox[0] >= sample_bbox[2] or \ object_bbox[2] <= sample_bbox[0] or \ object_bbox[1] >= sample_bbox[3] or \ object_bbox[3] <= sample_bbox[1]: return False else: return True def filter_and_process(sample_bbox, bboxes, labels, scores=None, keypoints=None): new_bboxes = [] new_labels = [] new_scores = [] new_keypoints = [] new_kp_ignore = [] for i in range(len(bboxes)): new_bbox = [0, 0, 0, 0] obj_bbox = [bboxes[i][0], bboxes[i][1], bboxes[i][2], bboxes[i][3]] if not meet_emit_constraint(obj_bbox, sample_bbox): continue if not is_overlap(obj_bbox, sample_bbox): continue sample_width = sample_bbox[2] - sample_bbox[0] sample_height = sample_bbox[3] - sample_bbox[1] new_bbox[0] = (obj_bbox[0] - sample_bbox[0]) / sample_width new_bbox[1] = (obj_bbox[1] - sample_bbox[1]) / sample_height new_bbox[2] = (obj_bbox[2] - sample_bbox[0]) / sample_width new_bbox[3] = (obj_bbox[3] - sample_bbox[1]) / sample_height new_bbox = clip_bbox(new_bbox) if bbox_area(new_bbox) > 0: new_bboxes.append(new_bbox) new_labels.append([labels[i][0]]) if scores is not None: new_scores.append([scores[i][0]]) if keypoints is not None: sample_keypoint = keypoints[0][i] for j in range(len(sample_keypoint)): kp_len = sample_height if j % 2 else sample_width sample_coord = sample_bbox[1] if j % 2 else sample_bbox[0] sample_keypoint[j] = ( sample_keypoint[j] - sample_coord) / kp_len sample_keypoint[j] = max(min(sample_keypoint[j], 1.0), 0.0) new_keypoints.append(sample_keypoint) new_kp_ignore.append(keypoints[1][i]) bboxes = np.array(new_bboxes) labels = np.array(new_labels) scores = np.array(new_scores) if keypoints is not None: keypoints = np.array(new_keypoints) new_kp_ignore = np.array(new_kp_ignore) return bboxes, labels, scores, (keypoints, new_kp_ignore) return bboxes, labels, scores def bbox_area_sampling(bboxes, labels, scores, target_size, min_size): new_bboxes = [] new_labels = [] new_scores = [] for i, bbox in enumerate(bboxes): w = float((bbox[2] - bbox[0]) * target_size) h = float((bbox[3] - bbox[1]) * target_size) if w * h < float(min_size * min_size): continue else: new_bboxes.append(bbox) new_labels.append(labels[i]) if scores is not None and scores.size != 
0: new_scores.append(scores[i]) bboxes = np.array(new_bboxes) labels = np.array(new_labels) scores = np.array(new_scores) return bboxes, labels, scores def generate_sample_bbox(sampler): scale = np.random.uniform(sampler[2], sampler[3]) aspect_ratio = np.random.uniform(sampler[4], sampler[5]) aspect_ratio = max(aspect_ratio, (scale**2.0)) aspect_ratio = min(aspect_ratio, 1 / (scale**2.0)) bbox_width = scale * (aspect_ratio**0.5) bbox_height = scale / (aspect_ratio**0.5) xmin_bound = 1 - bbox_width ymin_bound = 1 - bbox_height xmin = np.random.uniform(0, xmin_bound) ymin = np.random.uniform(0, ymin_bound) xmax = xmin + bbox_width ymax = ymin + bbox_height sampled_bbox = [xmin, ymin, xmax, ymax] return sampled_bbox def generate_sample_bbox_square(sampler, image_width, image_height): scale = np.random.uniform(sampler[2], sampler[3]) aspect_ratio = np.random.uniform(sampler[4], sampler[5]) aspect_ratio = max(aspect_ratio, (scale**2.0)) aspect_ratio = min(aspect_ratio, 1 / (scale**2.0)) bbox_width = scale * (aspect_ratio**0.5) bbox_height = scale / (aspect_ratio**0.5) if image_height < image_width: bbox_width = bbox_height * image_height / image_width else: bbox_height = bbox_width * image_width / image_height xmin_bound = 1 - bbox_width ymin_bound = 1 - bbox_height xmin = np.random.uniform(0, xmin_bound) ymin = np.random.uniform(0, ymin_bound) xmax = xmin + bbox_width ymax = ymin + bbox_height sampled_bbox = [xmin, ymin, xmax, ymax] return sampled_bbox def data_anchor_sampling(bbox_labels, image_width, image_height, scale_array, resize_width): num_gt = len(bbox_labels) # np.random.randint range: [low, high) rand_idx = np.random.randint(0, num_gt) if num_gt != 0 else 0 if num_gt != 0: norm_xmin = bbox_labels[rand_idx][0] norm_ymin = bbox_labels[rand_idx][1] norm_xmax = bbox_labels[rand_idx][2] norm_ymax = bbox_labels[rand_idx][3] xmin = norm_xmin * image_width ymin = norm_ymin * image_height wid = image_width * (norm_xmax - norm_xmin) hei = image_height * (norm_ymax - norm_ymin) range_size = 0 area = wid * hei for scale_ind in range(0, len(scale_array) - 1): if area > scale_array[scale_ind] ** 2 and area < \ scale_array[scale_ind + 1] ** 2: range_size = scale_ind + 1 break if area > scale_array[len(scale_array) - 2]**2: range_size = len(scale_array) - 2 scale_choose = 0.0 if range_size == 0: rand_idx_size = 0 else: # np.random.randint range: [low, high) rng_rand_size = np.random.randint(0, range_size + 1) rand_idx_size = rng_rand_size % (range_size + 1) if rand_idx_size == range_size: min_resize_val = scale_array[rand_idx_size] / 2.0 max_resize_val = min(2.0 * scale_array[rand_idx_size], 2 * math.sqrt(wid * hei)) scale_choose = random.uniform(min_resize_val, max_resize_val) else: min_resize_val = scale_array[rand_idx_size] / 2.0 max_resize_val = 2.0 * scale_array[rand_idx_size] scale_choose = random.uniform(min_resize_val, max_resize_val) sample_bbox_size = wid * resize_width / scale_choose w_off_orig = 0.0 h_off_orig = 0.0 if sample_bbox_size < max(image_height, image_width): if wid <= sample_bbox_size: w_off_orig = np.random.uniform(xmin + wid - sample_bbox_size, xmin) else: w_off_orig = np.random.uniform(xmin, xmin + wid - sample_bbox_size) if hei <= sample_bbox_size: h_off_orig = np.random.uniform(ymin + hei - sample_bbox_size, ymin) else: h_off_orig = np.random.uniform(ymin, ymin + hei - sample_bbox_size) else: w_off_orig = np.random.uniform(image_width - sample_bbox_size, 0.0) h_off_orig = np.random.uniform(image_height - sample_bbox_size, 0.0) w_off_orig = math.floor(w_off_orig) 
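        # (both offsets are floored to whole pixels before being normalized
        # by the image size below)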
h_off_orig = math.floor(h_off_orig) # Figure out top left coordinates. w_off = float(w_off_orig / image_width) h_off = float(h_off_orig / image_height) sampled_bbox = [ w_off, h_off, w_off + float(sample_bbox_size / image_width), h_off + float(sample_bbox_size / image_height) ] return sampled_bbox else: return 0 def jaccard_overlap(sample_bbox, object_bbox): if sample_bbox[0] >= object_bbox[2] or \ sample_bbox[2] <= object_bbox[0] or \ sample_bbox[1] >= object_bbox[3] or \ sample_bbox[3] <= object_bbox[1]: return 0 intersect_xmin = max(sample_bbox[0], object_bbox[0]) intersect_ymin = max(sample_bbox[1], object_bbox[1]) intersect_xmax = min(sample_bbox[2], object_bbox[2]) intersect_ymax = min(sample_bbox[3], object_bbox[3]) intersect_size = (intersect_xmax - intersect_xmin) * ( intersect_ymax - intersect_ymin) sample_bbox_size = bbox_area(sample_bbox) object_bbox_size = bbox_area(object_bbox) overlap = intersect_size / ( sample_bbox_size + object_bbox_size - intersect_size) return overlap def intersect_bbox(bbox1, bbox2): if bbox2[0] > bbox1[2] or bbox2[2] < bbox1[0] or \ bbox2[1] > bbox1[3] or bbox2[3] < bbox1[1]: intersection_box = [0.0, 0.0, 0.0, 0.0] else: intersection_box = [ max(bbox1[0], bbox2[0]), max(bbox1[1], bbox2[1]), min(bbox1[2], bbox2[2]), min(bbox1[3], bbox2[3]) ] return intersection_box def bbox_coverage(bbox1, bbox2): inter_box = intersect_bbox(bbox1, bbox2) intersect_size = bbox_area(inter_box) if intersect_size > 0: bbox1_size = bbox_area(bbox1) return intersect_size / bbox1_size else: return 0. def satisfy_sample_constraint(sampler, sample_bbox, gt_bboxes, satisfy_all=False): if sampler[6] == 0 and sampler[7] == 0: return True satisfied = [] for i in range(len(gt_bboxes)): object_bbox = [ gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3] ] overlap = jaccard_overlap(sample_bbox, object_bbox) if sampler[6] != 0 and \ overlap < sampler[6]: satisfied.append(False) continue if sampler[7] != 0 and \ overlap > sampler[7]: satisfied.append(False) continue satisfied.append(True) if not satisfy_all: return True if satisfy_all: return np.all(satisfied) else: return False def satisfy_sample_constraint_coverage(sampler, sample_bbox, gt_bboxes): if sampler[6] == 0 and sampler[7] == 0: has_jaccard_overlap = False else: has_jaccard_overlap = True if sampler[8] == 0 and sampler[9] == 0: has_object_coverage = False else: has_object_coverage = True if not has_jaccard_overlap and not has_object_coverage: return True found = False for i in range(len(gt_bboxes)): object_bbox = [ gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3] ] if has_jaccard_overlap: overlap = jaccard_overlap(sample_bbox, object_bbox) if sampler[6] != 0 and \ overlap < sampler[6]: continue if sampler[7] != 0 and \ overlap > sampler[7]: continue found = True if has_object_coverage: object_coverage = bbox_coverage(object_bbox, sample_bbox) if sampler[8] != 0 and \ object_coverage < sampler[8]: continue if sampler[9] != 0 and \ object_coverage > sampler[9]: continue found = True if found: return True return found def crop_image_sampling(img, sample_bbox, image_width, image_height, target_size): # no clipping here xmin = int(sample_bbox[0] * image_width) xmax = int(sample_bbox[2] * image_width) ymin = int(sample_bbox[1] * image_height) ymax = int(sample_bbox[3] * image_height) w_off = xmin h_off = ymin width = xmax - xmin height = ymax - ymin cross_xmin = max(0.0, float(w_off)) cross_ymin = max(0.0, float(h_off)) cross_xmax = min(float(w_off + width - 1.0), float(image_width)) cross_ymax = 
min(float(h_off + height - 1.0), float(image_height)) cross_width = cross_xmax - cross_xmin cross_height = cross_ymax - cross_ymin roi_xmin = 0 if w_off >= 0 else abs(w_off) roi_ymin = 0 if h_off >= 0 else abs(h_off) roi_width = cross_width roi_height = cross_height roi_y1 = int(roi_ymin) roi_y2 = int(roi_ymin + roi_height) roi_x1 = int(roi_xmin) roi_x2 = int(roi_xmin + roi_width) cross_y1 = int(cross_ymin) cross_y2 = int(cross_ymin + cross_height) cross_x1 = int(cross_xmin) cross_x2 = int(cross_xmin + cross_width) sample_img = np.zeros((height, width, 3)) sample_img[roi_y1: roi_y2, roi_x1: roi_x2] = \ img[cross_y1: cross_y2, cross_x1: cross_x2] sample_img = cv2.resize( sample_img, (target_size, target_size), interpolation=cv2.INTER_AREA) return sample_img def is_poly(segm): assert isinstance(segm, (list, dict)), \ "Invalid segm type: {}".format(type(segm)) return isinstance(segm, list) def gaussian_radius(bbox_size, min_overlap): height, width = bbox_size a1 = 1 b1 = (height + width) c1 = width * height * (1 - min_overlap) / (1 + min_overlap) sq1 = np.sqrt(b1**2 - 4 * a1 * c1) radius1 = (b1 + sq1) / (2 * a1) a2 = 4 b2 = 2 * (height + width) c2 = (1 - min_overlap) * width * height sq2 = np.sqrt(b2**2 - 4 * a2 * c2) radius2 = (b2 + sq2) / 2 a3 = 4 * min_overlap b3 = -2 * min_overlap * (height + width) c3 = (min_overlap - 1) * width * height sq3 = np.sqrt(b3**2 - 4 * a3 * c3) radius3 = (b3 + sq3) / 2 return min(radius1, radius2, radius3) def draw_gaussian(heatmap, center, radius, k=1, delte=6): diameter = 2 * radius + 1 sigma = diameter / delte gaussian = gaussian2D((diameter, diameter), sigma_x=sigma, sigma_y=sigma) x, y = center height, width = heatmap.shape[0:2] left, right = min(x, radius), min(width - x, radius + 1) top, bottom = min(y, radius), min(height - y, radius + 1) masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: radius + right] np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) def gaussian2D(shape, sigma_x=1, sigma_y=1): m, n = [(ss - 1.) / 2. for ss in shape] y, x = np.ogrid[-m:m + 1, -n:n + 1] h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y * sigma_y))) h[h < np.finfo(h.dtype).eps * h.max()] = 0 return h def draw_umich_gaussian(heatmap, center, radius, k=1): """ draw_umich_gaussian, refer to https://github.com/xingyizhou/CenterNet/blob/master/src/lib/utils/image.py#L126 """ diameter = 2 * radius + 1 gaussian = gaussian2D( (diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6) x, y = int(center[0]), int(center[1]) height, width = heatmap.shape[0:2] left, right = min(x, radius), min(width - x, radius + 1) top, bottom = min(y, radius), min(height - y, radius + 1) masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: radius + right] if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) return heatmap def get_border(border, size): i = 1 while size - border // i <= border // i: i *= 2 return border // i ================================================ FILE: ppdet/data/transform/operators.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # function: # operators to process sample, # eg: decode/resize/crop image from __future__ import absolute_import from __future__ import print_function from __future__ import division try: from collections.abc import Sequence except Exception: from collections import Sequence from numbers import Number, Integral import uuid import random import math import numpy as np import os import copy import logging import cv2 from PIL import Image, ImageDraw, ImageEnhance from pycocotools import mask import pickle import threading MUTEX = threading.Lock() import paddle from ppdet.core.workspace import serializable from ..reader import Compose from .op_helper import (satisfy_sample_constraint, filter_and_process, generate_sample_bbox, clip_bbox, data_anchor_sampling, satisfy_sample_constraint_coverage, crop_image_sampling, generate_sample_bbox_square, bbox_area_sampling, is_poly, get_border) from ppdet.utils.logger import setup_logger from ppdet.utils.compact import imagedraw_textsize_c from ppdet.modeling.keypoint_utils import get_affine_transform, affine_transform logger = setup_logger(__name__) registered_ops = [] def register_op(cls): registered_ops.append(cls.__name__) if not hasattr(BaseOperator, cls.__name__): setattr(BaseOperator, cls.__name__, cls) else: raise KeyError("The {} class has been registered.".format(cls.__name__)) return serializable(cls) class BboxError(ValueError): pass class ImageError(ValueError): pass class BaseOperator(object): def __init__(self, name=None): if name is None: name = self.__class__.__name__ self._id = name + '_' + str(uuid.uuid4())[-6:] def apply(self, sample, context=None): """ Process a sample. Args: sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} context (dict): info about this sample processing Returns: result (dict): a processed sample """ return sample def __call__(self, sample, context=None): """ Process a sample. 
Args: sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} context (dict): info about this sample processing Returns: result (dict): a processed sample """ if isinstance(sample, Sequence): for i in range(len(sample)): sample[i] = self.apply(sample[i], context) else: sample = self.apply(sample, context) return sample def __str__(self): return str(self._id) @register_op class Decode(BaseOperator): def __init__(self, rtn_im_file=False): """ Transform the image data to numpy format following the rgb format """ super(Decode, self).__init__() self.rtn_im_file = rtn_im_file def apply(self, sample, context=None): """ load image if 'im_file' field is not empty but 'image' is""" if 'image' not in sample: with open(sample['im_file'], 'rb') as f: sample['image'] = f.read() if not self.rtn_im_file: sample.pop('im_file') try: im = sample['image'] data = np.frombuffer(im, dtype='uint8') im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode if 'keep_ori_im' in sample and sample['keep_ori_im']: sample['ori_image'] = im im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) except: im = sample['image'] sample['image'] = im if 'h' not in sample: sample['h'] = im.shape[0] elif sample['h'] != im.shape[0]: logger.warning( "The actual image height: {} is not equal to the " "height: {} in annotation, and update sample['h'] by actual " "image height.".format(im.shape[0], sample['h'])) sample['h'] = im.shape[0] if 'w' not in sample: sample['w'] = im.shape[1] elif sample['w'] != im.shape[1]: logger.warning( "The actual image width: {} is not equal to the " "width: {} in annotation, and update sample['w'] by actual " "image width.".format(im.shape[1], sample['w'])) sample['w'] = im.shape[1] sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) return sample def _make_dirs(dirname): try: from pathlib import Path except ImportError: from pathlib2 import Path Path(dirname).mkdir(exist_ok=True) @register_op class DecodeCache(BaseOperator): def __init__(self, cache_root=None): '''decode image and caching ''' super(DecodeCache, self).__init__() self.use_cache = False if cache_root is None else True self.cache_root = cache_root if cache_root is not None: _make_dirs(cache_root) def apply(self, sample, context=None): if self.use_cache and os.path.exists( self.cache_path(self.cache_root, sample['im_file'])): path = self.cache_path(self.cache_root, sample['im_file']) im = self.load(path) else: if 'image' not in sample: with open(sample['im_file'], 'rb') as f: sample['image'] = f.read() im = sample['image'] data = np.frombuffer(im, dtype='uint8') im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode if 'keep_ori_im' in sample and sample['keep_ori_im']: sample['ori_image'] = im im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) if self.use_cache and not os.path.exists( self.cache_path(self.cache_root, sample['im_file'])): path = self.cache_path(self.cache_root, sample['im_file']) self.dump(im, path) sample['image'] = im sample['h'] = im.shape[0] sample['w'] = im.shape[1] sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) sample.pop('im_file') return sample @staticmethod def cache_path(dir_oot, im_file): return os.path.join(dir_oot, os.path.basename(im_file) + '.pkl') @staticmethod def load(path): with open(path, 'rb') as f: im = pickle.load(f) return im @staticmethod def dump(obj, path): MUTEX.acquire() try: with open(path, 'wb') as f: pickle.dump(obj, f) except Exception as e: 
logger.warning('dump {} occurs exception {}'.format(path, str(e))) finally: MUTEX.release() @register_op class SniperDecodeCrop(BaseOperator): def __init__(self): super(SniperDecodeCrop, self).__init__() def __call__(self, sample, context=None): if 'image' not in sample: with open(sample['im_file'], 'rb') as f: sample['image'] = f.read() sample.pop('im_file') im = sample['image'] data = np.frombuffer(im, dtype='uint8') im = cv2.imdecode(data, cv2.IMREAD_COLOR) # BGR mode, but need RGB mode if 'keep_ori_im' in sample and sample['keep_ori_im']: sample['ori_image'] = im im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) chip = sample['chip'] x1, y1, x2, y2 = [int(xi) for xi in chip] im = im[max(y1, 0):min(y2, im.shape[0]), max(x1, 0):min(x2, im.shape[ 1]), :] sample['image'] = im h = im.shape[0] w = im.shape[1] # sample['im_info'] = [h, w, 1.0] sample['h'] = h sample['w'] = w sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) return sample @register_op class Permute(BaseOperator): def __init__(self): """ Change the channel to be (C, H, W) """ super(Permute, self).__init__() def apply(self, sample, context=None): im = sample['image'] im = im.transpose((2, 0, 1)) sample['image'] = im if 'pre_image' in sample: pre_im = sample['pre_image'] pre_im = pre_im.transpose((2, 0, 1)) sample['pre_image'] = pre_im return sample @register_op class Lighting(BaseOperator): """ Lighting the image by eigenvalues and eigenvectors Args: eigval (list): eigenvalues eigvec (list): eigenvectors alphastd (float): random weight of lighting, 0.1 by default """ def __init__(self, eigval, eigvec, alphastd=0.1): super(Lighting, self).__init__() self.alphastd = alphastd self.eigval = np.array(eigval).astype('float32') self.eigvec = np.array(eigvec).astype('float32') def apply(self, sample, context=None): alpha = np.random.normal(scale=self.alphastd, size=(3, )) sample['image'] += np.dot(self.eigvec, self.eigval * alpha) if 'pre_image' in sample: sample['pre_image'] += np.dot(self.eigvec, self.eigval * alpha) return sample @register_op class RandomErasingImage(BaseOperator): def __init__(self, prob=0.5, lower=0.02, higher=0.4, aspect_ratio=0.3): """ Random Erasing Data Augmentation, see https://arxiv.org/abs/1708.04896 Args: prob (float): probability to carry out random erasing lower (float): lower limit of the erasing area ratio higher (float): upper limit of the erasing area ratio aspect_ratio (float): aspect ratio of the erasing region """ super(RandomErasingImage, self).__init__() self.prob = prob self.lower = lower self.higher = higher self.aspect_ratio = aspect_ratio def apply(self, sample, context=None): gt_bbox = sample['gt_bbox'] im = sample['image'] if not isinstance(im, np.ndarray): raise TypeError("{}: image is not a numpy array.".format(self)) if len(im.shape) != 3: raise ImageError("{}: image is not 3-dimensional.".format(self)) for idx in range(gt_bbox.shape[0]): if self.prob <= np.random.rand(): continue x1, y1, x2, y2 = gt_bbox[idx, :] w_bbox = x2 - x1 h_bbox = y2 - y1 area = w_bbox * h_bbox target_area = random.uniform(self.lower, self.higher) * area aspect_ratio = random.uniform(self.aspect_ratio, 1 / self.aspect_ratio) h = int(round(math.sqrt(target_area * aspect_ratio))) w = int(round(math.sqrt(target_area / aspect_ratio))) if w < w_bbox and h < h_bbox: off_y1 = random.randint(0, int(h_bbox - h)) off_x1 = random.randint(0, int(w_bbox - w)) im[int(y1 + off_y1):int(y1 + off_y1 + h), int(x1 + off_x1):int( x1 + off_x1 + w), :] = 0 sample['image'] 
= im
        return sample


@register_op
class NormalizeImage(BaseOperator):
    def __init__(self, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225],
                 is_scale=True, norm_type='mean_std'):
        """
        Args:
            mean (list): the pixel mean
            std (list): the pixel standard deviation
            is_scale (bool): scale the pixel to [0,1]
            norm_type (str): type in ['mean_std', 'none']
        """
        super(NormalizeImage, self).__init__()
        self.mean = mean
        self.std = std
        self.is_scale = is_scale
        self.norm_type = norm_type
        if not (isinstance(self.mean, list) and isinstance(self.std, list) and
                isinstance(self.is_scale, bool) and
                self.norm_type in ['mean_std', 'none']):
            raise TypeError("{}: input type is invalid.".format(self))
        from functools import reduce
        if reduce(lambda x, y: x * y, self.std) == 0:
            raise ValueError('{}: std is invalid!'.format(self))

    def apply(self, sample, context=None):
        """Normalize the image.
        Operators:
            1.(optional) Scale the pixel to [0,1]
            2.(optional) Each pixel minus mean and is divided by std
        """
        im = sample['image']
        im = im.astype(np.float32, copy=False)
        if self.is_scale:
            scale = 1.0 / 255.0
            im *= scale
        if self.norm_type == 'mean_std':
            mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
            std = np.array(self.std)[np.newaxis, np.newaxis, :]
            im -= mean
            im /= std
        sample['image'] = im

        if 'pre_image' in sample:
            pre_im = sample['pre_image']
            pre_im = pre_im.astype(np.float32, copy=False)
            if self.is_scale:
                scale = 1.0 / 255.0
                pre_im *= scale
            if self.norm_type == 'mean_std':
                mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
                std = np.array(self.std)[np.newaxis, np.newaxis, :]
                pre_im -= mean
                pre_im /= std
            sample['pre_image'] = pre_im
        return sample


@register_op
class GridMask(BaseOperator):
    def __init__(self, use_h=True, use_w=True, rotate=1, offset=False,
                 ratio=0.5, mode=1, prob=0.7, upper_iter=360000):
        """
        GridMask Data Augmentation, see https://arxiv.org/abs/2001.04086

        Args:
            use_h (bool): whether to mask vertically
            use_w (bool): whether to mask horizontally
            rotate (float): angle for the mask to rotate
            offset (float): mask offset
            ratio (float): mask ratio
            mode (int): gridmask mode
            prob (float): max probability to carry out gridmask
            upper_iter (int): suggested to be equal to global max_iter
        """
        super(GridMask, self).__init__()
        self.use_h = use_h
        self.use_w = use_w
        self.rotate = rotate
        self.offset = offset
        self.ratio = ratio
        self.mode = mode
        self.prob = prob
        self.upper_iter = upper_iter
        from .gridmask_utils import Gridmask
        self.gridmask_op = Gridmask(
            use_h, use_w, rotate=rotate, offset=offset, ratio=ratio,
            mode=mode, prob=prob, upper_iter=upper_iter)

    def apply(self, sample, context=None):
        sample['image'] = self.gridmask_op(sample['image'],
                                           sample['curr_iter'])
        return sample


@register_op
class RandomDistort(BaseOperator):
    """Random color distortion.

    Args:
        hue (list): hue settings. in [lower, upper, probability] format.
        saturation (list): saturation settings. in [lower, upper, probability] format.
        contrast (list): contrast settings. in [lower, upper, probability] format.
        brightness (list): brightness settings. in [lower, upper, probability] format.
        random_apply (bool): whether to apply in random (yolo) or fixed (SSD) order.
        count (int): the number of distortions to apply.
        random_channel (bool): whether to swap channels randomly.
        prob (float): the probability of enhancing the sample.
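
    Example (an illustrative sketch; expects an HWC image array in
    sample['image'] with values in [0, 255]):

        import numpy as np
        op = RandomDistort()
        sample = {'image': np.random.randint(0, 255, (480, 640, 3)).astype(np.float32)}
        sample = op(sample)  # 'image' comes back as float32 after the PIL-based distortions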
""" def __init__(self, hue=[-18, 18, 0.5], saturation=[0.5, 1.5, 0.5], contrast=[0.5, 1.5, 0.5], brightness=[0.5, 1.5, 0.5], random_apply=True, count=4, random_channel=False, prob=1.0): super(RandomDistort, self).__init__() self.hue = hue self.saturation = saturation self.contrast = contrast self.brightness = brightness self.random_apply = random_apply self.count = count self.random_channel = random_channel self.prob = prob def apply_hue(self, img): low, high, prob = self.hue if np.random.uniform(0., 1.) < prob: return img delta = np.random.uniform(low, high) img = np.array(img.convert('HSV')) img[:, :, 0] = img[:, :, 0] + delta img = Image.fromarray(img, mode='HSV').convert('RGB') return img def apply_saturation(self, img): low, high, prob = self.saturation if np.random.uniform(0., 1.) < prob: return img delta = np.random.uniform(low, high) img = ImageEnhance.Color(img).enhance(delta) return img def apply_contrast(self, img): low, high, prob = self.contrast if np.random.uniform(0., 1.) < prob: return img delta = np.random.uniform(low, high) img = ImageEnhance.Contrast(img).enhance(delta) return img def apply_brightness(self, img): low, high, prob = self.brightness if np.random.uniform(0., 1.) < prob: return img delta = np.random.uniform(low, high) img = ImageEnhance.Brightness(img).enhance(delta) return img def apply(self, sample, context=None): if random.random() > self.prob: return sample img = sample['image'] img = Image.fromarray(img.astype(np.uint8)) if self.random_apply: functions = [ self.apply_brightness, self.apply_contrast, self.apply_saturation, self.apply_hue ] distortions = np.random.permutation(functions)[:self.count] for func in distortions: img = func(img) img = np.asarray(img).astype(np.float32) sample['image'] = img return sample img = self.apply_brightness(img) mode = np.random.randint(0, 2) if mode: img = self.apply_contrast(img) img = self.apply_saturation(img) img = self.apply_hue(img) if not mode: img = self.apply_contrast(img) img = np.asarray(img).astype(np.float32) if self.random_channel: if np.random.randint(0, 2): img = img[..., np.random.permutation(3)] sample['image'] = img return sample @register_op class PhotoMetricDistortion(BaseOperator): """Apply photometric distortion to image sequentially, every transformation is applied with a probability of 0.5. The position of random contrast is in second or second to last. 1. random brightness 2. random contrast (mode 0) 3. convert color from BGR to HSV 4. random saturation 5. random hue 6. convert color from HSV to BGR 7. random contrast (mode 1) 8. randomly swap channels Args: brightness_delta (int): delta of brightness. contrast_range (tuple): range of contrast. saturation_range (tuple): range of saturation. hue_delta (int): delta of hue. """ def __init__(self, brightness_delta=32, contrast_range=(0.5, 1.5), saturation_range=(0.5, 1.5), hue_delta=18): super(PhotoMetricDistortion, self).__init__() self.brightness_delta = brightness_delta self.contrast_lower, self.contrast_upper = contrast_range self.saturation_lower, self.saturation_upper = saturation_range self.hue_delta = hue_delta def apply(self, results, context=None): """Call function to perform photometric distortion on images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Result dict with images distorted. 
""" img = results['image'] img = img.astype(np.float32) # random brightness if np.random.randint(2): delta = np.random.uniform(-self.brightness_delta, self.brightness_delta) img += delta # mode == 0 --> do random contrast first # mode == 1 --> do random contrast last mode = np.random.randint(2) if mode == 1: if np.random.randint(2): alpha = np.random.uniform(self.contrast_lower, self.contrast_upper) img *= alpha # convert color from BGR to HSV img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) # random saturation if np.random.randint(2): img[..., 1] *= np.random.uniform(self.saturation_lower, self.saturation_upper) # random hue if np.random.randint(2): img[..., 0] += np.random.uniform(-self.hue_delta, self.hue_delta) img[..., 0][img[..., 0] > 360] -= 360 img[..., 0][img[..., 0] < 0] += 360 # convert color from HSV to BGR img = cv2.cvtColor(img, cv2.COLOR_HSV2BGR) # random contrast if mode == 0: if np.random.randint(2): alpha = np.random.uniform(self.contrast_lower, self.contrast_upper) img *= alpha # randomly swap channels if np.random.randint(2): img = img[..., np.random.permutation(3)] results['image'] = img return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' repr_str += 'contrast_range=' repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' repr_str += 'saturation_range=' repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' repr_str += f'hue_delta={self.hue_delta})' return repr_str @register_op class AutoAugment(BaseOperator): def __init__(self, autoaug_type="v1"): """ Args: autoaug_type (str): autoaug type, support v0, v1, v2, v3, test """ super(AutoAugment, self).__init__() self.autoaug_type = autoaug_type def apply(self, sample, context=None): """ Learning Data Augmentation Strategies for Object Detection, see https://arxiv.org/abs/1906.11172 """ im = sample['image'] gt_bbox = sample['gt_bbox'] if not isinstance(im, np.ndarray): raise TypeError("{}: image is not a numpy array.".format(self)) if len(im.shape) != 3: raise ImageError("{}: image is not 3-dimensional.".format(self)) if len(gt_bbox) == 0: return sample height, width, _ = im.shape norm_gt_bbox = np.ones_like(gt_bbox, dtype=np.float32) norm_gt_bbox[:, 0] = gt_bbox[:, 1] / float(height) norm_gt_bbox[:, 1] = gt_bbox[:, 0] / float(width) norm_gt_bbox[:, 2] = gt_bbox[:, 3] / float(height) norm_gt_bbox[:, 3] = gt_bbox[:, 2] / float(width) from .autoaugment_utils import distort_image_with_autoaugment im, norm_gt_bbox = distort_image_with_autoaugment(im, norm_gt_bbox, self.autoaug_type) gt_bbox[:, 0] = norm_gt_bbox[:, 1] * float(width) gt_bbox[:, 1] = norm_gt_bbox[:, 0] * float(height) gt_bbox[:, 2] = norm_gt_bbox[:, 3] * float(width) gt_bbox[:, 3] = norm_gt_bbox[:, 2] * float(height) sample['image'] = im sample['gt_bbox'] = gt_bbox return sample @register_op class RandomFlip(BaseOperator): def __init__(self, prob=0.5): """ Args: prob (float): the probability of flipping image """ super(RandomFlip, self).__init__() self.prob = prob if not (isinstance(self.prob, float)): raise TypeError("{}: input type is invalid.".format(self)) def apply_segm(self, segms, height, width): def _flip_poly(poly, width): flipped_poly = np.array(poly) flipped_poly[0::2] = width - np.array(poly[0::2]) return flipped_poly.tolist() def _flip_rle(rle, height, width): if 'counts' in rle and type(rle['counts']) == list: rle = mask_util.frPyObjects(rle, height, width) mask = mask_util.decode(rle) mask = mask[:, ::-1] rle = mask_util.encode(np.array(mask, order='F', 
dtype=np.uint8)) return rle flipped_segms = [] for segm in segms: if is_poly(segm): # Polygon format flipped_segms.append([_flip_poly(poly, width) for poly in segm]) else: # RLE format import pycocotools.mask as mask_util flipped_segms.append(_flip_rle(segm, height, width)) return flipped_segms def apply_keypoint(self, gt_keypoint, width): for i in range(gt_keypoint.shape[1]): if i % 2 == 0: old_x = gt_keypoint[:, i].copy() gt_keypoint[:, i] = width - old_x return gt_keypoint def apply_image(self, image): return image[:, ::-1, :] def apply_bbox(self, bbox, width): oldx1 = bbox[:, 0].copy() oldx2 = bbox[:, 2].copy() bbox[:, 0] = width - oldx2 bbox[:, 2] = width - oldx1 return bbox def apply(self, sample, context=None): """Filp the image and bounding box. Operators: 1. Flip the image numpy. 2. Transform the bboxes' x coordinates. (Must judge whether the coordinates are normalized!) 3. Transform the segmentations' x coordinates. (Must judge whether the coordinates are normalized!) Output: sample: the image, bounding box and segmentation part in sample are flipped. """ if np.random.uniform(0, 1) < self.prob: im = sample['image'] height, width = im.shape[:2] im = self.apply_image(im) if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], width) if 'gt_poly' in sample and len(sample['gt_poly']) > 0: sample['gt_poly'] = self.apply_segm(sample['gt_poly'], height, width) if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0: sample['gt_keypoint'] = self.apply_keypoint( sample['gt_keypoint'], width) if 'semantic' in sample and sample['semantic']: sample['semantic'] = sample['semantic'][:, ::-1] if 'gt_segm' in sample and sample['gt_segm'].any(): sample['gt_segm'] = sample['gt_segm'][:, :, ::-1] sample['flipped'] = True sample['image'] = im return sample @register_op class Resize(BaseOperator): def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR): """ Resize image to target size. if keep_ratio is True, resize the image's long side to the maximum of target_size if keep_ratio is False, resize the image to target size(h, w) Args: target_size (int|list): image target size keep_ratio (bool): whether keep_ratio or not, default true interp (int): the interpolation method """ super(Resize, self).__init__() self.keep_ratio = keep_ratio self.interp = interp if not isinstance(target_size, (Integral, Sequence)): raise TypeError( "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". 
format(type(target_size))) if isinstance(target_size, Integral): target_size = [target_size, target_size] self.target_size = target_size def apply_image(self, image, scale): im_scale_x, im_scale_y = scale return cv2.resize( image, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) def apply_bbox(self, bbox, scale, size): im_scale_x, im_scale_y = scale resize_w, resize_h = size bbox[:, 0::2] *= im_scale_x bbox[:, 1::2] *= im_scale_y bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w) bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h) return bbox def apply_area(self, area, scale): im_scale_x, im_scale_y = scale return area * im_scale_x * im_scale_y def apply_joints(self, joints, scale, size): im_scale_x, im_scale_y = scale resize_w, resize_h = size joints[..., 0] *= im_scale_x joints[..., 1] *= im_scale_y joints[..., 0] = np.clip(joints[..., 0], 0, resize_w) joints[..., 1] = np.clip(joints[..., 1], 0, resize_h) return joints def apply_segm(self, segms, im_size, scale): def _resize_poly(poly, im_scale_x, im_scale_y): resized_poly = np.array(poly).astype('float32') resized_poly[0::2] *= im_scale_x resized_poly[1::2] *= im_scale_y return resized_poly.tolist() def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y): if 'counts' in rle and type(rle['counts']) == list: rle = mask_util.frPyObjects(rle, im_h, im_w) mask = mask_util.decode(rle) mask = cv2.resize( mask, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) return rle im_h, im_w = im_size im_scale_x, im_scale_y = scale resized_segms = [] for segm in segms: if is_poly(segm): # Polygon format resized_segms.append([ _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm ]) else: # RLE format import pycocotools.mask as mask_util resized_segms.append( _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y)) return resized_segms def apply(self, sample, context=None): """ Resize the image numpy. 
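
        Example (illustrative only):

            import numpy as np
            op = Resize(target_size=[800, 1333], keep_ratio=True)
            sample = {'image': np.zeros((480, 640, 3), dtype=np.uint8)}
            sample = op(sample)  # also sets 'im_shape' and 'scale_factor'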
""" im = sample['image'] if not isinstance(im, np.ndarray): raise TypeError("{}: image type is not numpy.".format(self)) # apply image if len(im.shape) == 3: im_shape = im.shape else: im_shape = im[0].shape if self.keep_ratio: im_size_min = np.min(im_shape[0:2]) im_size_max = np.max(im_shape[0:2]) target_size_min = np.min(self.target_size) target_size_max = np.max(self.target_size) im_scale = min(target_size_min / im_size_min, target_size_max / im_size_max) resize_h = int(im_scale * float(im_shape[0]) + 0.5) resize_w = int(im_scale * float(im_shape[1]) + 0.5) else: resize_h, resize_w = self.target_size im_scale_y = resize_h / im_shape[0] im_scale_x = resize_w / im_shape[1] if len(im.shape) == 3: im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) sample['image'] = im.astype(np.float32) else: resized_images = [] for one_im in im: applied_im = self.apply_image(one_im, [im_scale_x, im_scale_y]) resized_images.append(applied_im) sample['image'] = np.array(resized_images) # 2d keypoints resize if 'kps2d' in sample.keys(): kps2d = sample['kps2d'] kps2d[:, :, 0] = kps2d[:, :, 0] * im_scale_x kps2d[:, :, 1] = kps2d[:, :, 1] * im_scale_y sample['kps2d'] = kps2d sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) if 'scale_factor' in sample: scale_factor = sample['scale_factor'] sample['scale_factor'] = np.asarray( [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], dtype=np.float32) else: sample['scale_factor'] = np.asarray( [im_scale_y, im_scale_x], dtype=np.float32) # apply bbox if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], [im_scale_x, im_scale_y], [resize_w, resize_h]) # apply areas if 'gt_areas' in sample: sample['gt_areas'] = self.apply_area(sample['gt_areas'], [im_scale_x, im_scale_y]) # apply polygon if 'gt_poly' in sample and len(sample['gt_poly']) > 0: sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2], [im_scale_x, im_scale_y]) # apply semantic if 'semantic' in sample and sample['semantic']: semantic = sample['semantic'] semantic = cv2.resize( semantic.astype('float32'), None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) semantic = np.asarray(semantic).astype('int32') semantic = np.expand_dims(semantic, 0) sample['semantic'] = semantic # apply gt_segm if 'gt_segm' in sample and len(sample['gt_segm']) > 0: masks = [ cv2.resize( gt_segm, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=cv2.INTER_NEAREST) for gt_segm in sample['gt_segm'] ] sample['gt_segm'] = np.asarray(masks).astype(np.uint8) if 'gt_joints' in sample: sample['gt_joints'] = self.apply_joints(sample['gt_joints'], [im_scale_x, im_scale_y], [resize_w, resize_h]) return sample @register_op class MultiscaleTestResize(BaseOperator): def __init__(self, origin_target_size=[800, 1333], target_size=[], interp=cv2.INTER_LINEAR, use_flip=True): """ Rescale image to the each size in target size, and capped at max_size. Args: origin_target_size (list): origin target size of image target_size (list): A list of target sizes of image. interp (int): the interpolation method. use_flip (bool): whether use flip augmentation. """ super(MultiscaleTestResize, self).__init__() self.interp = interp self.use_flip = use_flip if not isinstance(target_size, Sequence): raise TypeError( "Type of target_size is invalid. Must be List or Tuple, now is {}". 
format(type(target_size))) self.target_size = target_size if not isinstance(origin_target_size, Sequence): raise TypeError( "Type of origin_target_size is invalid. Must be List or Tuple, now is {}". format(type(origin_target_size))) self.origin_target_size = origin_target_size def apply(self, sample, context=None): """ Resize the image numpy for multi-scale test. """ samples = [] resizer = Resize( self.origin_target_size, keep_ratio=True, interp=self.interp) samples.append(resizer(sample.copy(), context)) if self.use_flip: flipper = RandomFlip(1.1) samples.append(flipper(sample.copy(), context=context)) for size in self.target_size: resizer = Resize(size, keep_ratio=True, interp=self.interp) samples.append(resizer(sample.copy(), context)) return samples @register_op class RandomResize(BaseOperator): def __init__(self, target_size, keep_ratio=True, interp=cv2.INTER_LINEAR, random_range=False, random_size=True, random_interp=False): """ Resize image to target size randomly. random target_size and interpolation method Args: target_size (int, list, tuple): image target size, if random size is True, must be list or tuple keep_ratio (bool): whether keep_raio or not, default true interp (int): the interpolation method random_range (bool): whether random select target size of image, the target_size must be a [[min_short_edge, long_edge], [max_short_edge, long_edge]] random_size (bool): whether random select target size of image random_interp (bool): whether random select interpolation method """ super(RandomResize, self).__init__() self.keep_ratio = keep_ratio self.interp = interp self.interps = [ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_AREA, cv2.INTER_CUBIC, cv2.INTER_LANCZOS4, ] assert isinstance(target_size, ( Integral, Sequence)), "target_size must be Integer, List or Tuple" if (random_range or random_size) and not isinstance(target_size, Sequence): raise TypeError( "Type of target_size is invalid when random_size or random_range is True. Must be List or Tuple, now is {}". format(type(target_size))) if random_range and not len(target_size) == 2: raise TypeError( "target_size must be two list as [[min_short_edge, long_edge], [max_short_edge, long_edge]] when random_range is True." ) self.target_size = target_size self.random_range = random_range self.random_size = random_size self.random_interp = random_interp def apply(self, sample, context=None): """ Resize the image numpy. """ if self.random_range: short_edge = np.random.randint(self.target_size[0][0], self.target_size[1][0] + 1) long_edge = max(self.target_size[0][1], self.target_size[1][1] + 1) target_size = [short_edge, long_edge] else: if self.random_size: target_size = random.choice(self.target_size) else: target_size = self.target_size if self.random_interp: interp = random.choice(self.interps) else: interp = self.interp resizer = Resize(target_size, self.keep_ratio, interp) return resizer(sample, context=context) @register_op class RandomExpand(BaseOperator): """Random expand the canvas. Args: ratio (float): maximum expansion ratio. prob (float): probability to expand. fill_value (list): color value used to fill the canvas. in RGB order. 
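
    Example (illustrative; note that, as implemented, the sample is returned
    unchanged when uniform(0, 1) < prob, so a smaller prob means expansion
    happens more often, and prob=0. forces it; the actual canvas filling is
    done by the Pad op defined later in this file):

        import numpy as np
        op = RandomExpand(ratio=2., prob=0.)
        sample = {'image': np.zeros((100, 100, 3), dtype=np.float32),
                  'gt_bbox': np.array([[10., 10., 50., 50.]], 'float32'),
                  'gt_class': np.array([[0]], 'int32')}
        sample = op(sample)  # canvas grows up to ratio x, boxes shifted by the offset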
""" def __init__(self, ratio=4., prob=0.5, fill_value=(127.5, 127.5, 127.5)): super(RandomExpand, self).__init__() assert ratio > 1.01, "expand ratio must be larger than 1.01" self.ratio = ratio self.prob = prob assert isinstance(fill_value, (Number, Sequence)), \ "fill value must be either float or sequence" if isinstance(fill_value, Number): fill_value = (fill_value, ) * 3 if not isinstance(fill_value, tuple): fill_value = tuple(fill_value) self.fill_value = fill_value def apply(self, sample, context=None): if np.random.uniform(0., 1.) < self.prob: return sample im = sample['image'] height, width = im.shape[:2] ratio = np.random.uniform(1., self.ratio) h = int(height * ratio) w = int(width * ratio) if not h > height or not w > width: return sample y = np.random.randint(0, h - height) x = np.random.randint(0, w - width) offsets, size = [x, y], [h, w] pad = Pad(size, pad_mode=-1, offsets=offsets, fill_value=self.fill_value) return pad(sample, context=context) @register_op class CropWithSampling(BaseOperator): def __init__(self, batch_sampler, satisfy_all=False, avoid_no_bbox=True): """ Args: batch_sampler (list): Multiple sets of different parameters for cropping. satisfy_all (bool): whether all boxes must satisfy. e.g.[[1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0], [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0], [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 1.0], [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 1.0], [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 1.0], [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 1.0], [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]] [max sample, max trial, min scale, max scale, min aspect ratio, max aspect ratio, min overlap, max overlap] avoid_no_bbox (bool): whether to avoid the situation where the box does not appear. """ super(CropWithSampling, self).__init__() self.batch_sampler = batch_sampler self.satisfy_all = satisfy_all self.avoid_no_bbox = avoid_no_bbox def apply(self, sample, context): """ Crop the image and modify bounding box. Operators: 1. Scale the image width and height. 2. Crop the image according to a radom sample. 3. Rescale the bounding box. 4. Determine if the new bbox is satisfied in the new image. Returns: sample: the image, bounding box are replaced. 
""" assert 'image' in sample, "image data not found" im = sample['image'] gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] im_height, im_width = im.shape[:2] gt_score = None if 'gt_score' in sample: gt_score = sample['gt_score'] sampled_bbox = [] gt_bbox = gt_bbox.tolist() for sampler in self.batch_sampler: found = 0 for i in range(sampler[1]): if found >= sampler[0]: break sample_bbox = generate_sample_bbox(sampler) if satisfy_sample_constraint(sampler, sample_bbox, gt_bbox, self.satisfy_all): sampled_bbox.append(sample_bbox) found = found + 1 im = np.array(im) while sampled_bbox: idx = int(np.random.uniform(0, len(sampled_bbox))) sample_bbox = sampled_bbox.pop(idx) sample_bbox = clip_bbox(sample_bbox) crop_bbox, crop_class, crop_score = \ filter_and_process(sample_bbox, gt_bbox, gt_class, scores=gt_score) if self.avoid_no_bbox: if len(crop_bbox) < 1: continue xmin = int(sample_bbox[0] * im_width) xmax = int(sample_bbox[2] * im_width) ymin = int(sample_bbox[1] * im_height) ymax = int(sample_bbox[3] * im_height) im = im[ymin:ymax, xmin:xmax] sample['image'] = im sample['gt_bbox'] = crop_bbox sample['gt_class'] = crop_class sample['gt_score'] = crop_score return sample return sample @register_op class CropWithDataAchorSampling(BaseOperator): def __init__(self, batch_sampler, anchor_sampler=None, target_size=None, das_anchor_scales=[16, 32, 64, 128], sampling_prob=0.5, min_size=8., avoid_no_bbox=True): """ Args: anchor_sampler (list): anchor_sampling sets of different parameters for cropping. batch_sampler (list): Multiple sets of different parameters for cropping. e.g.[[1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0]] [[1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]] [max sample, max trial, min scale, max scale, min aspect ratio, max aspect ratio, min overlap, max overlap, min coverage, max coverage] target_size (int): target image size. das_anchor_scales (list[float]): a list of anchor scales in data anchor smapling. min_size (float): minimum size of sampled bbox. avoid_no_bbox (bool): whether to avoid the situation where the box does not appear. """ super(CropWithDataAchorSampling, self).__init__() self.anchor_sampler = anchor_sampler self.batch_sampler = batch_sampler self.target_size = target_size self.sampling_prob = sampling_prob self.min_size = min_size self.avoid_no_bbox = avoid_no_bbox self.das_anchor_scales = np.array(das_anchor_scales) def apply(self, sample, context): """ Crop the image and modify bounding box. Operators: 1. Scale the image width and height. 2. Crop the image according to a radom sample. 3. Rescale the bounding box. 4. Determine if the new bbox is satisfied in the new image. Returns: sample: the image, bounding box are replaced. """ assert 'image' in sample, "image data not found" im = sample['image'] gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] image_height, image_width = im.shape[:2] gt_bbox[:, 0] /= image_width gt_bbox[:, 1] /= image_height gt_bbox[:, 2] /= image_width gt_bbox[:, 3] /= image_height gt_score = None if 'gt_score' in sample: gt_score = sample['gt_score'] sampled_bbox = [] gt_bbox = gt_bbox.tolist() prob = np.random.uniform(0., 1.) 
if prob > self.sampling_prob: # anchor sampling assert self.anchor_sampler for sampler in self.anchor_sampler: found = 0 for i in range(sampler[1]): if found >= sampler[0]: break sample_bbox = data_anchor_sampling( gt_bbox, image_width, image_height, self.das_anchor_scales, self.target_size) if sample_bbox == 0: break if satisfy_sample_constraint_coverage(sampler, sample_bbox, gt_bbox): sampled_bbox.append(sample_bbox) found = found + 1 im = np.array(im) while sampled_bbox: idx = int(np.random.uniform(0, len(sampled_bbox))) sample_bbox = sampled_bbox.pop(idx) if 'gt_keypoint' in sample.keys(): keypoints = (sample['gt_keypoint'], sample['keypoint_ignore']) crop_bbox, crop_class, crop_score, gt_keypoints = \ filter_and_process(sample_bbox, gt_bbox, gt_class, scores=gt_score, keypoints=keypoints) else: crop_bbox, crop_class, crop_score = filter_and_process( sample_bbox, gt_bbox, gt_class, scores=gt_score) crop_bbox, crop_class, crop_score = bbox_area_sampling( crop_bbox, crop_class, crop_score, self.target_size, self.min_size) if self.avoid_no_bbox: if len(crop_bbox) < 1: continue im = crop_image_sampling(im, sample_bbox, image_width, image_height, self.target_size) height, width = im.shape[:2] crop_bbox[:, 0] *= width crop_bbox[:, 1] *= height crop_bbox[:, 2] *= width crop_bbox[:, 3] *= height sample['image'] = im sample['gt_bbox'] = crop_bbox sample['gt_class'] = crop_class if 'gt_score' in sample: sample['gt_score'] = crop_score if 'gt_keypoint' in sample.keys(): sample['gt_keypoint'] = gt_keypoints[0] sample['keypoint_ignore'] = gt_keypoints[1] return sample return sample else: for sampler in self.batch_sampler: found = 0 for i in range(sampler[1]): if found >= sampler[0]: break sample_bbox = generate_sample_bbox_square( sampler, image_width, image_height) if satisfy_sample_constraint_coverage(sampler, sample_bbox, gt_bbox): sampled_bbox.append(sample_bbox) found = found + 1 im = np.array(im) while sampled_bbox: idx = int(np.random.uniform(0, len(sampled_bbox))) sample_bbox = sampled_bbox.pop(idx) sample_bbox = clip_bbox(sample_bbox) if 'gt_keypoint' in sample.keys(): keypoints = (sample['gt_keypoint'], sample['keypoint_ignore']) crop_bbox, crop_class, crop_score, gt_keypoints = \ filter_and_process(sample_bbox, gt_bbox, gt_class, scores=gt_score, keypoints=keypoints) else: crop_bbox, crop_class, crop_score = filter_and_process( sample_bbox, gt_bbox, gt_class, scores=gt_score) # sampling bbox according the bbox area crop_bbox, crop_class, crop_score = bbox_area_sampling( crop_bbox, crop_class, crop_score, self.target_size, self.min_size) if self.avoid_no_bbox: if len(crop_bbox) < 1: continue xmin = int(sample_bbox[0] * image_width) xmax = int(sample_bbox[2] * image_width) ymin = int(sample_bbox[1] * image_height) ymax = int(sample_bbox[3] * image_height) im = im[ymin:ymax, xmin:xmax] height, width = im.shape[:2] crop_bbox[:, 0] *= width crop_bbox[:, 1] *= height crop_bbox[:, 2] *= width crop_bbox[:, 3] *= height sample['image'] = im sample['gt_bbox'] = crop_bbox sample['gt_class'] = crop_class if 'gt_score' in sample: sample['gt_score'] = crop_score if 'gt_keypoint' in sample.keys(): sample['gt_keypoint'] = gt_keypoints[0] sample['keypoint_ignore'] = gt_keypoints[1] return sample return sample @register_op class RandomCrop(BaseOperator): """Random crop image and bboxes. Args: aspect_ratio (list): aspect ratio of cropped region. in [min, max] format. thresholds (list): iou thresholds for decide a valid bbox crop. scaling (list): ratio between a cropped region and the original image. 
in [min, max] format. num_attempts (int): number of tries before giving up. allow_no_crop (bool): allow return without actually cropping them. cover_all_box (bool): ensure all bboxes are covered in the final crop. is_mask_crop(bool): whether crop the segmentation. """ def __init__(self, aspect_ratio=[.5, 2.], thresholds=[.0, .1, .3, .5, .7, .9], scaling=[.3, 1.], num_attempts=50, allow_no_crop=True, cover_all_box=False, is_mask_crop=False, ioumode="iou", prob=1.0): super(RandomCrop, self).__init__() self.aspect_ratio = aspect_ratio self.thresholds = thresholds self.scaling = scaling self.num_attempts = num_attempts self.allow_no_crop = allow_no_crop self.cover_all_box = cover_all_box self.is_mask_crop = is_mask_crop self.ioumode = ioumode self.prob = prob def crop_segms(self, segms, valid_ids, crop, height, width): def _crop_poly(segm, crop): xmin, ymin, xmax, ymax = crop crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin] crop_p = np.array(crop_coord).reshape(4, 2) crop_p = Polygon(crop_p) crop_segm = list() for poly in segm: poly = np.array(poly).reshape(len(poly) // 2, 2) polygon = Polygon(poly) if not polygon.is_valid: exterior = polygon.exterior multi_lines = exterior.intersection(exterior) polygons = shapely.ops.polygonize(multi_lines) polygon = MultiPolygon(polygons) multi_polygon = list() if isinstance(polygon, MultiPolygon): multi_polygon = copy.deepcopy(polygon) else: multi_polygon.append(copy.deepcopy(polygon)) for per_polygon in multi_polygon: inter = per_polygon.intersection(crop_p) if not inter: continue if isinstance(inter, (MultiPolygon, GeometryCollection)): for part in inter: if not isinstance(part, Polygon): continue part = np.squeeze( np.array(part.exterior.coords[:-1]).reshape(1, -1)) part[0::2] -= xmin part[1::2] -= ymin crop_segm.append(part.tolist()) elif isinstance(inter, Polygon): crop_poly = np.squeeze( np.array(inter.exterior.coords[:-1]).reshape(1, -1)) crop_poly[0::2] -= xmin crop_poly[1::2] -= ymin crop_segm.append(crop_poly.tolist()) else: continue return crop_segm def _crop_rle(rle, crop, height, width): if 'counts' in rle and type(rle['counts']) == list: rle = mask_util.frPyObjects(rle, height, width) mask = mask_util.decode(rle) mask = mask[crop[1]:crop[3], crop[0]:crop[2]] rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) return rle crop_segms = [] for id in valid_ids: segm = self.polygon_to_rle(segms[id], height, width) if is_poly(segm): import copy import shapely.ops from shapely.geometry import Polygon, MultiPolygon, GeometryCollection logging.getLogger("shapely").setLevel(logging.WARNING) # Polygon format crop_segms.append(_crop_poly(segm, crop)) else: # RLE format import pycocotools.mask as mask_util res = _crop_rle(segm, crop, height, width) crop_segms.append(self.rle_to_polygon(res)) return crop_segms def polygon_to_rle(self, polygons, height, width): # Create an empty mask mask_img = np.zeros((height, width), dtype=np.uint8) # Fill the polygon in the mask for polygon in polygons: contour = np.array(polygon).reshape((-1, 1, 2)).astype(int) cv2.drawContours(mask_img, [contour], 0, 255, -1) # Convert binary mask to RLE rle = mask.encode(np.asfortranarray(mask_img)) return rle def rle_to_polygon(self, rle_mask, min_area=5): binary_mask = mask.decode(rle_mask).squeeze() # Find contours in the binary mask contours, _ = cv2.findContours( binary_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) polygons = [] for contour in contours: # Convert contour to polygon and filter small areas if cv2.contourArea(contour) 
>= min_area: # Flatten list and add to polygons polygon = contour.flatten().tolist() if len(polygon) > 4: polygons.append(polygon) return polygons def set_fake_bboxes(self, sample): sample['gt_bbox'] = np.array( [ [32, 32, 128, 128], [32, 32, 128, 256], [32, 64, 128, 128], [32, 64, 128, 256], [64, 64, 128, 256], [64, 64, 256, 256], [64, 32, 128, 256], [64, 32, 128, 256], [96, 32, 128, 256], [96, 32, 128, 256], ], dtype=np.float32) sample['gt_class'] = np.array( [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]], np.int32) return sample def apply(self, sample, context=None): if random.random() > self.prob: return sample if 'gt_bbox' not in sample: # only used in semi-det as unsup data sample = self.set_fake_bboxes(sample) sample = self.random_crop(sample, fake_bboxes=True) del sample['gt_bbox'] del sample['gt_class'] return sample if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0: return sample sample = self.random_crop(sample) return sample def random_crop(self, sample, fake_bboxes=False): h, w = sample['image'].shape[:2] gt_bbox = sample['gt_bbox'] # NOTE Original method attempts to generate one candidate for each # threshold then randomly sample one from the resulting list. # Here a short circuit approach is taken, i.e., randomly choose a # threshold and attempt to find a valid crop, and simply return the # first one found. # The probability is not exactly the same, kinda resembling the # "Monty Hall" problem. Actually carrying out the attempts will affect # observability (just like opening doors in the "Monty Hall" game). thresholds = list(self.thresholds) if self.allow_no_crop: thresholds.append('no_crop') np.random.shuffle(thresholds) for thresh in thresholds: if thresh == 'no_crop': return sample found = False for i in range(self.num_attempts): scale = np.random.uniform(*self.scaling) if self.aspect_ratio is not None: min_ar, max_ar = self.aspect_ratio aspect_ratio = np.random.uniform( max(min_ar, scale**2), min(max_ar, scale**-2)) h_scale = scale / np.sqrt(aspect_ratio) w_scale = scale * np.sqrt(aspect_ratio) else: h_scale = np.random.uniform(*self.scaling) w_scale = np.random.uniform(*self.scaling) crop_h = h * h_scale crop_w = w * w_scale if self.aspect_ratio is None: if crop_h / crop_w < 0.5 or crop_h / crop_w > 2.0: continue crop_h = int(crop_h) crop_w = int(crop_w) crop_y = np.random.randint(0, h - crop_h) crop_x = np.random.randint(0, w - crop_w) crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h] if self.ioumode == "iof": iou = self._gtcropiou_matrix( gt_bbox, np.array( [crop_box], dtype=np.float32)) elif self.ioumode == "iou": iou = self._iou_matrix( gt_bbox, np.array( [crop_box], dtype=np.float32)) if iou.max() < thresh: continue if self.cover_all_box and iou.min() < thresh: continue cropped_box, valid_ids = self._crop_box_with_center_constraint( gt_bbox, np.array( crop_box, dtype=np.float32)) if valid_ids.size > 0: found = True break if found: if self.is_mask_crop and 'gt_poly' in sample and len(sample[ 'gt_poly']) > 0: crop_polys = self.crop_segms( sample['gt_poly'], valid_ids, np.array( crop_box, dtype=np.int64), h, w) if [] in crop_polys: delete_id = list() valid_polys = list() for id, crop_poly in enumerate(crop_polys): if crop_poly == []: delete_id.append(id) else: valid_polys.append(crop_poly) valid_ids = np.delete(valid_ids, delete_id) if len(valid_polys) == 0: return sample sample['gt_poly'] = valid_polys else: sample['gt_poly'] = crop_polys if 'gt_segm' in sample: sample['gt_segm'] = self._crop_segm(sample['gt_segm'], crop_box) sample['gt_segm'] = 
np.take( sample['gt_segm'], valid_ids, axis=0) sample['image'] = self._crop_image(sample['image'], crop_box) if fake_bboxes == True: return sample sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0) sample['gt_class'] = np.take( sample['gt_class'], valid_ids, axis=0) if 'gt_score' in sample: sample['gt_score'] = np.take( sample['gt_score'], valid_ids, axis=0) if 'is_crowd' in sample: sample['is_crowd'] = np.take( sample['is_crowd'], valid_ids, axis=0) if 'difficult' in sample: sample['difficult'] = np.take( sample['difficult'], valid_ids, axis=0) if 'gt_joints' in sample: sample['gt_joints'] = self._crop_joints(sample['gt_joints'], crop_box) return sample return sample def _iou_matrix(self, a, b): tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2]) br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2) area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) area_o = (area_a[:, np.newaxis] + area_b - area_i) return area_i / (area_o + 1e-10) def _gtcropiou_matrix(self, a, b): tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2]) br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2) area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) area_o = (area_a[:, np.newaxis] + area_b - area_i) return area_i / (area_a + 1e-10) def _crop_box_with_center_constraint(self, box, crop): cropped_box = box.copy() cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2]) cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:]) cropped_box[:, :2] -= crop[:2] cropped_box[:, 2:] -= crop[:2] centers = (box[:, :2] + box[:, 2:]) / 2 valid = np.logical_and(crop[:2] <= centers, centers < crop[2:]).all(axis=1) valid = np.logical_and( valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1)) return cropped_box, np.where(valid)[0] def _crop_image(self, img, crop): x1, y1, x2, y2 = crop return img[y1:y2, x1:x2, :] def _crop_segm(self, segm, crop): x1, y1, x2, y2 = crop return segm[:, y1:y2, x1:x2] def _crop_joints(self, joints, crop): x1, y1, x2, y2 = crop joints[joints[..., 0] > x2, :] = 0 joints[joints[..., 1] > y2, :] = 0 joints[joints[..., 0] < x1, :] = 0 joints[joints[..., 1] < y1, :] = 0 joints[..., 0] -= x1 joints[..., 1] -= y1 return joints @register_op class RandomScaledCrop(BaseOperator): """Resize image and bbox based on long side (with optional random scaling), then crop or pad image to target size. Args: target_size (int|list): target size, "hw" format. scale_range (list): random scale range. interp (int): interpolation method, default to `cv2.INTER_LINEAR`. fill_value (float|list|tuple): color value used to fill the canvas, in RGB order. 
""" def __init__(self, target_size=512, scale_range=[.1, 2.], interp=cv2.INTER_LINEAR, fill_value=(123.675, 116.28, 103.53)): super(RandomScaledCrop, self).__init__() assert isinstance(target_size, ( Integral, Sequence)), "target_size must be Integer, List or Tuple" if isinstance(target_size, Integral): target_size = [target_size, ] * 2 self.target_size = target_size self.scale_range = scale_range self.interp = interp assert isinstance(fill_value, (Number, Sequence)), \ "fill value must be either float or sequence" if isinstance(fill_value, Number): fill_value = (fill_value, ) * 3 if not isinstance(fill_value, tuple): fill_value = tuple(fill_value) self.fill_value = fill_value def apply_image(self, img, output_size, offset_x, offset_y): th, tw = self.target_size rh, rw = output_size img = cv2.resize( img, (rw, rh), interpolation=self.interp).astype(np.float32) canvas = np.ones([th, tw, 3], dtype=np.float32) canvas *= np.array(self.fill_value, dtype=np.float32) canvas[:min(th, rh), :min(tw, rw)] = \ img[offset_y:offset_y + th, offset_x:offset_x + tw] return canvas def apply_bbox(self, gt_bbox, gt_class, scale, offset_x, offset_y): th, tw = self.target_size shift_array = np.array( [ offset_x, offset_y, ] * 2, dtype=np.float32) boxes = gt_bbox * scale - shift_array boxes[:, 0::2] = np.clip(boxes[:, 0::2], 0, tw) boxes[:, 1::2] = np.clip(boxes[:, 1::2], 0, th) # filter boxes with no area area = np.prod(boxes[..., 2:] - boxes[..., :2], axis=1) valid = (area > 1.).nonzero()[0] return boxes[valid], gt_class[valid], valid def apply_segm(self, segms, output_size, offset_x, offset_y, valid=None): th, tw = self.target_size rh, rw = output_size out_segms = [] for segm in segms: segm = cv2.resize(segm, (rw, rh), interpolation=cv2.INTER_NEAREST) segm = segm.astype(np.float32) canvas = np.zeros([th, tw], dtype=segm.dtype) canvas[:min(th, rh), :min(tw, rw)] = \ segm[offset_y:offset_y + th, offset_x:offset_x + tw] out_segms.append(canvas) out_segms = np.stack(out_segms) return out_segms if valid is None else out_segms[valid] def apply(self, sample, context=None): img = sample['image'] h, w = img.shape[:2] random_scale = np.random.uniform(*self.scale_range) target_scale_size = [t * random_scale for t in self.target_size] # Compute actual rescaling applied to image. 
scale = min(target_scale_size[0] / h, target_scale_size[1] / w) output_size = [int(round(h * scale)), int(round(w * scale))] # get offset offset_x = int( max(0, np.random.uniform(0., output_size[1] - self.target_size[1]))) offset_y = int( max(0, np.random.uniform(0., output_size[0] - self.target_size[0]))) # apply to image sample['image'] = self.apply_image(img, output_size, offset_x, offset_y) # apply to bbox valid = None if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'], sample['gt_class'], valid = self.apply_bbox( sample['gt_bbox'], sample['gt_class'], scale, offset_x, offset_y) # apply to segm if 'gt_segm' in sample and len(sample['gt_segm']) > 0: sample['gt_segm'] = self.apply_segm(sample['gt_segm'], output_size, offset_x, offset_y, valid) sample['im_shape'] = np.asarray(output_size, dtype=np.float32) scale_factor = sample['scale_factor'] sample['scale_factor'] = np.asarray( [scale_factor[0] * scale, scale_factor[1] * scale], dtype=np.float32) return sample @register_op class Cutmix(BaseOperator): def __init__(self, alpha=1.5, beta=1.5): """ CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features, see https://arxiv.org/abs/1905.04899 Cutmix image and gt_bbox/gt_score Args: alpha (float): alpha parameter of the Beta distribution beta (float): beta parameter of the Beta distribution """ super(Cutmix, self).__init__() self.alpha = alpha self.beta = beta if self.alpha <= 0.0: raise ValueError("alpha should be positive in {}".format(self)) if self.beta <= 0.0: raise ValueError("beta should be positive in {}".format(self)) def apply_image(self, img1, img2, factor): """ _rand_bbox: cut a random box from img2 and paste it onto img1 """ h = max(img1.shape[0], img2.shape[0]) w = max(img1.shape[1], img2.shape[1]) cut_rat = np.sqrt(1. - factor) cut_w = np.int32(w * cut_rat) cut_h = np.int32(h * cut_rat) # uniform cx = np.random.randint(w) cy = np.random.randint(h) bbx1 = np.clip(cx - cut_w // 2, 0, w - 1) bby1 = np.clip(cy - cut_h // 2, 0, h - 1) bbx2 = np.clip(cx + cut_w // 2, 0, w - 1) bby2 = np.clip(cy + cut_h // 2, 0, h - 1) img_1_pad = np.zeros((h, w, img1.shape[2]), 'float32') img_1_pad[:img1.shape[0], :img1.shape[1], :] = \ img1.astype('float32') img_2_pad = np.zeros((h, w, img2.shape[2]), 'float32') img_2_pad[:img2.shape[0], :img2.shape[1], :] = \ img2.astype('float32') img_1_pad[bby1:bby2, bbx1:bbx2, :] = img_2_pad[bby1:bby2, bbx1:bbx2, :] return img_1_pad def __call__(self, sample, context=None): if not isinstance(sample, Sequence): return sample assert len(sample) == 2, 'cutmix needs two samples' factor = np.random.beta(self.alpha, self.beta) factor = max(0.0, min(1.0, factor)) if factor >= 1.0: return sample[0] if factor <= 0.0: return sample[1] img1 = sample[0]['image'] img2 = sample[1]['image'] img = self.apply_image(img1, img2, factor) gt_bbox1 = sample[0]['gt_bbox'] gt_bbox2 = sample[1]['gt_bbox'] gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0) gt_class1 = sample[0]['gt_class'] gt_class2 = sample[1]['gt_class'] gt_class = np.concatenate((gt_class1, gt_class2), axis=0) gt_score1 = np.ones_like(sample[0]['gt_class']) gt_score2 = np.ones_like(sample[1]['gt_class']) gt_score = np.concatenate( (gt_score1 * factor, gt_score2 * (1. - factor)), axis=0) result = copy.deepcopy(sample[0]) result['image'] = img result['gt_bbox'] = gt_bbox result['gt_score'] = gt_score result['gt_class'] = gt_class if 'is_crowd' in sample[0]: is_crowd1 = sample[0]['is_crowd'] is_crowd2 = sample[1]['is_crowd'] is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0) result['is_crowd'] = is_crowd if 'difficult' in sample[0]: is_difficult1 = sample[0]['difficult'] is_difficult2 = sample[1]['difficult'] is_difficult = np.concatenate( (is_difficult1, is_difficult2), axis=0) result['difficult'] = is_difficult return result @register_op class Mixup(BaseOperator): def __init__(self, alpha=1.5, beta=1.5): """ Mixup image and gt_bbox/gt_score Args: alpha (float): alpha parameter of the Beta distribution beta (float): beta parameter of the Beta distribution """ super(Mixup, self).__init__() self.alpha = alpha self.beta = beta if self.alpha <= 0.0: raise ValueError("alpha should be positive in {}".format(self)) if self.beta <= 0.0: raise ValueError("beta should be positive in {}".format(self)) def apply_image(self, img1, img2, factor): h = max(img1.shape[0], img2.shape[0]) w = max(img1.shape[1], img2.shape[1]) img = np.zeros((h, w, img1.shape[2]), 'float32') img[:img1.shape[0], :img1.shape[1], :] = \ img1.astype('float32') * factor img[:img2.shape[0], :img2.shape[1], :] += \ img2.astype('float32') * (1.0 - factor) return img.astype('uint8') def __call__(self, sample, context=None): if not isinstance(sample, Sequence): return sample assert len(sample) == 2, 'mixup needs two samples' factor = np.random.beta(self.alpha, self.beta) factor = max(0.0, min(1.0, factor)) if factor >= 1.0: return sample[0] if factor <= 0.0: return sample[1] im = self.apply_image(sample[0]['image'], sample[1]['image'], factor) result = copy.deepcopy(sample[0]) result['image'] = im # apply bbox and score if 'gt_bbox' in sample[0]: gt_bbox1 = sample[0]['gt_bbox'] gt_bbox2 = sample[1]['gt_bbox'] gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0) result['gt_bbox'] = gt_bbox if 'gt_class' in sample[0]: gt_class1 = sample[0]['gt_class'] gt_class2 = sample[1]['gt_class'] gt_class = np.concatenate((gt_class1, gt_class2), axis=0) result['gt_class'] = gt_class gt_score1 = np.ones_like(sample[0]['gt_class']) gt_score2 = np.ones_like(sample[1]['gt_class']) gt_score = np.concatenate( (gt_score1 * factor, gt_score2 * (1.
- factor)), axis=0) result['gt_score'] = gt_score.astype('float32') if 'is_crowd' in sample[0]: is_crowd1 = sample[0]['is_crowd'] is_crowd2 = sample[1]['is_crowd'] is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0) result['is_crowd'] = is_crowd if 'difficult' in sample[0]: is_difficult1 = sample[0]['difficult'] is_difficult2 = sample[1]['difficult'] is_difficult = np.concatenate( (is_difficult1, is_difficult2), axis=0) result['difficult'] = is_difficult if 'gt_ide' in sample[0]: gt_ide1 = sample[0]['gt_ide'] gt_ide2 = sample[1]['gt_ide'] gt_ide = np.concatenate((gt_ide1, gt_ide2), axis=0) result['gt_ide'] = gt_ide return result @register_op class NormalizeBox(BaseOperator): """Transform the bounding box's coordinates to [0,1].""" def __init__(self, retain_origin_box=False): super(NormalizeBox, self).__init__() self.retain_origin_box = retain_origin_box def apply(self, sample, context): im = sample['image'] if 'gt_bbox' in sample.keys(): if self.retain_origin_box: sample['origin_gt_bbox'] = sample['gt_bbox'].copy() sample['origin_gt_class'] = sample['gt_class'].copy() gt_bbox = sample['gt_bbox'] height, width, _ = im.shape for i in range(gt_bbox.shape[0]): gt_bbox[i][0] = gt_bbox[i][0] / width gt_bbox[i][1] = gt_bbox[i][1] / height gt_bbox[i][2] = gt_bbox[i][2] / width gt_bbox[i][3] = gt_bbox[i][3] / height sample['gt_bbox'] = gt_bbox if 'gt_keypoint' in sample.keys(): gt_keypoint = sample['gt_keypoint'] for i in range(gt_keypoint.shape[1]): if i % 2: gt_keypoint[:, i] = gt_keypoint[:, i] / height else: gt_keypoint[:, i] = gt_keypoint[:, i] / width sample['gt_keypoint'] = gt_keypoint return sample else: return sample @register_op class BboxXYXY2XYWH(BaseOperator): """ Convert bbox XYXY format to XYWH format. [x0, y0, x1, y1] -> [center_x, center_y, width, height] """ def __init__(self): super(BboxXYXY2XYWH, self).__init__() def apply(self, sample, context=None): if 'gt_bbox' in sample.keys(): bbox = sample['gt_bbox'] bbox[:, 2:4] = bbox[:, 2:4] - bbox[:, :2] bbox[:, :2] = bbox[:, :2] + bbox[:, 2:4] / 2. sample['gt_bbox'] = bbox return sample else: return sample @register_op class PadBox(BaseOperator): def __init__(self, num_max_boxes=50): """ Pad zeros to bboxes if number of bboxes is less than num_max_boxes. Args: num_max_boxes (int): the max number of bboxes """ self.num_max_boxes = num_max_boxes super(PadBox, self).__init__() def apply(self, sample, context=None): assert 'gt_bbox' in sample bbox = sample['gt_bbox'] gt_num = min(self.num_max_boxes, len(bbox)) num_max = self.num_max_boxes # fields = context['fields'] if context else [] pad_bbox = np.zeros((num_max, 4), dtype=np.float32) if gt_num > 0: pad_bbox[:gt_num, :] = bbox[:gt_num, :] sample['gt_bbox'] = pad_bbox if 'gt_class' in sample: pad_class = np.zeros((num_max, ), dtype=np.int32) if gt_num > 0: pad_class[:gt_num] = sample['gt_class'][:gt_num, 0] sample['gt_class'] = pad_class if 'gt_score' in sample: pad_score = np.zeros((num_max, ), dtype=np.float32) if gt_num > 0: pad_score[:gt_num] = sample['gt_score'][:gt_num, 0] sample['gt_score'] = pad_score # in training, for example in op ExpandImage, # the bbox and gt_class are expanded, but the difficult is not, # so judge by its length if 'difficult' in sample: pad_diff = np.zeros((num_max, ), dtype=np.int32) if gt_num > 0: pad_diff[:gt_num] = sample['difficult'][:gt_num, 0] sample['difficult'] = pad_diff if 'is_crowd' in sample: pad_crowd = np.zeros((num_max, ), dtype=np.int32) if gt_num > 0: pad_crowd[:gt_num] = sample['is_crowd'][:gt_num, 0] sample['is_crowd'] = pad_crowd if 'gt_ide' in sample: pad_ide = np.zeros((num_max, ), dtype=np.int32) if gt_num > 0: pad_ide[:gt_num] = sample['gt_ide'][:gt_num, 0] sample['gt_ide'] = pad_ide return sample @register_op class DebugVisibleImage(BaseOperator): """ In debug mode, visualize images according to `gt_bbox`. (Currently only supported when the image is not cropped or flipped.) """ def __init__(self, output_dir='output/debug', is_normalized=False): super(DebugVisibleImage, self).__init__() self.is_normalized = is_normalized self.output_dir = output_dir if not os.path.isdir(output_dir): os.makedirs(output_dir) if not isinstance(self.is_normalized, bool): raise TypeError("{}: input type is invalid.".format(self)) def apply(self, sample, context=None): image = Image.fromarray(sample['image'].astype(np.uint8)) out_file_name = '{:012d}.jpg'.format(sample['im_id'][0]) width = sample['w'] height = sample['h'] gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] draw = ImageDraw.Draw(image) for i in range(gt_bbox.shape[0]): if self.is_normalized: gt_bbox[i][0] = gt_bbox[i][0] * width gt_bbox[i][1] = gt_bbox[i][1] * height gt_bbox[i][2] = gt_bbox[i][2] * width gt_bbox[i][3] = gt_bbox[i][3] * height xmin, ymin, xmax, ymax = gt_bbox[i] draw.line( [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), (xmin, ymin)], width=2, fill='green') # draw label text = str(gt_class[i][0]) tw, th = imagedraw_textsize_c(draw, text) draw.rectangle( [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green') draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) if 'gt_keypoint' in sample.keys(): gt_keypoint = sample['gt_keypoint'] if self.is_normalized: for i in range(gt_keypoint.shape[1]): if i % 2: gt_keypoint[:, i] = gt_keypoint[:, i] * height else: gt_keypoint[:, i] = gt_keypoint[:, i] * width for i in range(gt_keypoint.shape[0]): keypoint = gt_keypoint[i] for j in range(int(keypoint.shape[0] / 2)): x1 = round(keypoint[2 * j]).astype(np.int32) y1 = round(keypoint[2 * j + 1]).astype(np.int32) draw.ellipse( (x1, y1, x1 + 5, y1 + 5), fill='green', outline='green') save_path = os.path.join(self.output_dir, out_file_name) image.save(save_path, quality=95) return sample @register_op class Pad(BaseOperator): def __init__(self, size=None, size_divisor=32,
pad_mode=0, offsets=None, fill_value=(127.5, 127.5, 127.5)): """ Pad image to a specified size or multiple of size_divisor. Args: size (int, Sequence): image target size, if None, pad to multiple of size_divisor, default None size_divisor (int): size divisor, default 32 pad_mode (int): pad mode, currently only supports four modes [-1, 0, 1, 2]. if -1, use specified offsets if 0, only pad to right and bottom. if 1, pad according to center. if 2, only pad left and top offsets (list): [offset_x, offset_y], specify offset while padding, only supported pad_mode=-1 fill_value (bool): rgb value of pad area, default (127.5, 127.5, 127.5) """ super(Pad, self).__init__() if not isinstance(size, (int, Sequence)): raise TypeError( "Type of target_size is invalid when random_size is True. \ Must be List, now is {}".format(type(size))) if isinstance(size, int): size = [size, size] assert pad_mode in [ -1, 0, 1, 2 ], 'currently only supports four modes [-1, 0, 1, 2]' if pad_mode == -1: assert offsets, 'if pad_mode is -1, offsets should not be None' self.size = size self.size_divisor = size_divisor self.pad_mode = pad_mode self.fill_value = fill_value self.offsets = offsets def apply_segm(self, segms, offsets, im_size, size): def _expand_poly(poly, x, y): expanded_poly = np.array(poly) expanded_poly[0::2] += x expanded_poly[1::2] += y return expanded_poly.tolist() def _expand_rle(rle, x, y, height, width, h, w): if 'counts' in rle and type(rle['counts']) == list: rle = mask_util.frPyObjects(rle, height, width) mask = mask_util.decode(rle) expanded_mask = np.full((h, w), 0).astype(mask.dtype) expanded_mask[y:y + height, x:x + width] = mask rle = mask_util.encode( np.array( expanded_mask, order='F', dtype=np.uint8)) return rle x, y = offsets height, width = im_size h, w = size expanded_segms = [] for segm in segms: if is_poly(segm): # Polygon format expanded_segms.append( [_expand_poly(poly, x, y) for poly in segm]) else: # RLE format import pycocotools.mask as mask_util expanded_segms.append( _expand_rle(segm, x, y, height, width, h, w)) return expanded_segms def apply_bbox(self, bbox, offsets): return bbox + np.array(offsets * 2, dtype=np.float32) def apply_keypoint(self, keypoints, offsets): n = len(keypoints[0]) // 2 return keypoints + np.array(offsets * n, dtype=np.float32) def apply_image(self, image, offsets, im_size, size): x, y = offsets im_h, im_w = im_size h, w = size canvas = np.ones((h, w, 3), dtype=np.float32) canvas *= np.array(self.fill_value, dtype=np.float32) canvas[y:y + im_h, x:x + im_w, :] = image.astype(np.float32) return canvas def apply(self, sample, context=None): im = sample['image'] im_h, im_w = im.shape[:2] if self.size: h, w = self.size assert ( im_h <= h and im_w <= w ), '(h, w) of target size should be greater than (im_h, im_w)' else: h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor) w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor) if h == im_h and w == im_w: sample['image'] = im.astype(np.float32) return sample if self.pad_mode == -1: offset_x, offset_y = self.offsets elif self.pad_mode == 0: offset_y, offset_x = 0, 0 elif self.pad_mode == 1: offset_y, offset_x = (h - im_h) // 2, (w - im_w) // 2 else: offset_y, offset_x = h - im_h, w - im_w offsets, im_size, size = [offset_x, offset_y], [im_h, im_w], [h, w] sample['image'] = self.apply_image(im, offsets, im_size, size) if self.pad_mode == 0: return sample if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], offsets) if 'gt_poly' in sample and 
len(sample['gt_poly']) > 0: sample['gt_poly'] = self.apply_segm(sample['gt_poly'], offsets, im_size, size) if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0: sample['gt_keypoint'] = self.apply_keypoint(sample['gt_keypoint'], offsets) if 'gt_segm' in sample and len(sample['gt_segm']) > 0: masks = [ cv2.copyMakeBorder( gt_segm, offset_y, h - (offset_y + im_h), offset_x, w - (offset_x + im_w), borderType=cv2.BORDER_CONSTANT, value=0) for gt_segm in sample['gt_segm'] ] sample['gt_segm'] = np.asarray(masks, dtype=np.uint8) return sample @register_op class Poly2Mask(BaseOperator): """ gt poly to mask annotations. Args: del_poly (bool): Whether to delete poly after generating mask. Default: False. """ def __init__(self, del_poly=False): super(Poly2Mask, self).__init__() import pycocotools.mask as maskUtils self.maskutils = maskUtils self.del_poly = del_poly def _poly2mask(self, mask_ann, img_h, img_w): if isinstance(mask_ann, list): # polygon -- a single object might consist of multiple parts # we merge all parts into one mask rle code rles = self.maskutils.frPyObjects(mask_ann, img_h, img_w) rle = self.maskutils.merge(rles) elif isinstance(mask_ann['counts'], list): # uncompressed RLE rle = self.maskutils.frPyObjects(mask_ann, img_h, img_w) else: # rle rle = mask_ann mask = self.maskutils.decode(rle) return mask def apply(self, sample, context=None): assert 'gt_poly' in sample im_h, im_w = sample['im_shape'] masks = [ self._poly2mask(gt_poly, im_h, im_w) for gt_poly in sample['gt_poly'] ] sample['gt_segm'] = np.asarray(masks).astype(np.uint8) if self.del_poly: del (sample['gt_poly']) return sample @register_op class AugmentHSV(BaseOperator): """ Augment the SV channel of image data. Args: fraction (float): the fraction for augment. Default: 0.5. is_bgr (bool): whether the image is BGR mode. Default: True. hgain (float): H channel gains sgain (float): S channel gains vgain (float): V channel gains """ def __init__(self, fraction=0.50, is_bgr=True, hgain=None, sgain=None, vgain=None): super(AugmentHSV, self).__init__() self.fraction = fraction self.is_bgr = is_bgr self.hgain = hgain self.sgain = sgain self.vgain = vgain self.use_hsvgain = False if hgain is None else True def apply(self, sample, context=None): img = sample['image'] if self.is_bgr: img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) else: img_hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV) if self.use_hsvgain: hsv_augs = np.random.uniform( -1, 1, 3) * [self.hgain, self.sgain, self.vgain] # random selection of h, s, v hsv_augs *= np.random.randint(0, 2, 3) img_hsv[..., 0] = (img_hsv[..., 0] + hsv_augs[0]) % 180 img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_augs[1], 0, 255) img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_augs[2], 0, 255) else: S = img_hsv[:, :, 1].astype(np.float32) V = img_hsv[:, :, 2].astype(np.float32) a = (random.random() * 2 - 1) * self.fraction + 1 S *= a if a > 1: np.clip(S, a_min=0, a_max=255, out=S) a = (random.random() * 2 - 1) * self.fraction + 1 V *= a if a > 1: np.clip(V, a_min=0, a_max=255, out=V) img_hsv[:, :, 1] = S.astype(np.uint8) img_hsv[:, :, 2] = V.astype(np.uint8) if self.is_bgr: cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) else: cv2.cvtColor(img_hsv, cv2.COLOR_HSV2RGB, dst=img) sample['image'] = img.astype(np.float32) return sample @register_op class Norm2PixelBbox(BaseOperator): """ Transform the bounding box's coornidates which is in [0,1] to pixels. 
""" def __init__(self): super(Norm2PixelBbox, self).__init__() def apply(self, sample, context=None): assert 'gt_bbox' in sample bbox = sample['gt_bbox'] height, width = sample['image'].shape[:2] bbox[:, 0::2] = bbox[:, 0::2] * width bbox[:, 1::2] = bbox[:, 1::2] * height sample['gt_bbox'] = bbox return sample @register_op class BboxCXCYWH2XYXY(BaseOperator): """ Convert bbox CXCYWH format to XYXY format. [center_x, center_y, width, height] -> [x0, y0, x1, y1] """ def __init__(self): super(BboxCXCYWH2XYXY, self).__init__() def apply(self, sample, context=None): assert 'gt_bbox' in sample bbox0 = sample['gt_bbox'] bbox = bbox0.copy() bbox[:, :2] = bbox0[:, :2] - bbox0[:, 2:4] / 2. bbox[:, 2:4] = bbox0[:, :2] + bbox0[:, 2:4] / 2. sample['gt_bbox'] = bbox return sample @register_op class RandomResizeCrop(BaseOperator): """Random resize and crop image and bboxes. Args: resizes (list): resize image to one of resizes. if keep_ratio is True and mode is 'long', resize the image's long side to the maximum of target_size, if keep_ratio is True and mode is 'short', resize the image's short side to the minimum of target_size. cropsizes (list): crop sizes after resize, [(min_crop_1, max_crop_1), ...] mode (str): resize mode, `long` or `short`. Details see resizes. prob (float): probability of this op. keep_ratio (bool): whether keep_ratio or not, default true interp (int): the interpolation method thresholds (list): iou thresholds for decide a valid bbox crop. num_attempts (int): number of tries before giving up. allow_no_crop (bool): allow return without actually cropping them. cover_all_box (bool): ensure all bboxes are covered in the final crop. is_mask_crop(bool): whether crop the segmentation. """ def __init__(self, resizes, cropsizes, prob=0.5, mode='short', keep_ratio=True, interp=cv2.INTER_LINEAR, num_attempts=3, cover_all_box=False, allow_no_crop=False, thresholds=[0.3, 0.5, 0.7], is_mask_crop=False, ioumode="iou"): super(RandomResizeCrop, self).__init__() self.resizes = resizes self.cropsizes = cropsizes self.prob = prob self.mode = mode self.ioumode = ioumode self.resizer = Resize(0, keep_ratio=keep_ratio, interp=interp) self.croper = RandomCrop( num_attempts=num_attempts, cover_all_box=cover_all_box, thresholds=thresholds, allow_no_crop=allow_no_crop, is_mask_crop=is_mask_crop) def _format_size(self, size): if isinstance(size, Integral): size = (size, size) return size def apply(self, sample, context=None): if random.random() < self.prob: _resize = self._format_size(random.choice(self.resizes)) _cropsize = self._format_size(random.choice(self.cropsizes)) sample = self._resize( self.resizer, sample, size=_resize, mode=self.mode, context=context) sample = self._random_crop( self.croper, sample, size=_cropsize, context=context) return sample @staticmethod def _random_crop(croper, sample, size, context=None): if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0: return sample self = croper h, w = sample['image'].shape[:2] gt_bbox = sample['gt_bbox'] cropsize = size min_crop = min(cropsize) max_crop = max(cropsize) thresholds = list(self.thresholds) np.random.shuffle(thresholds) for thresh in thresholds: found = False for _ in range(self.num_attempts): crop_h = random.randint(min_crop, min(h, max_crop)) crop_w = random.randint(min_crop, min(w, max_crop)) crop_y = random.randint(0, h - crop_h) crop_x = random.randint(0, w - crop_w) crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h] if self.ioumode == "iof": iou = self._gtcropiou_matrix( gt_bbox, np.array( [crop_box], 
dtype=np.float32)) elif self.ioumode == "iou": iou = self._iou_matrix( gt_bbox, np.array( [crop_box], dtype=np.float32)) if iou.max() < thresh: continue if self.cover_all_box and iou.min() < thresh: continue cropped_box, valid_ids = self._crop_box_with_center_constraint( gt_bbox, np.array( crop_box, dtype=np.float32)) if valid_ids.size > 0: found = True break if found: if self.is_mask_crop and 'gt_poly' in sample and len(sample[ 'gt_poly']) > 0: crop_polys = self.crop_segms( sample['gt_poly'], valid_ids, np.array( crop_box, dtype=np.int64), h, w) if [] in crop_polys: delete_id = list() valid_polys = list() for id, crop_poly in enumerate(crop_polys): if crop_poly == []: delete_id.append(id) else: valid_polys.append(crop_poly) valid_ids = np.delete(valid_ids, delete_id) if len(valid_polys) == 0: return sample sample['gt_poly'] = valid_polys else: sample['gt_poly'] = crop_polys if 'gt_segm' in sample: sample['gt_segm'] = self._crop_segm(sample['gt_segm'], crop_box) sample['gt_segm'] = np.take( sample['gt_segm'], valid_ids, axis=0) sample['image'] = self._crop_image(sample['image'], crop_box) sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0) sample['gt_class'] = np.take( sample['gt_class'], valid_ids, axis=0) if 'gt_score' in sample: sample['gt_score'] = np.take( sample['gt_score'], valid_ids, axis=0) if 'is_crowd' in sample: sample['is_crowd'] = np.take( sample['is_crowd'], valid_ids, axis=0) if 'gt_areas' in sample: sample['gt_areas'] = np.take( sample['gt_areas'], valid_ids, axis=0) if 'gt_joints' in sample: gt_joints = self._crop_joints(sample['gt_joints'], crop_box) sample['gt_joints'] = gt_joints[valid_ids] return sample return sample @staticmethod def _resize(resizer, sample, size, mode='short', context=None): self = resizer im = sample['image'] target_size = size if not isinstance(im, np.ndarray): raise TypeError("{}: image type is not numpy.".format(self)) if len(im.shape) != 3: raise ImageError('{}: image is not 3-dimensional.'.format(self)) # apply image im_shape = im.shape if self.keep_ratio: im_size_min = np.min(im_shape[0:2]) im_size_max = np.max(im_shape[0:2]) target_size_min = np.min(target_size) target_size_max = np.max(target_size) if mode == 'long': im_scale = min(target_size_min / im_size_min, target_size_max / im_size_max) else: im_scale = max(target_size_min / im_size_min, target_size_max / im_size_max) resize_h = int(im_scale * float(im_shape[0]) + 0.5) resize_w = int(im_scale * float(im_shape[1]) + 0.5) im_scale_x = im_scale im_scale_y = im_scale else: resize_h, resize_w = target_size im_scale_y = resize_h / im_shape[0] im_scale_x = resize_w / im_shape[1] im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) sample['image'] = im sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) if 'scale_factor' in sample: scale_factor = sample['scale_factor'] sample['scale_factor'] = np.asarray( [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], dtype=np.float32) else: sample['scale_factor'] = np.asarray( [im_scale_y, im_scale_x], dtype=np.float32) # apply bbox if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], [im_scale_x, im_scale_y], [resize_w, resize_h]) # apply polygon if 'gt_poly' in sample and len(sample['gt_poly']) > 0: sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2], [im_scale_x, im_scale_y]) # apply semantic if 'semantic' in sample and sample['semantic']: semantic = sample['semantic'] semantic = cv2.resize( semantic.astype('float32'), None, 
None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) semantic = np.asarray(semantic).astype('int32') semantic = np.expand_dims(semantic, 0) sample['semantic'] = semantic # apply gt_segm if 'gt_segm' in sample and len(sample['gt_segm']) > 0: masks = [ cv2.resize( gt_segm, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=cv2.INTER_NEAREST) for gt_segm in sample['gt_segm'] ] sample['gt_segm'] = np.asarray(masks).astype(np.uint8) if 'gt_joints' in sample: sample['gt_joints'] = self.apply_joints(sample['gt_joints'], [im_scale_x, im_scale_y], [resize_w, resize_h]) return sample @register_op class RandomSelect(BaseOperator): """ Randomly choose a transformation between transforms1 and transforms2, and the probability of choosing transforms1 is p. The code is based on https://github.com/facebookresearch/detr/blob/main/datasets/transforms.py """ def __init__(self, transforms1, transforms2, p=0.5): super(RandomSelect, self).__init__() self.transforms1 = Compose(transforms1) self.transforms2 = Compose(transforms2) self.p = p def apply(self, sample, context=None): if random.random() < self.p: return self.transforms1(sample) return self.transforms2(sample) @register_op class RandomSelects(BaseOperator): """ Randomly choose a transformation between transforms1 and transforms2, and the probability of choosing transforms1 is p. The code is based on https://github.com/facebookresearch/detr/blob/main/datasets/transforms.py """ def __init__(self, transforms_list, p=None): super(RandomSelects, self).__init__() if p is not None: assert isinstance(p, (list, tuple)) assert len(transforms_list) == len(p) else: assert len(transforms_list) > 0 self.transforms = [Compose(t) for t in transforms_list] self.p = p def apply(self, sample, context=None): if self.p is None: return random.choice(self.transforms)(sample) else: prob = random.random() for p, t in zip(self.p, self.transforms): if prob <= p: return t(sample) @register_op class RandomShortSideResize(BaseOperator): def __init__(self, short_side_sizes, max_size=None, interp=cv2.INTER_LINEAR, random_interp=False): """ Resize the image randomly according to the short side. If max_size is not None, the long side is scaled according to max_size. The whole process will be keep ratio. Args: short_side_sizes (list|tuple): Image target short side size. max_size (int): The size of the longest side of image after resize. interp (int): The interpolation method. random_interp (bool): Whether random select interpolation method. 
""" super(RandomShortSideResize, self).__init__() assert isinstance(short_side_sizes, Sequence), "short_side_sizes must be List or Tuple" self.short_side_sizes = short_side_sizes self.max_size = max_size self.interp = interp self.random_interp = random_interp self.interps = [ cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_AREA, cv2.INTER_CUBIC, cv2.INTER_LANCZOS4, ] def get_size_with_aspect_ratio(self, image_shape, size, max_size=None): h, w = image_shape max_clip = False if max_size is not None: min_original_size = float(min((w, h))) max_original_size = float(max((w, h))) if max_original_size / min_original_size * size > max_size: size = int(max_size * min_original_size / max_original_size) max_clip = True if (w <= h and w == size) or (h <= w and h == size): return (w, h) if w < h: ow = size oh = int(round(size * h / w)) if not max_clip else max_size else: oh = size ow = int(round(size * w / h)) if not max_clip else max_size return (ow, oh) def resize(self, sample, target_size, max_size=None, interp=cv2.INTER_LINEAR): im = sample['image'] if not isinstance(im, np.ndarray): raise TypeError("{}: image type is not numpy.".format(self)) if len(im.shape) != 3: raise ImageError('{}: image is not 3-dimensional.'.format(self)) target_size = self.get_size_with_aspect_ratio(im.shape[:2], target_size, max_size) im_scale_y, im_scale_x = target_size[1] / im.shape[0], target_size[ 0] / im.shape[1] sample['image'] = cv2.resize(im, target_size, interpolation=interp) sample['im_shape'] = np.asarray(target_size[::-1], dtype=np.float32) if 'scale_factor' in sample: scale_factor = sample['scale_factor'] sample['scale_factor'] = np.asarray( [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], dtype=np.float32) else: sample['scale_factor'] = np.asarray( [im_scale_y, im_scale_x], dtype=np.float32) # apply bbox if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'] = self.apply_bbox( sample['gt_bbox'], [im_scale_x, im_scale_y], target_size) # apply polygon if 'gt_poly' in sample and len(sample['gt_poly']) > 0: sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im.shape[:2], [im_scale_x, im_scale_y]) # apply semantic if 'semantic' in sample and sample['semantic']: semantic = sample['semantic'] semantic = cv2.resize( semantic.astype('float32'), target_size, interpolation=self.interp) semantic = np.asarray(semantic).astype('int32') semantic = np.expand_dims(semantic, 0) sample['semantic'] = semantic # apply gt_segm if 'gt_segm' in sample and len(sample['gt_segm']) > 0: masks = [ cv2.resize( gt_segm, target_size, interpolation=cv2.INTER_NEAREST) for gt_segm in sample['gt_segm'] ] sample['gt_segm'] = np.asarray(masks).astype(np.uint8) if 'gt_joints' in sample: sample['gt_joints'] = self.apply_joints( sample['gt_joints'], [im_scale_x, im_scale_y], target_size) # apply areas if 'gt_areas' in sample: sample['gt_areas'] = self.apply_area(sample['gt_areas'], [im_scale_x, im_scale_y]) return sample def apply_bbox(self, bbox, scale, size): im_scale_x, im_scale_y = scale resize_w, resize_h = size bbox[:, 0::2] *= im_scale_x bbox[:, 1::2] *= im_scale_y bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w) bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h) return bbox.astype('float32') def apply_joints(self, joints, scale, size): im_scale_x, im_scale_y = scale resize_w, resize_h = size joints[..., 0] *= im_scale_x joints[..., 1] *= im_scale_y # joints[joints[..., 0] >= resize_w, :] = 0 # joints[joints[..., 1] >= resize_h, :] = 0 # joints[joints[..., 0] < 0, :] = 0 # joints[joints[..., 1] < 0, :] = 
0 joints[..., 0] = np.clip(joints[..., 0], 0, resize_w) joints[..., 1] = np.clip(joints[..., 1], 0, resize_h) return joints def apply_area(self, area, scale): im_scale_x, im_scale_y = scale return area * im_scale_x * im_scale_y def apply_segm(self, segms, im_size, scale): def _resize_poly(poly, im_scale_x, im_scale_y): resized_poly = np.array(poly).astype('float32') resized_poly[0::2] *= im_scale_x resized_poly[1::2] *= im_scale_y return resized_poly.tolist() def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y): if 'counts' in rle and type(rle['counts']) == list: rle = mask_util.frPyObjects(rle, im_h, im_w) mask = mask_util.decode(rle) mask = cv2.resize( mask, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) return rle im_h, im_w = im_size im_scale_x, im_scale_y = scale resized_segms = [] for segm in segms: if is_poly(segm): # Polygon format resized_segms.append([ _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm ]) else: # RLE format import pycocotools.mask as mask_util resized_segms.append( _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y)) return resized_segms def apply(self, sample, context=None): target_size = random.choice(self.short_side_sizes) interp = random.choice( self.interps) if self.random_interp else self.interp return self.resize(sample, target_size, self.max_size, interp) @register_op class RandomShortSideRangeResize(RandomShortSideResize): def __init__(self, scales, interp=cv2.INTER_LINEAR, random_interp=False): """ Resize the image randomly according to the short side. If max_size is not None, the long side is scaled according to max_size. The whole process will be keep ratio. Args: short_side_sizes (list|tuple): Image target short side size. interp (int): The interpolation method. random_interp (bool): Whether random select interpolation method. """ super(RandomShortSideRangeResize, self).__init__(scales, None, interp, random_interp) assert isinstance(scales, Sequence), "short_side_sizes must be List or Tuple" self.scales = scales def random_sample(self, img_scales): img_scale_long = [max(s) for s in img_scales] img_scale_short = [min(s) for s in img_scales] long_edge = np.random.randint( min(img_scale_long), max(img_scale_long) + 1) short_edge = np.random.randint( min(img_scale_short), max(img_scale_short) + 1) img_scale = (long_edge, short_edge) return img_scale def apply(self, sample, context=None): long_edge, short_edge = self.random_sample(self.short_side_sizes) # print("target size:{}".format((long_edge, short_edge))) interp = random.choice( self.interps) if self.random_interp else self.interp return self.resize(sample, short_edge, long_edge, interp) @register_op class RandomSizeCrop(BaseOperator): """ Cut the image randomly according to `min_size` and `max_size` Args: min_size (int): Min size for edges of cropped image. max_size (int): Max size for edges of cropped image. If it is set to larger than length of the input image, the output will keep the origin length. keep_empty (bool): Whether to keep the cropped result with no object. If it is set to False, the no-object result will not be returned, replaced by the original input. 
""" def __init__(self, min_size, max_size, keep_empty=True): super(RandomSizeCrop, self).__init__() self.min_size = min_size self.max_size = max_size self.keep_empty = keep_empty from paddle.vision.transforms.functional import crop as paddle_crop self.paddle_crop = paddle_crop @staticmethod def get_crop_params(img_shape, output_size): """Get parameters for ``crop`` for a random crop. Args: img_shape (list|tuple): Image's height and width. output_size (list|tuple): Expected output size of the crop. Returns: tuple: params (i, j, h, w) to be passed to ``crop`` for random crop. """ h, w = img_shape th, tw = output_size if h + 1 < th or w + 1 < tw: raise ValueError( "Required crop size {} is larger then input image size {}". format((th, tw), (h, w))) if w == tw and h == th: return 0, 0, h, w i = random.randint(0, h - th + 1) j = random.randint(0, w - tw + 1) return i, j, th, tw def crop(self, sample, region): keep_index = None # apply bbox and check whether the cropped result is valid if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: croped_bbox = self.apply_bbox(sample['gt_bbox'], region) bbox = croped_bbox.reshape([-1, 2, 2]) area = (bbox[:, 1, :] - bbox[:, 0, :]).prod(axis=1) keep_index = np.where(area > 0)[0] if not self.keep_empty and len(keep_index) == 0: # When keep_empty is set to False, cropped with no-object will # not be used and return the origin content. return sample sample['gt_bbox'] = croped_bbox[keep_index] if len( keep_index) > 0 else np.zeros( [0, 4], dtype=np.float32) sample['gt_class'] = sample['gt_class'][keep_index] if len( keep_index) > 0 else np.zeros( [0, 1], dtype=np.float32) if 'gt_score' in sample: sample['gt_score'] = sample['gt_score'][keep_index] if len( keep_index) > 0 else np.zeros( [0, 1], dtype=np.float32) if 'is_crowd' in sample: sample['is_crowd'] = sample['is_crowd'][keep_index] if len( keep_index) > 0 else np.zeros( [0, 1], dtype=np.float32) if 'gt_areas' in sample: sample['gt_areas'] = np.take( sample['gt_areas'], keep_index, axis=0) image_shape = sample['image'].shape[:2] sample['image'] = self.paddle_crop(sample['image'], *region) sample['im_shape'] = np.array( sample['image'].shape[:2], dtype=np.float32) # apply polygon if 'gt_poly' in sample and len(sample['gt_poly']) > 0: sample['gt_poly'] = self.apply_segm(sample['gt_poly'], region, image_shape) sample['gt_poly'] = np.array(sample['gt_poly']) if keep_index is not None and len(keep_index) > 0: sample['gt_poly'] = sample['gt_poly'][keep_index] sample['gt_poly'] = sample['gt_poly'].tolist() # apply gt_segm if 'gt_segm' in sample and len(sample['gt_segm']) > 0: i, j, h, w = region sample['gt_segm'] = sample['gt_segm'][:, i:i + h, j:j + w] if keep_index is not None and len(keep_index) > 0: sample['gt_segm'] = sample['gt_segm'][keep_index] if 'gt_joints' in sample: gt_joints = self._crop_joints(sample['gt_joints'], region) sample['gt_joints'] = gt_joints if keep_index is not None: sample['gt_joints'] = sample['gt_joints'][keep_index] return sample def apply_bbox(self, bbox, region): i, j, h, w = region region_size = np.asarray([w, h]) crop_bbox = bbox - np.asarray([j, i, j, i]) crop_bbox = np.minimum(crop_bbox.reshape([-1, 2, 2]), region_size) crop_bbox = crop_bbox.clip(min=0) return crop_bbox.reshape([-1, 4]).astype('float32') def _crop_joints(self, joints, region): y1, x1, h, w = region x2 = x1 + w y2 = y1 + h # x1, y1, x2, y2 = crop joints[..., 0] -= x1 joints[..., 1] -= y1 joints[joints[..., 0] > w, :] = 0 joints[joints[..., 1] > h, :] = 0 joints[joints[..., 0] < 0, :] = 0 joints[joints[..., 1] 
< 0, :] = 0 return joints def apply_segm(self, segms, region, image_shape): def _crop_poly(segm, crop): xmin, ymin, xmax, ymax = crop crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin] crop_p = np.array(crop_coord).reshape(4, 2) crop_p = Polygon(crop_p) crop_segm = list() for poly in segm: poly = np.array(poly).reshape(len(poly) // 2, 2) polygon = Polygon(poly) if not polygon.is_valid: exterior = polygon.exterior multi_lines = exterior.intersection(exterior) polygons = shapely.ops.polygonize(multi_lines) polygon = MultiPolygon(polygons) multi_polygon = list() if isinstance(polygon, MultiPolygon): multi_polygon = copy.deepcopy(polygon) else: multi_polygon.append(copy.deepcopy(polygon)) for per_polygon in multi_polygon: inter = per_polygon.intersection(crop_p) if not inter: continue if isinstance(inter, (MultiPolygon, GeometryCollection)): for part in inter: if not isinstance(part, Polygon): continue part = np.squeeze( np.array(part.exterior.coords[:-1]).reshape(1, -1)) part[0::2] -= xmin part[1::2] -= ymin crop_segm.append(part.tolist()) elif isinstance(inter, Polygon): crop_poly = np.squeeze( np.array(inter.exterior.coords[:-1]).reshape(1, -1)) crop_poly[0::2] -= xmin crop_poly[1::2] -= ymin crop_segm.append(crop_poly.tolist()) else: continue return crop_segm def _crop_rle(rle, crop, height, width): if 'counts' in rle and type(rle['counts']) == list: rle = mask_util.frPyObjects(rle, height, width) mask = mask_util.decode(rle) mask = mask[crop[1]:crop[3], crop[0]:crop[2]] rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) return rle i, j, h, w = region crop = [j, i, j + w, i + h] height, width = image_shape crop_segms = [] for segm in segms: if is_poly(segm): import copy import shapely.ops from shapely.geometry import Polygon, MultiPolygon, GeometryCollection # Polygon format crop_segms.append(_crop_poly(segm, crop)) else: # RLE format import pycocotools.mask as mask_util crop_segms.append(_crop_rle(segm, crop, height, width)) return crop_segms def apply(self, sample, context=None): h = random.randint(self.min_size, min(sample['image'].shape[0], self.max_size)) w = random.randint(self.min_size, min(sample['image'].shape[1], self.max_size)) region = self.get_crop_params(sample['image'].shape[:2], [h, w]) return self.crop(sample, region) @register_op class WarpAffine(BaseOperator): def __init__(self, keep_res=False, pad=31, input_h=512, input_w=512, scale=0.4, shift=0.1, down_ratio=4): """WarpAffine Warp affine the image The code is based on https://github.com/xingyizhou/CenterNet/blob/master/src/lib/datasets/sample/ctdet.py """ super(WarpAffine, self).__init__() self.keep_res = keep_res self.pad = pad self.input_h = input_h self.input_w = input_w self.scale = scale self.shift = shift self.down_ratio = down_ratio def apply(self, sample, context=None): img = sample['image'] img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) h, w = img.shape[:2] if self.keep_res: # True in detection eval/infer input_h = (h | self.pad) + 1 input_w = (w | self.pad) + 1 s = np.array([input_w, input_h], dtype=np.float32) c = np.array([w // 2, h // 2], dtype=np.float32) else: # False in centertrack eval_mot/eval_mot s = max(h, w) * 1.0 input_h, input_w = self.input_h, self.input_w c = np.array([w / 2., h / 2.], dtype=np.float32) trans_input = get_affine_transform(c, s, 0, [input_w, input_h]) img = cv2.resize(img, (w, h)) inp = cv2.warpAffine( img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) sample['image'] = inp if not self.keep_res: out_h = input_h // self.down_ratio out_w = input_w 
// self.down_ratio trans_output = get_affine_transform(c, s, 0, [out_w, out_h]) sample.update({ 'center': c, 'scale': s, 'out_height': out_h, 'out_width': out_w, 'inp_height': input_h, 'inp_width': input_w, 'trans_input': trans_input, 'trans_output': trans_output, }) return sample @register_op class FlipWarpAffine(BaseOperator): def __init__(self, keep_res=False, pad=31, input_h=512, input_w=512, not_rand_crop=False, scale=0.4, shift=0.1, flip=0.5, is_scale=True, use_random=True, add_pre_img=False): """FlipWarpAffine 1. Random Crop 2. Flip the image horizontally 3. Warp affine the image 4. (Optional) Add previous image """ super(FlipWarpAffine, self).__init__() self.keep_res = keep_res self.pad = pad self.input_h = input_h self.input_w = input_w self.not_rand_crop = not_rand_crop self.scale = scale self.shift = shift self.flip = flip self.is_scale = is_scale self.use_random = use_random self.add_pre_img = add_pre_img def __call__(self, samples, context=None): if self.add_pre_img: assert isinstance(samples, Sequence) and len(samples) == 2 sample, pre_sample = samples[0], samples[1] else: sample = samples img = sample['image'] img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0: return sample h, w = img.shape[:2] flipped = 0 if self.keep_res: input_h = (h | self.pad) + 1 input_w = (w | self.pad) + 1 s = np.array([input_w, input_h], dtype=np.float32) c = np.array([w // 2, h // 2], dtype=np.float32) else: # centernet training default s = max(h, w) * 1.0 input_h, input_w = self.input_h, self.input_w c = np.array([w / 2., h / 2.], dtype=np.float32) if self.use_random: gt_bbox = sample['gt_bbox'] if not self.not_rand_crop: # centernet default s = s * np.random.choice(np.arange(0.6, 1.4, 0.1)) w_border = get_border(128, w) h_border = get_border(128, h) c[0] = np.random.randint(low=w_border, high=w - w_border) c[1] = np.random.randint(low=h_border, high=h - h_border) else: sf = self.scale cf = self.shift c[0] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf) c[1] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf) s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) if np.random.random() < self.flip: img = img[:, ::-1, :] c[0] = w - c[0] - 1 oldx1 = gt_bbox[:, 0].copy() oldx2 = gt_bbox[:, 2].copy() gt_bbox[:, 0] = w - oldx2 - 1 gt_bbox[:, 2] = w - oldx1 - 1 flipped = 1 sample['gt_bbox'] = gt_bbox trans_input = get_affine_transform(c, s, 0, [input_w, input_h]) inp = cv2.warpAffine( img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) if self.is_scale: inp = (inp.astype(np.float32) / 255.) sample['image'] = inp sample['center'] = c sample['scale'] = s if self.add_pre_img: sample['trans_input'] = trans_input # previous image, use same aug trans_input as current image pre_img = pre_sample['image'] pre_img = cv2.cvtColor(pre_img, cv2.COLOR_RGB2BGR) if flipped: pre_img = pre_img[:, ::-1, :].copy() pre_inp = cv2.warpAffine( pre_img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) if self.is_scale: pre_inp = (pre_inp.astype(np.float32) / 255.)
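# Note: the previous frame deliberately reuses the same trans_input (and the
# same horizontal flip) computed for the current frame, so the two warped
# images stay pixel-aligned and CenterTrack-style offsets between frames
# remain valid.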
sample['pre_image'] = pre_inp # if empty gt_bbox if 'gt_bbox' in pre_sample and len(pre_sample['gt_bbox']) == 0: return sample pre_gt_bbox = pre_sample['gt_bbox'] if flipped: pre_oldx1 = pre_gt_bbox[:, 0].copy() pre_oldx2 = pre_gt_bbox[:, 2].copy() pre_gt_bbox[:, 0] = w - pre_oldx1 - 1 pre_gt_bbox[:, 2] = w - pre_oldx2 - 1 sample['pre_gt_bbox'] = pre_gt_bbox sample['pre_gt_class'] = pre_sample['gt_class'] sample['pre_gt_track_id'] = pre_sample['gt_track_id'] del pre_sample return sample @register_op class CenterRandColor(BaseOperator): """Random color for CenterNet series models. Args: saturation (float): saturation settings. contrast (float): contrast settings. brightness (float): brightness settings. """ def __init__(self, saturation=0.4, contrast=0.4, brightness=0.4): super(CenterRandColor, self).__init__() self.saturation = saturation self.contrast = contrast self.brightness = brightness def apply_saturation(self, img, img_gray): alpha = 1. + np.random.uniform( low=-self.saturation, high=self.saturation) self._blend(alpha, img, img_gray[:, :, None]) return img def apply_contrast(self, img, img_gray): alpha = 1. + np.random.uniform(low=-self.contrast, high=self.contrast) img_mean = img_gray.mean() self._blend(alpha, img, img_mean) return img def apply_brightness(self, img, img_gray): alpha = 1 + np.random.uniform( low=-self.brightness, high=self.brightness) img *= alpha return img def _blend(self, alpha, img, img_mean): img *= alpha img_mean *= (1 - alpha) img += img_mean def apply(self, sample, context=None): functions = [ self.apply_brightness, self.apply_contrast, self.apply_saturation, ] img = sample['image'] img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) distortions = np.random.permutation(functions) for func in distortions: img = func(img, img_gray) sample['image'] = img if 'pre_image' in sample: pre_img = sample['pre_image'] pre_img_gray = cv2.cvtColor(pre_img, cv2.COLOR_BGR2GRAY) pre_distortions = np.random.permutation(functions) for func in pre_distortions: pre_img = func(pre_img, pre_img_gray) sample['pre_image'] = pre_img return sample @register_op class Mosaic(BaseOperator): """ Mosaic operator for image and gt_bboxes The code is based on https://github.com/Megvii-BaseDetection/YOLOX/blob/main/yolox/data/datasets/mosaicdetection.py 1. get mosaic coords 2. clip bbox and get mosaic_labels 3. random_affine augment 4. 
Mixup augment as copypaste (optional), not used in tiny/nano Args: prob (float): probability of using Mosaic, 1.0 as default input_dim (list[int]): input shape degrees (list[2]): the rotate range to apply, transform range is [min, max] translate (list[2]): the translate range to apply, transform range is [min, max] scale (list[2]): the scale range to apply, transform range is [min, max] shear (list[2]): the shear range to apply, transform range is [min, max] enable_mixup (bool): whether to enable Mixup or not mixup_prob (float): probability of using Mixup, 1.0 as default mixup_scale (list[float]): scale range of Mixup remove_outside_box (bool): whether to remove outside boxes, False as default in COCO dataset, True in MOT dataset """ def __init__(self, prob=1.0, input_dim=[640, 640], degrees=[-10, 10], translate=[-0.1, 0.1], scale=[0.1, 2], shear=[-2, 2], enable_mixup=True, mixup_prob=1.0, mixup_scale=[0.5, 1.5], remove_outside_box=False): super(Mosaic, self).__init__() self.prob = prob if isinstance(input_dim, Integral): input_dim = [input_dim, input_dim] self.input_dim = input_dim self.degrees = degrees self.translate = translate self.scale = scale self.shear = shear self.enable_mixup = enable_mixup self.mixup_prob = mixup_prob self.mixup_scale = mixup_scale self.remove_outside_box = remove_outside_box def get_mosaic_coords(self, mosaic_idx, xc, yc, w, h, input_h, input_w): # (x1, y1, x2, y2) means coords in large image, # small_coords means coords in small image in mosaic aug. if mosaic_idx == 0: # top left x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc small_coords = w - (x2 - x1), h - (y2 - y1), w, h elif mosaic_idx == 1: # top right x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc small_coords = 0, h - (y2 - y1), min(w, x2 - x1), h elif mosaic_idx == 2: # bottom left x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h) small_coords = w - (x2 - x1), 0, w, min(y2 - y1, h) elif mosaic_idx == 3: # bottom right x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(input_h * 2, yc + h) small_coords = 0, 0, min(w, x2 - x1), min(y2 - y1, h) return (x1, y1, x2, y2), small_coords def random_affine_augment(self, img, labels=[], input_dim=[640, 640], degrees=[-10, 10], scales=[0.1, 2], shears=[-2, 2], translates=[-0.1, 0.1]): # random rotation and scale degree = random.uniform(degrees[0], degrees[1]) scale = random.uniform(scales[0], scales[1]) assert scale > 0, "Argument scale should be positive."
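# Sketch of the 2x3 affine M assembled below (illustrative numbers): with
# degree=0 and scale=1, R = [[1, 0, 0], [0, 1, 0]], so after mixing in the
# shear rows M = [[1, tan(shear*pi/180), 0], [tan(shear*pi/180), 1, 0]]; the
# final column is then overwritten with the random translation in pixels.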
R = cv2.getRotationMatrix2D(angle=degree, center=(0, 0), scale=scale) M = np.ones([2, 3]) # random shear shear = random.uniform(shears[0], shears[1]) shear_x = math.tan(shear * math.pi / 180) shear_y = math.tan(shear * math.pi / 180) M[0] = R[0] + shear_y * R[1] M[1] = R[1] + shear_x * R[0] # random translation translate = random.uniform(translates[0], translates[1]) translation_x = translate * input_dim[0] translation_y = translate * input_dim[1] M[0, 2] = translation_x M[1, 2] = translation_y # warpAffine img = cv2.warpAffine( img, M, dsize=tuple(input_dim), borderValue=(114, 114, 114)) num_gts = len(labels) if num_gts > 0: # warp corner points corner_points = np.ones((4 * num_gts, 3)) corner_points[:, :2] = labels[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( 4 * num_gts, 2) # x1y1, x2y2, x1y2, x2y1 # apply affine transform corner_points = corner_points @M.T corner_points = corner_points.reshape(num_gts, 8) # create new boxes corner_xs = corner_points[:, 0::2] corner_ys = corner_points[:, 1::2] new_bboxes = np.concatenate((corner_xs.min(1), corner_ys.min(1), corner_xs.max(1), corner_ys.max(1))) new_bboxes = new_bboxes.reshape(4, num_gts).T # clip boxes new_bboxes[:, 0::2] = np.clip(new_bboxes[:, 0::2], 0, input_dim[0]) new_bboxes[:, 1::2] = np.clip(new_bboxes[:, 1::2], 0, input_dim[1]) labels[:, :4] = new_bboxes return img, labels def __call__(self, sample, context=None): if not isinstance(sample, Sequence): return sample assert len( sample) == 5, "Mosaic needs 5 samples, 4 for mosaic and 1 for mixup." if np.random.uniform(0., 1.) > self.prob: return sample[0] mosaic_gt_bbox, mosaic_gt_class, mosaic_is_crowd, mosaic_difficult = [], [], [], [] input_h, input_w = self.input_dim yc = int(random.uniform(0.5 * input_h, 1.5 * input_h)) xc = int(random.uniform(0.5 * input_w, 1.5 * input_w)) mosaic_img = np.full((input_h * 2, input_w * 2, 3), 114, dtype=np.uint8) # 1. get mosaic coords for mosaic_idx, sp in enumerate(sample[:4]): img = sp['image'] gt_bbox = sp['gt_bbox'] h0, w0 = img.shape[:2] scale = min(1. * input_h / h0, 1. * input_w / w0) img = cv2.resize( img, (int(w0 * scale), int(h0 * scale)), interpolation=cv2.INTER_LINEAR) (h, w, c) = img.shape[:3] # suffix l means large image, while s means small image in mosaic aug. (l_x1, l_y1, l_x2, l_y2), ( s_x1, s_y1, s_x2, s_y2) = self.get_mosaic_coords( mosaic_idx, xc, yc, w, h, input_h, input_w) mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2] padw, padh = l_x1 - s_x1, l_y1 - s_y1 # Normalized xywh to pixel xyxy format _gt_bbox = gt_bbox.copy() if len(gt_bbox) > 0: _gt_bbox[:, 0] = scale * gt_bbox[:, 0] + padw _gt_bbox[:, 1] = scale * gt_bbox[:, 1] + padh _gt_bbox[:, 2] = scale * gt_bbox[:, 2] + padw _gt_bbox[:, 3] = scale * gt_bbox[:, 3] + padh mosaic_gt_bbox.append(_gt_bbox) mosaic_gt_class.append(sp['gt_class']) if 'is_crowd' in sp: mosaic_is_crowd.append(sp['is_crowd']) if 'difficult' in sp: mosaic_difficult.append(sp['difficult']) # 2. 
clip bbox and get mosaic_labels([gt_bbox, gt_class, is_crowd]) if len(mosaic_gt_bbox): mosaic_gt_bbox = np.concatenate(mosaic_gt_bbox, 0) mosaic_gt_class = np.concatenate(mosaic_gt_class, 0) if mosaic_is_crowd: mosaic_is_crowd = np.concatenate(mosaic_is_crowd, 0) mosaic_labels = np.concatenate([ mosaic_gt_bbox, mosaic_gt_class.astype(mosaic_gt_bbox.dtype), mosaic_is_crowd.astype(mosaic_gt_bbox.dtype) ], 1) elif mosaic_difficult: mosaic_difficult = np.concatenate(mosaic_difficult, 0) mosaic_labels = np.concatenate([ mosaic_gt_bbox, mosaic_gt_class.astype(mosaic_gt_bbox.dtype), mosaic_difficult.astype(mosaic_gt_bbox.dtype) ], 1) else: mosaic_labels = np.concatenate([ mosaic_gt_bbox, mosaic_gt_class.astype(mosaic_gt_bbox.dtype) ], 1) if self.remove_outside_box: # for MOT dataset flag1 = mosaic_gt_bbox[:, 0] < 2 * input_w flag2 = mosaic_gt_bbox[:, 2] > 0 flag3 = mosaic_gt_bbox[:, 1] < 2 * input_h flag4 = mosaic_gt_bbox[:, 3] > 0 flag_all = flag1 * flag2 * flag3 * flag4 mosaic_labels = mosaic_labels[flag_all] else: mosaic_labels[:, 0] = np.clip(mosaic_labels[:, 0], 0, 2 * input_w) mosaic_labels[:, 1] = np.clip(mosaic_labels[:, 1], 0, 2 * input_h) mosaic_labels[:, 2] = np.clip(mosaic_labels[:, 2], 0, 2 * input_w) mosaic_labels[:, 3] = np.clip(mosaic_labels[:, 3], 0, 2 * input_h) else: mosaic_labels = np.zeros((1, 6)) # 3. random_affine augment mosaic_img, mosaic_labels = self.random_affine_augment( mosaic_img, mosaic_labels, input_dim=self.input_dim, degrees=self.degrees, translates=self.translate, scales=self.scale, shears=self.shear) # 4. Mixup augment as copypaste, https://arxiv.org/abs/2012.07177 # optional, not used (enable_mixup=False) in tiny/nano if (self.enable_mixup and not len(mosaic_labels) == 0 and random.random() < self.mixup_prob): sample_mixup = sample[4] mixup_img = sample_mixup['image'] if 'is_crowd' in sample_mixup: cp_labels = np.concatenate([ sample_mixup['gt_bbox'], sample_mixup['gt_class'].astype(mosaic_labels.dtype), sample_mixup['is_crowd'].astype(mosaic_labels.dtype) ], 1) elif 'difficult' in sample_mixup: cp_labels = np.concatenate([ sample_mixup['gt_bbox'], sample_mixup['gt_class'].astype(mosaic_labels.dtype), sample_mixup['difficult'].astype(mosaic_labels.dtype) ], 1) else: cp_labels = np.concatenate([ sample_mixup['gt_bbox'], sample_mixup['gt_class'].astype(mosaic_labels.dtype) ], 1) mosaic_img, mosaic_labels = self.mixup_augment( mosaic_img, mosaic_labels, self.input_dim, cp_labels, mixup_img) sample0 = sample[0] sample0['image'] = mosaic_img.astype(np.uint8) # cannot be float32 sample0['h'] = float(mosaic_img.shape[0]) sample0['w'] = float(mosaic_img.shape[1]) sample0['im_shape'][0] = sample0['h'] sample0['im_shape'][1] = sample0['w'] sample0['gt_bbox'] = mosaic_labels[:, :4].astype(np.float32) sample0['gt_class'] = mosaic_labels[:, 4:5].astype(np.float32) if 'is_crowd' in sample[0]: sample0['is_crowd'] = mosaic_labels[:, 5:6].astype(np.float32) if 'difficult' in sample[0]: sample0['difficult'] = mosaic_labels[:, 5:6].astype(np.float32) return sample0 def mixup_augment(self, origin_img, origin_labels, input_dim, cp_labels, img): jit_factor = random.uniform(*self.mixup_scale) FLIP = random.uniform(0, 1) > 0.5 if len(img.shape) == 3: cp_img = np.ones( (input_dim[0], input_dim[1], 3), dtype=np.uint8) * 114 else: cp_img = np.ones(input_dim, dtype=np.uint8) * 114 cp_scale_ratio = min(input_dim[0] / img.shape[0], input_dim[1] / img.shape[1]) resized_img = cv2.resize( img, (int(img.shape[1] * cp_scale_ratio), int(img.shape[0] * cp_scale_ratio)),
interpolation=cv2.INTER_LINEAR) cp_img[:int(img.shape[0] * cp_scale_ratio), :int(img.shape[1] * cp_scale_ratio)] = resized_img cp_img = cv2.resize(cp_img, (int(cp_img.shape[1] * jit_factor), int(cp_img.shape[0] * jit_factor))) cp_scale_ratio *= jit_factor if FLIP: cp_img = cp_img[:, ::-1, :] origin_h, origin_w = cp_img.shape[:2] target_h, target_w = origin_img.shape[:2] padded_img = np.zeros( (max(origin_h, target_h), max(origin_w, target_w), 3), dtype=np.uint8) padded_img[:origin_h, :origin_w] = cp_img x_offset, y_offset = 0, 0 if padded_img.shape[0] > target_h: y_offset = random.randint(0, padded_img.shape[0] - target_h - 1) if padded_img.shape[1] > target_w: x_offset = random.randint(0, padded_img.shape[1] - target_w - 1) padded_cropped_img = padded_img[y_offset:y_offset + target_h, x_offset: x_offset + target_w] # adjust boxes cp_bboxes_origin_np = cp_labels[:, :4].copy() cp_bboxes_origin_np[:, 0::2] = np.clip(cp_bboxes_origin_np[:, 0::2] * cp_scale_ratio, 0, origin_w) cp_bboxes_origin_np[:, 1::2] = np.clip(cp_bboxes_origin_np[:, 1::2] * cp_scale_ratio, 0, origin_h) if FLIP: cp_bboxes_origin_np[:, 0::2] = ( origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1]) cp_bboxes_transformed_np = cp_bboxes_origin_np.copy() if self.remove_outside_box: # for MOT dataset cp_bboxes_transformed_np[:, 0::2] -= x_offset cp_bboxes_transformed_np[:, 1::2] -= y_offset else: cp_bboxes_transformed_np[:, 0::2] = np.clip( cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w) cp_bboxes_transformed_np[:, 1::2] = np.clip( cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h) cls_labels = cp_labels[:, 4:5].copy() box_labels = cp_bboxes_transformed_np if cp_labels.shape[-1] == 6: crd_labels = cp_labels[:, 5:6].copy() labels = np.hstack((box_labels, cls_labels, crd_labels)) else: labels = np.hstack((box_labels, cls_labels)) if self.remove_outside_box: labels = labels[labels[:, 0] < target_w] labels = labels[labels[:, 2] > 0] labels = labels[labels[:, 1] < target_h] labels = labels[labels[:, 3] > 0] origin_labels = np.vstack((origin_labels, labels)) origin_img = origin_img.astype(np.float32) origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype( np.float32) return origin_img.astype(np.uint8), origin_labels @register_op class PadResize(BaseOperator): """ PadResize for image and gt_bbox Args: target_size (list[int]): input shape fill_value (float): pixel value of padded image """ def __init__(self, target_size, fill_value=114): super(PadResize, self).__init__() if isinstance(target_size, Integral): target_size = [target_size, target_size] self.target_size = target_size self.fill_value = fill_value def _resize(self, img, bboxes, labels): ratio = min(self.target_size[0] / img.shape[0], self.target_size[1] / img.shape[1]) w, h = int(img.shape[1] * ratio), int(img.shape[0] * ratio) resized_img = cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR) if len(bboxes) > 0: bboxes *= ratio mask = np.minimum(bboxes[:, 2] - bboxes[:, 0], bboxes[:, 3] - bboxes[:, 1]) > 1 bboxes = bboxes[mask] labels = labels[mask] return resized_img, bboxes, labels def _pad(self, img): h, w, _ = img.shape if h == self.target_size[0] and w == self.target_size[1]: return img padded_img = np.full( (self.target_size[0], self.target_size[1], 3), self.fill_value, dtype=np.uint8) padded_img[:h, :w] = img return padded_img def apply(self, sample, context=None): image = sample['image'] bboxes = sample['gt_bbox'] labels = sample['gt_class'] image, bboxes, labels = self._resize(image, bboxes, labels) sample['image'] =
self._pad(image).astype(np.float32) sample['gt_bbox'] = bboxes sample['gt_class'] = labels return sample @register_op class RandomShift(BaseOperator): """ Randomly shift image Args: prob (float): probability to do random shift. max_shift (int): max shift pixels filter_thr (int): filter gt bboxes if one side is smaller than this """ def __init__(self, prob=0.5, max_shift=32, filter_thr=1): super(RandomShift, self).__init__() self.prob = prob self.max_shift = max_shift self.filter_thr = filter_thr def calc_shift_coor(self, im_h, im_w, shift_h, shift_w): return [ max(0, shift_w), max(0, shift_h), min(im_w, im_w + shift_w), min(im_h, im_h + shift_h) ] def apply(self, sample, context=None): if random.random() > self.prob: return sample im = sample['image'] gt_bbox = sample['gt_bbox'] gt_class = sample['gt_class'] im_h, im_w = im.shape[:2] shift_h = random.randint(-self.max_shift, self.max_shift) shift_w = random.randint(-self.max_shift, self.max_shift) gt_bbox[:, 0::2] += shift_w gt_bbox[:, 1::2] += shift_h gt_bbox[:, 0::2] = np.clip(gt_bbox[:, 0::2], 0, im_w) gt_bbox[:, 1::2] = np.clip(gt_bbox[:, 1::2], 0, im_h) gt_bbox_h = gt_bbox[:, 2] - gt_bbox[:, 0] gt_bbox_w = gt_bbox[:, 3] - gt_bbox[:, 1] keep = (gt_bbox_w > self.filter_thr) & (gt_bbox_h > self.filter_thr) if not keep.any(): return sample gt_bbox = gt_bbox[keep] gt_class = gt_class[keep] # shift image coor_new = self.calc_shift_coor(im_h, im_w, shift_h, shift_w) # shift frame to the opposite direction coor_old = self.calc_shift_coor(im_h, im_w, -shift_h, -shift_w) canvas = np.zeros_like(im) canvas[coor_new[1]:coor_new[3], coor_new[0]:coor_new[2]] \ = im[coor_old[1]:coor_old[3], coor_old[0]:coor_old[2]] sample['image'] = canvas sample['gt_bbox'] = gt_bbox sample['gt_class'] = gt_class return sample @register_op class StrongAugImage(BaseOperator): def __init__(self, transforms): super(StrongAugImage, self).__init__() self.transforms = Compose(transforms) def apply(self, sample, context=None): im = sample im['image'] = sample['image'].astype('uint8') results = self.transforms(im) sample['image'] = results['image'].astype('uint8') return sample @register_op class RandomColorJitter(BaseOperator): def __init__(self, prob=0.8, brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1): super(RandomColorJitter, self).__init__() self.prob = prob self.brightness = brightness self.contrast = contrast self.saturation = saturation self.hue = hue def apply(self, sample, context=None): if np.random.uniform(0, 1) < self.prob: from paddle.vision.transforms import ColorJitter transform = ColorJitter(self.brightness, self.contrast, self.saturation, self.hue) sample['image'] = transform(sample['image'].astype(np.uint8)) sample['image'] = sample['image'].astype(np.float32) return sample @register_op class RandomGrayscale(BaseOperator): def __init__(self, prob=0.2): super(RandomGrayscale, self).__init__() self.prob = prob def apply(self, sample, context=None): if np.random.uniform(0, 1) < self.prob: from paddle.vision.transforms import Grayscale transform = Grayscale(num_output_channels=3) sample['image'] = transform(sample['image']) return sample @register_op class RandomGaussianBlur(BaseOperator): def __init__(self, prob=0.5, sigma=[0.1, 2.0]): super(RandomGaussianBlur, self).__init__() self.prob = prob self.sigma = sigma def apply(self, sample, context=None): if np.random.uniform(0, 1) < self.prob: sigma = np.random.uniform(self.sigma[0], self.sigma[1]) im = cv2.GaussianBlur(sample['image'], (23, 23), sigma) sample['image'] = im return sample @register_op 
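# A minimal usage sketch for the strong-augmentation ops above (illustrative
# only, kept in comments so that the @register_op decorator on the preceding
# line still binds to the class below; the sample dict layout follows the
# ppdet convention used throughout this file):
#
#     import numpy as np
#     sample = {'image': np.random.randint(0, 256, (480, 640, 3)).astype(np.uint8)}
#     for op in [RandomColorJitter(prob=1.0), RandomGrayscale(prob=1.0),
#                RandomGaussianBlur(prob=1.0)]:
#         sample = op(sample)  # BaseOperator.__call__ dispatches to apply()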
class RandomErasing(BaseOperator): def __init__(self, prob=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3), value=0, inplace=False): super(RandomErasing, self).__init__() assert isinstance(scale, (tuple, list)), "scale should be a tuple or list" assert (scale[0] >= 0 and scale[1] <= 1 and scale[0] <= scale[1] ), "scale should be of kind (min, max) and in range [0, 1]" assert isinstance(ratio, (tuple, list)), "ratio should be a tuple or list" assert (ratio[0] >= 0 and ratio[0] <= ratio[1]), "ratio should be of kind (min, max)" assert isinstance( value, (Number, str, tuple, list)), "value should be a number, tuple, list or str" if isinstance(value, str) and value != "random": raise ValueError("value must be 'random' when type is str") self.prob = prob self.scale = scale self.ratio = ratio self.value = value self.inplace = inplace def _erase(self, img, i, j, h, w, v, inplace=False): if not inplace: img = img.copy() img[i:i + h, j:j + w, ...] = v return img def _get_param(self, img, scale, ratio, value): shape = np.asarray(img).astype(np.uint8).shape h, w, c = shape[-3], shape[-2], shape[-1] img_area = h * w log_ratio = np.log(ratio) for _ in range(1): erase_area = np.random.uniform(*scale) * img_area aspect_ratio = np.exp(np.random.uniform(*log_ratio)) erase_h = int(round(np.sqrt(erase_area * aspect_ratio))) erase_w = int(round(np.sqrt(erase_area / aspect_ratio))) if erase_h >= h or erase_w >= w: continue if value is None: v = np.random.normal(size=[erase_h, erase_w, c]) * 255 else: v = np.array(value)[None, None, :] top = np.random.randint(0, h - erase_h + 1) left = np.random.randint(0, w - erase_w + 1) return top, left, erase_h, erase_w, v return 0, 0, h, w, img def apply(self, sample, context=None): if random.random() < self.prob: if isinstance(self.value, Number): value = [self.value] elif isinstance(self.value, str): value = None else: value = self.value if value is not None and not (len(value) == 1 or len(value) == 3): raise ValueError( "Value should be a single number or a sequence with length equals to image's channel." ) im = sample['image'] top, left, erase_h, erase_w, v = self._get_param(im, self.scale, self.ratio, value) im = self._erase(im, top, left, erase_h, erase_w, v, self.inplace) sample['image'] = im return sample @register_op class RandomErasingCrop(BaseOperator): def __init__(self): super(RandomErasingCrop, self).__init__() self.transform1 = RandomErasing( prob=0.7, scale=(0.05, 0.2), ratio=(0.3, 3.3), value="random") self.transform2 = RandomErasing( prob=0.5, scale=(0.05, 0.2), ratio=(0.1, 6), value="random") self.transform3 = RandomErasing( prob=0.3, scale=(0.05, 0.2), ratio=(0.05, 8), value="random") def apply(self, sample, context=None): sample = self.transform1(sample) sample = self.transform2(sample) sample = self.transform3(sample) return sample ================================================ FILE: ppdet/data/transform/rotated_operators.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import print_function from __future__ import division try: from collections.abc import Sequence except Exception: from collections import Sequence from numbers import Number, Integral import os import cv2 import numpy as np import math import copy from PIL import Image, ImageDraw from .operators import register_op, BaseOperator, ImageError from ppdet.modeling.rbox_utils import poly2rbox_le135_np, poly2rbox_oc_np, rbox2poly_np from ppdet.utils.logger import setup_logger from ppdet.utils.compact import imagedraw_textsize_c logger = setup_logger(__name__) @register_op class RRotate(BaseOperator): """ Rotate Image, Polygon, Box Args: scale (float): rotate scale angle (float): rotate angle fill_value (int, tuple): fill color auto_bound (bool): whether auto bound or not """ def __init__(self, scale=1.0, angle=0., fill_value=0., auto_bound=True): super(RRotate, self).__init__() self.scale = scale self.angle = angle self.fill_value = fill_value self.auto_bound = auto_bound def get_rotated_matrix(self, angle, scale, h, w): center = ((w - 1) * 0.5, (h - 1) * 0.5) matrix = cv2.getRotationMatrix2D(center, -angle, scale) # calculate the new size cos = np.abs(matrix[0, 0]) sin = np.abs(matrix[0, 1]) new_w = h * sin + w * cos new_h = h * cos + w * sin # calculate offset n_w = int(np.round(new_w)) n_h = int(np.round(new_h)) if self.auto_bound: ratio = min(w / n_w, h / n_h) matrix = cv2.getRotationMatrix2D(center, -angle, ratio) else: matrix[0, 2] += (new_w - w) * 0.5 matrix[1, 2] += (new_h - h) * 0.5 w = n_w h = n_h return matrix, h, w def get_rect_from_pts(self, pts, h, w): """ get minimum rectangle of points """ assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct' min_x, min_y = np.min(pts[:, 0::2], axis=1), np.min(pts[:, 1::2], axis=1) max_x, max_y = np.max(pts[:, 0::2], axis=1), np.max(pts[:, 1::2], axis=1) min_x, min_y = np.clip(min_x, 0, w), np.clip(min_y, 0, h) max_x, max_y = np.clip(max_x, 0, w), np.clip(max_y, 0, h) boxes = np.stack([min_x, min_y, max_x, max_y], axis=-1) return boxes def apply_image(self, image, matrix, h, w): return cv2.warpAffine( image, matrix, (w, h), borderValue=self.fill_value) def apply_pts(self, pts, matrix, h, w): assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct' # n is number of samples and m is two times the number of points due to (x, y) _, m = pts.shape # transpose points pts_ = pts.reshape(-1, 2).T # pad 1 to convert the points to homogeneous coordinates padding = np.ones((1, pts_.shape[1]), pts.dtype) rotated_pts = np.matmul(matrix, np.concatenate((pts_, padding), axis=0)) return rotated_pts[:2, :].T.reshape(-1, m) def apply(self, sample, context=None): image = sample['image'] h, w = image.shape[:2] matrix, h, w = self.get_rotated_matrix(self.angle, self.scale, h, w) sample['image'] = self.apply_image(image, matrix, h, w) polys = sample['gt_poly'] # TODO: segment or keypoint to be processed if len(polys) > 0: pts = self.apply_pts(polys, matrix, h, w) sample['gt_poly'] = pts sample['gt_bbox'] = self.get_rect_from_pts(pts, h, w) return sample @register_op class RandomRRotate(BaseOperator): """ Random Rotate Image Args: scale (float, tuple, list): rotate scale scale_mode (str): mode of scale, [range, value, None] angle (float, tuple, list): rotate angle angle_mode (str): mode of angle, [range, value, None] fill_value (float, tuple, list): fill value rotate_prob (float): probability of rotation auto_bound (bool): whether auto
bound or not """ def __init__(self, scale=1.0, scale_mode=None, angle=0., angle_mode=None, fill_value=0., rotate_prob=1.0, auto_bound=True): super(RandomRRotate, self).__init__() self.scale = scale self.scale_mode = scale_mode self.angle = angle self.angle_mode = angle_mode self.fill_value = fill_value self.rotate_prob = rotate_prob self.auto_bound = auto_bound def get_angle(self, angle, angle_mode): assert not angle_mode or angle_mode in [ 'range', 'value' ], 'angle mode should be in [range, value, None]' if not angle_mode: return angle elif angle_mode == 'range': low, high = angle return np.random.rand() * (high - low) + low elif angle_mode == 'value': return np.random.choice(angle) def get_scale(self, scale, scale_mode): assert not scale_mode or scale_mode in [ 'range', 'value' ], 'scale mode should be in [range, value, None]' if not scale_mode: return scale elif scale_mode == 'range': low, high = scale return np.random.rand() * (high - low) + low elif scale_mode == 'value': return np.random.choice(scale) def apply(self, sample, context=None): if np.random.rand() > self.rotate_prob: return sample angle = self.get_angle(self.angle, self.angle_mode) scale = self.get_scale(self.scale, self.scale_mode) rotator = RRotate(scale, angle, self.fill_value, self.auto_bound) return rotator(sample) @register_op class Poly2RBox(BaseOperator): """ Polygon to Rotated Box, using new OpenCV definition since 4.5.1 Args: filter_threshold (int, float): threshold to filter annotations filter_mode (str): filter mode, ['area', 'edge'] rbox_type (str): rbox type, ['le135', 'oc'] """ def __init__(self, filter_threshold=4, filter_mode=None, rbox_type='le135'): super(Poly2RBox, self).__init__() self.filter_fn = lambda size: self.filter(size, filter_threshold, filter_mode) self.rbox_fn = poly2rbox_le135_np if rbox_type == 'le135' else poly2rbox_oc_np def filter(self, size, threshold, mode): if mode == 'area': if size[0] * size[1] < threshold: return True elif mode == 'edge': if min(size) < threshold: return True return False def get_rbox(self, polys): valid_ids, rboxes, bboxes = [], [], [] for i, poly in enumerate(polys): cx, cy, w, h, angle = self.rbox_fn(poly) if self.filter_fn((w, h)): continue rboxes.append(np.array([cx, cy, w, h, angle], dtype=np.float32)) valid_ids.append(i) xmin, ymin = min(poly[0::2]), min(poly[1::2]) xmax, ymax = max(poly[0::2]), max(poly[1::2]) bboxes.append(np.array([xmin, ymin, xmax, ymax], dtype=np.float32)) if len(valid_ids) == 0: rboxes = np.zeros((0, 5), dtype=np.float32) bboxes = np.zeros((0, 4), dtype=np.float32) else: rboxes = np.stack(rboxes) bboxes = np.stack(bboxes) return rboxes, bboxes, valid_ids def apply(self, sample, context=None): rboxes, bboxes, valid_ids = self.get_rbox(sample['gt_poly']) sample['gt_rbox'] = rboxes sample['gt_bbox'] = bboxes for k in ['gt_class', 'gt_score', 'gt_poly', 'is_crowd', 'difficult']: if k in sample: sample[k] = sample[k][valid_ids] return sample @register_op class Poly2Array(BaseOperator): """ convert gt_poly to np.array for rotated bboxes """ def __init__(self): super(Poly2Array, self).__init__() def apply(self, sample, context=None): if 'gt_poly' in sample: sample['gt_poly'] = np.array( sample['gt_poly'], dtype=np.float32).reshape((-1, 8)) return sample @register_op class RResize(BaseOperator): def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR): """ Resize image to target size. 
if keep_ratio is True, resize the image's long side to the maximum of target_size if keep_ratio is False, resize the image to target size (h, w) Args: target_size (int|list): image target size keep_ratio (bool): whether keep_ratio or not, default true interp (int): the interpolation method """ super(RResize, self).__init__() self.keep_ratio = keep_ratio self.interp = interp if not isinstance(target_size, (Integral, Sequence)): raise TypeError( "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". format(type(target_size))) if isinstance(target_size, Integral): target_size = [target_size, target_size] self.target_size = target_size def apply_image(self, image, scale): im_scale_x, im_scale_y = scale return cv2.resize( image, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) def apply_pts(self, pts, scale, size): im_scale_x, im_scale_y = scale resize_w, resize_h = size pts[:, 0::2] *= im_scale_x pts[:, 1::2] *= im_scale_y pts[:, 0::2] = np.clip(pts[:, 0::2], 0, resize_w) pts[:, 1::2] = np.clip(pts[:, 1::2], 0, resize_h) return pts def apply(self, sample, context=None): """ Resize the image numpy. """ im = sample['image'] if not isinstance(im, np.ndarray): raise TypeError("{}: image type is not numpy.".format(self)) if len(im.shape) != 3: raise ImageError('{}: image is not 3-dimensional.'.format(self)) # apply image im_shape = im.shape if self.keep_ratio: im_size_min = np.min(im_shape[0:2]) im_size_max = np.max(im_shape[0:2]) target_size_min = np.min(self.target_size) target_size_max = np.max(self.target_size) im_scale = min(target_size_min / im_size_min, target_size_max / im_size_max) resize_h = im_scale * float(im_shape[0]) resize_w = im_scale * float(im_shape[1]) im_scale_x = im_scale im_scale_y = im_scale else: resize_h, resize_w = self.target_size im_scale_y = resize_h / im_shape[0] im_scale_x = resize_w / im_shape[1] im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) sample['image'] = im.astype(np.float32) sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) if 'scale_factor' in sample: scale_factor = sample['scale_factor'] sample['scale_factor'] = np.asarray( [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], dtype=np.float32) else: sample['scale_factor'] = np.asarray( [im_scale_y, im_scale_x], dtype=np.float32) # apply bbox if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'], [im_scale_x, im_scale_y], [resize_w, resize_h]) # apply polygon if 'gt_poly' in sample and len(sample['gt_poly']) > 0: sample['gt_poly'] = self.apply_pts(sample['gt_poly'], [im_scale_x, im_scale_y], [resize_w, resize_h]) return sample @register_op class RandomRFlip(BaseOperator): def __init__(self, prob=0.5): """ Args: prob (float): the probability of flipping image """ super(RandomRFlip, self).__init__() self.prob = prob if not (isinstance(self.prob, float)): raise TypeError("{}: input type is invalid.".format(self)) def apply_image(self, image): return image[:, ::-1, :] def apply_pts(self, pts, width): oldx = pts[:, 0::2].copy() pts[:, 0::2] = width - oldx - 1 return pts def apply(self, sample, context=None): """Flip the image and bounding box. Operators: 1. Flip the image numpy. 2. Transform the bboxes' x coordinates. (Must judge whether the coordinates are normalized!) 3. Transform the segmentations' x coordinates. (Must judge whether the coordinates are normalized!) Output: sample: the image, bounding box and segmentation part in sample are flipped.
""" if np.random.uniform(0, 1) < self.prob: im = sample['image'] height, width = im.shape[:2] im = self.apply_image(im) if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'], width) if 'gt_poly' in sample and len(sample['gt_poly']) > 0: sample['gt_poly'] = self.apply_pts(sample['gt_poly'], width) sample['flipped'] = True sample['image'] = im return sample @register_op class VisibleRBox(BaseOperator): """ In debug mode, visualize images according to `gt_box`. (Currently only supported when not cropping and flipping image.) """ def __init__(self, output_dir='debug'): super(VisibleRBox, self).__init__() self.output_dir = output_dir if not os.path.isdir(output_dir): os.makedirs(output_dir) def apply(self, sample, context=None): image = Image.fromarray(sample['image'].astype(np.uint8)) out_file_name = '{:012d}.jpg'.format(sample['im_id'][0]) width = sample['w'] height = sample['h'] # gt_poly = sample['gt_rbox'] gt_poly = sample['gt_poly'] gt_class = sample['gt_class'] draw = ImageDraw.Draw(image) for i in range(gt_poly.shape[0]): x1, y1, x2, y2, x3, y3, x4, y4 = gt_poly[i] draw.line( [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)], width=2, fill='green') # draw label xmin = min(x1, x2, x3, x4) ymin = min(y1, y2, y3, y4) text = str(gt_class[i][0]) tw, th = imagedraw_textsize_c(draw, text) draw.rectangle( [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green') draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) if 'gt_keypoint' in sample.keys(): gt_keypoint = sample['gt_keypoint'] if self.is_normalized: for i in range(gt_keypoint.shape[1]): if i % 2: gt_keypoint[:, i] = gt_keypoint[:, i] * height else: gt_keypoint[:, i] = gt_keypoint[:, i] * width for i in range(gt_keypoint.shape[0]): keypoint = gt_keypoint[i] for j in range(int(keypoint.shape[0] / 2)): x1 = round(keypoint[2 * j]).astype(np.int32) y1 = round(keypoint[2 * j + 1]).astype(np.int32) draw.ellipse( (x1, y1, x1 + 5, y1 + 5), fill='green', outline='green') save_path = os.path.join(self.output_dir, out_file_name) image.save(save_path, quality=95) return sample @register_op class Rbox2Poly(BaseOperator): """ Convert rbbox format to poly format. """ def __init__(self): super(Rbox2Poly, self).__init__() def apply(self, sample, context=None): assert 'gt_rbox' in sample assert sample['gt_rbox'].shape[1] == 5 rboxes = sample['gt_rbox'] polys = rbox2poly_np(rboxes) sample['gt_poly'] = polys xmin, ymin = polys[:, 0::2].min(1), polys[:, 1::2].min(1) xmax, ymax = polys[:, 0::2].max(1), polys[:, 1::2].max(1) sample['gt_bbox'] = np.stack([xmin, ymin, xmin, ymin], axis=1) return sample ================================================ FILE: ppdet/data/utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import paddle import numbers import numpy as np try: from collections.abc import Sequence, Mapping except: from collections import Sequence, Mapping def default_collate_fn(batch): """ Default batch collating function for :code:`paddle.io.DataLoader`. It takes input data as a list of samples, where each element in the list is the data of one sample; sample data should be composed of list, dictionary, string, number or numpy array. This function parses input data recursively and stacks number, numpy array and paddle.Tensor data into batch data. e.g. for following input data: [{'image': np.array(shape=[3, 224, 224]), 'label': 1}, {'image': np.array(shape=[3, 224, 224]), 'label': 3}, {'image': np.array(shape=[3, 224, 224]), 'label': 4}, {'image': np.array(shape=[3, 224, 224]), 'label': 5},] This default collate function zips each number and numpy array field together and stacks each field as the batch field as follows: {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])} Args: batch(list of sample data): batch should be a list of sample data. Returns: Batched data: each number, numpy array and paddle.Tensor in the input data, batched together. """ sample = batch[0] if isinstance(sample, np.ndarray): batch = np.stack(batch, axis=0) return batch elif isinstance(sample, numbers.Number): batch = np.array(batch) return batch elif isinstance(sample, (str, bytes)): return batch elif isinstance(sample, Mapping): return { key: default_collate_fn([d[key] for d in batch]) for key in sample } elif isinstance(sample, Sequence): sample_fields_num = len(sample) if not all(len(sample) == sample_fields_num for sample in iter(batch)): raise RuntimeError( "fields number not the same among samples in a batch") return [default_collate_fn(fields) for fields in zip(*batch)] raise TypeError("batch data can only contain: tensor, numpy.ndarray, " "dict, list, number, but got {}".format(type(sample))) ================================================ FILE: ppdet/engine/__init__.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import trainer from .trainer import * from . import trainer_cot from .trainer_cot import * from . import callbacks from .callbacks import * from . import env from .env import * __all__ = trainer.__all__ \ + callbacks.__all__ \ + env.__all__ from . import tracker from .tracker import * __all__ = __all__ + tracker.__all__ from . import trainer_ssod from .trainer_ssod import * __all__ = __all__ + trainer_ssod.__all__ ================================================ FILE: ppdet/engine/callbacks.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import gc import sys import datetime import six import copy import json import paddle import paddle.distributed as dist from ppdet.utils.checkpoint import save_model, save_semi_model, save_model_info, update_train_results from ppdet.metrics import get_infer_results from ppdet.utils.logger import setup_logger logger = setup_logger('ppdet.engine') __all__ = [ 'Callback', 'ComposeCallback', 'LogPrinter', 'Checkpointer', 'VisualDLWriter', 'SniperProposalsGenerator' ] class Callback(object): def __init__(self, model): self.model = model log_ranks = self.model.cfg.get("log_ranks", '0') if isinstance(log_ranks, str): self.log_ranks = [int(i) for i in log_ranks.split(',')] elif isinstance(log_ranks, int): self.log_ranks = [log_ranks] self.logger = setup_logger('ppdet.engine.callbacks',log_ranks=self.log_ranks) def on_step_begin(self, status): pass def on_step_end(self, status): pass def on_epoch_begin(self, status): pass def on_epoch_end(self, status): pass def on_train_begin(self, status): pass def on_train_end(self, status): pass class ComposeCallback(object): def __init__(self, callbacks): callbacks = [c for c in list(callbacks) if c is not None] for c in callbacks: assert isinstance( c, Callback), "callback should be subclass of Callback" self._callbacks = callbacks def on_step_begin(self, status): for c in self._callbacks: c.on_step_begin(status) def on_step_end(self, status): for c in self._callbacks: c.on_step_end(status) def on_epoch_begin(self, status): for c in self._callbacks: c.on_epoch_begin(status) def on_epoch_end(self, status): for c in self._callbacks: c.on_epoch_end(status) def on_train_begin(self, status): for c in self._callbacks: c.on_train_begin(status) def on_train_end(self, status): for c in self._callbacks: c.on_train_end(status) class LogPrinter(Callback): def __init__(self, model): super(LogPrinter, self).__init__(model) def on_step_end(self, status): if dist.get_world_size() < 2 or dist.get_rank() in self.log_ranks: mode = status['mode'] if mode == 'train': epoch_id = status['epoch_id'] step_id = status['step_id'] steps_per_epoch = status['steps_per_epoch'] training_staus = status['training_staus'] batch_time = status['batch_time'] data_time = status['data_time'] epoches = self.model.cfg.epoch batch_size = self.model.cfg['{}Reader'.format(mode.capitalize( ))]['batch_size'] logs = training_staus.log() space_fmt = ':' + str(len(str(steps_per_epoch))) + 'd' if step_id % self.model.cfg.log_iter == 0: eta_steps = (epoches - epoch_id) * steps_per_epoch - step_id eta_sec = eta_steps * batch_time.global_avg eta_str = str(datetime.timedelta(seconds=int(eta_sec))) ips = float(batch_size) / batch_time.avg max_mem_reserved_str = "" max_mem_allocated_str = "" print_mem_info = self.model.cfg.get("print_mem_info", True) if paddle.device.is_compiled_with_cuda() and print_mem_info: max_mem_reserved_str = f", max_mem_reserved: {paddle.device.cuda.max_memory_reserved() // (1024 ** 2)} MB" max_mem_allocated_str = f", max_mem_allocated: 
{paddle.device.cuda.max_memory_allocated() // (1024 ** 2)} MB" fmt = ' '.join([ 'Epoch: [{}]', '[{' + space_fmt + '}/{}]', 'learning_rate: {lr:.6f}', '{meters}', 'eta: {eta}', 'batch_cost: {btime}', 'data_cost: {dtime}', 'ips: {ips:.4f} images/s' '{max_mem_reserved_str}' '{max_mem_allocated_str}' ]) fmt = fmt.format( epoch_id, step_id, steps_per_epoch, lr=status['learning_rate'], meters=logs, eta=eta_str, btime=str(batch_time), dtime=str(data_time), ips=ips, max_mem_reserved_str=max_mem_reserved_str, max_mem_allocated_str=max_mem_allocated_str) self.logger.info(fmt) if mode == 'eval': step_id = status['step_id'] if step_id % 100 == 0: self.logger.info("Eval iter: {}".format(step_id)) def on_epoch_end(self, status): if dist.get_world_size() < 2 or dist.get_rank() == 0: mode = status['mode'] if mode == 'eval': sample_num = status['sample_num'] cost_time = status['cost_time'] self.logger.info('Total sample number: {}, average FPS: {}'.format( sample_num, sample_num / cost_time)) class Checkpointer(Callback): def __init__(self, model): super(Checkpointer, self).__init__(model) self.best_ap = -1000. self.save_dir = self.model.cfg.save_dir self.uniform_output_enabled = self.model.cfg.get("uniform_output_enabled", False) if hasattr(self.model.model, 'student_model'): self.weight = self.model.model.student_model else: self.weight = self.model.model def on_epoch_end(self, status): # Checkpointer only performed during training mode = status['mode'] epoch_id = status['epoch_id'] weight = None save_name = None if dist.get_world_size() < 2 or dist.get_rank() == 0: end_epoch = self.model.cfg.epoch save_name = str(epoch_id) if epoch_id != end_epoch - 1 else "model_final" if mode == 'train': end_epoch = self.model.cfg.epoch if ( epoch_id + 1 ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1: save_name = str( epoch_id) if epoch_id != end_epoch - 1 else "model_final" weight = self.weight.state_dict() elif mode == 'eval': for metric in self.model._metrics: map_res = metric.get_results() eval_func = "ap" if 'pose3d' in map_res: key = 'pose3d' eval_func = "mpjpe" elif 'bbox' in map_res: key = 'bbox' elif 'keypoint' in map_res: key = 'keypoint' else: key = 'mask' key = self.model.cfg.get('target_metrics', key) if key not in map_res: logger.warning("Evaluation results empty, this may be due to " \ "training iterations being too few or not " \ "loading the correct weights.") return epoch_ap = map_res[key][0] epoch_metric = { 'metric': abs(epoch_ap), 'epoch': epoch_id + 1 } save_path = os.path.join(os.path.join(self.save_dir, save_name) if self.uniform_output_enabled else self.save_dir, f"{save_name}.pdstates") paddle.save(epoch_metric, save_path) if self.uniform_output_enabled: save_model_info(epoch_metric, self.save_dir, save_name) update_train_results(self.model.cfg, save_name, epoch_metric, done_flag=epoch_id + 1 == self.model.cfg.epoch, ema=self.model.use_ema) if 'save_best_model' in status and status['save_best_model']: if epoch_ap >= self.best_ap: self.best_ap = epoch_ap save_name = 'best_model' weight = self.weight.state_dict() best_metric = { 'metric': abs(self.best_ap), 'epoch': epoch_id + 1 } save_path = os.path.join(os.path.join(self.save_dir, save_name) if self.uniform_output_enabled else self.save_dir, "best_model.pdstates") paddle.save(best_metric, save_path) if self.uniform_output_enabled: save_model_info(best_metric, self.save_dir, save_name) update_train_results(self.model.cfg, save_name, best_metric, done_flag=epoch_id + 1 == self.model.cfg.epoch, ema=self.model.use_ema) 
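# Alongside the weights, a small .pdstates file (metric value plus epoch) is
# saved for both the periodic snapshot and best_model, so later tooling can
# rank checkpoints without loading them.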
logger.info("Best test {} {} is {:0.3f}.".format( key, eval_func, abs(self.best_ap))) if weight: if self.model.use_ema: exchange_save_model = status.get('exchange_save_model', False) if not exchange_save_model: # save model and ema_model save_model( status['weight'], self.model.optimizer, os.path.join(self.save_dir, save_name) if self.uniform_output_enabled else self.save_dir, save_name, epoch_id + 1, ema_model=weight) if self.uniform_output_enabled: self.model.export(output_dir=os.path.join(self.save_dir, save_name, "inference"), for_fd=True) gc.collect() else: # save model(student model) and ema_model(teacher model) # in DenseTeacher SSOD, the teacher model will be higher, # so exchange when saving pdparams student_model = status['weight'] # model teacher_model = weight # ema_model save_model( teacher_model, self.model.optimizer, self.save_dir, save_name, epoch_id + 1, ema_model=student_model) del teacher_model del student_model else: save_model(weight, self.model.optimizer, os.path.join(self.save_dir, save_name) if self.uniform_output_enabled else self.save_dir, save_name, epoch_id + 1) if self.uniform_output_enabled: self.model.export(output_dir=os.path.join(self.save_dir, save_name, "inference"), for_fd=True) gc.collect() class WiferFaceEval(Callback): def __init__(self, model): super(WiferFaceEval, self).__init__(model) def on_epoch_begin(self, status): assert self.model.mode == 'eval', \ "WiferFaceEval can only be set during evaluation" for metric in self.model._metrics: metric.update(self.model.model) sys.exit() class VisualDLWriter(Callback): """ Use VisualDL to log data or image """ def __init__(self, model): super(VisualDLWriter, self).__init__(model) assert six.PY3, "VisualDL requires Python >= 3.5" try: from visualdl import LogWriter except Exception as e: logger.error('visualdl not found, plaese install visualdl. ' 'for example: `pip install visualdl`.') raise e self.vdl_writer = LogWriter( model.cfg.get('vdl_log_dir', 'vdl_log_dir/scalar')) self.vdl_loss_step = 0 self.vdl_mAP_step = 0 self.vdl_image_step = 0 self.vdl_image_frame = 0 def on_step_end(self, status): mode = status['mode'] if dist.get_world_size() < 2 or dist.get_rank() == 0: if mode == 'train': training_staus = status['training_staus'] for loss_name, loss_value in training_staus.get().items(): self.vdl_writer.add_scalar(loss_name, loss_value, self.vdl_loss_step) self.vdl_loss_step += 1 elif mode == 'test': ori_image = status['original_image'] result_image = status['result_image'] self.vdl_writer.add_image( "original/frame_{}".format(self.vdl_image_frame), ori_image, self.vdl_image_step) self.vdl_writer.add_image( "result/frame_{}".format(self.vdl_image_frame), result_image, self.vdl_image_step) self.vdl_image_step += 1 # each frame can display ten pictures at most. if self.vdl_image_step % 10 == 0: self.vdl_image_step = 0 self.vdl_image_frame += 1 def on_epoch_end(self, status): mode = status['mode'] if dist.get_world_size() < 2 or dist.get_rank() == 0: if mode == 'eval': for metric in self.model._metrics: for key, map_value in metric.get_results().items(): self.vdl_writer.add_scalar("{}-mAP".format(key), map_value[0], self.vdl_mAP_step) self.vdl_mAP_step += 1 class WandbCallback(Callback): def __init__(self, model): super(WandbCallback, self).__init__(model) try: import wandb self.wandb = wandb except Exception as e: logger.error('wandb not found, please install wandb. 
' 'Use: `pip install wandb`.') raise e self.wandb_params = model.cfg.get('wandb', None) self.save_dir = self.model.cfg.save_dir if self.wandb_params is None: self.wandb_params = {} for k, v in model.cfg.items(): if k.startswith("wandb_"): self.wandb_params.update({k[len("wandb_"):]: v}) self._run = None if dist.get_world_size() < 2 or dist.get_rank() == 0: _ = self.run self.run.config.update(self.model.cfg) self.run.define_metric("epoch") self.run.define_metric("eval/*", step_metric="epoch") self.best_ap = -1000. self.fps = [] @property def run(self): if self._run is None: if self.wandb.run is not None: logger.info( "There is an ongoing wandb run which will be used " "for logging. Please use `wandb.finish()` to end that " "if the behaviour is not intended") self._run = self.wandb.run else: self._run = self.wandb.init(**self.wandb_params) return self._run def save_model(self, optimizer, save_dir, save_name, last_epoch, ema_model=None, ap=None, fps=None, tags=None): if dist.get_world_size() < 2 or dist.get_rank() == 0: model_path = os.path.join(save_dir, save_name) metadata = {} metadata["last_epoch"] = last_epoch if ap: metadata["ap"] = ap if fps: metadata["fps"] = fps if ema_model: ema_artifact = self.wandb.Artifact( name="ema_model-{}".format(self.run.id), type="model", metadata=metadata) model_artifact = self.wandb.Artifact( name="model-{}".format(self.run.id), type="model", metadata=metadata) ema_artifact.add_file(model_path + ".pdema", name="model_ema") model_artifact.add_file(model_path + ".pdparams", name="model") self.run.log_artifact(ema_artifact, aliases=tags) self.run.log_artifact(model_artifact, aliases=tags) else: model_artifact = self.wandb.Artifact( name="model-{}".format(self.run.id), type="model", metadata=metadata) model_artifact.add_file(model_path + ".pdparams", name="model") self.run.log_artifact(model_artifact, aliases=tags) def on_step_end(self, status): mode = status['mode'] if dist.get_world_size() < 2 or dist.get_rank() == 0: if mode == 'train': training_status = status['training_staus'].get() for k, v in training_status.items(): training_status[k] = float(v) # calculate ips, data_cost, batch_cost batch_time = status['batch_time'] data_time = status['data_time'] batch_size = self.model.cfg['{}Reader'.format(mode.capitalize( ))]['batch_size'] ips = float(batch_size) / float(batch_time.avg) data_cost = float(data_time.avg) batch_cost = float(batch_time.avg) metrics = {"train/" + k: v for k, v in training_status.items()} metrics["train/ips"] = ips metrics["train/data_cost"] = data_cost metrics["train/batch_cost"] = batch_cost self.fps.append(ips) self.run.log(metrics) def on_epoch_end(self, status): mode = status['mode'] epoch_id = status['epoch_id'] save_name = None if dist.get_world_size() < 2 or dist.get_rank() == 0: if mode == 'train': fps = sum(self.fps) / len(self.fps) self.fps = [] end_epoch = self.model.cfg.epoch if ( epoch_id + 1 ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1: save_name = str( epoch_id) if epoch_id != end_epoch - 1 else "model_final" tags = ["latest", "epoch_{}".format(epoch_id)] self.save_model( self.model.optimizer, self.save_dir, save_name, epoch_id + 1, self.model.use_ema, fps=fps, tags=tags) if mode == 'eval': sample_num = status['sample_num'] cost_time = status['cost_time'] fps = sample_num / cost_time merged_dict = {} for metric in self.model._metrics: for key, map_value in metric.get_results().items(): merged_dict["eval/{}-mAP".format(key)] = map_value[0] merged_dict["epoch"] = status["epoch_id"]
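# merged_dict bundles every metric's mAP together with the epoch index and the
# eval fps, so a single consolidated wandb record is logged per eval pass.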
merged_dict["eval/fps"] = sample_num / cost_time self.run.log(merged_dict) if 'save_best_model' in status and status['save_best_model']: for metric in self.model._metrics: map_res = metric.get_results() if 'pose3d' in map_res: key = 'pose3d' elif 'bbox' in map_res: key = 'bbox' elif 'keypoint' in map_res: key = 'keypoint' else: key = 'mask' if key not in map_res: logger.warning("Evaluation results empty, this may be due to " \ "training iterations being too few or not " \ "loading the correct weights.") return if map_res[key][0] >= self.best_ap: self.best_ap = map_res[key][0] save_name = 'best_model' tags = ["best", "epoch_{}".format(epoch_id)] self.save_model( self.model.optimizer, self.save_dir, save_name, last_epoch=epoch_id + 1, ema_model=self.model.use_ema, ap=abs(self.best_ap), fps=fps, tags=tags) def on_train_end(self, status): self.run.finish() class SniperProposalsGenerator(Callback): def __init__(self, model): super(SniperProposalsGenerator, self).__init__(model) ori_dataset = self.model.dataset self.dataset = self._create_new_dataset(ori_dataset) self.loader = self.model.loader self.cfg = self.model.cfg self.infer_model = self.model.model def _create_new_dataset(self, ori_dataset): dataset = copy.deepcopy(ori_dataset) # init anno_cropper dataset.init_anno_cropper() # generate infer roidbs ori_roidbs = dataset.get_ori_roidbs() roidbs = dataset.anno_cropper.crop_infer_anno_records(ori_roidbs) # set new roidbs dataset.set_roidbs(roidbs) return dataset def _eval_with_loader(self, loader): results = [] with paddle.no_grad(): self.infer_model.eval() for step_id, data in enumerate(loader): outs = self.infer_model(data) for key in ['im_shape', 'scale_factor', 'im_id']: outs[key] = data[key] for key, value in outs.items(): if hasattr(value, 'numpy'): outs[key] = value.numpy() results.append(outs) return results def on_train_end(self, status): self.loader.dataset = self.dataset results = self._eval_with_loader(self.loader) results = self.dataset.anno_cropper.aggregate_chips_detections(results) # sniper proposals = [] clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} for outs in results: batch_res = get_infer_results(outs, clsid2catid) start = 0 for i, im_id in enumerate(outs['im_id']): bbox_num = outs['bbox_num'] end = start + bbox_num[i] bbox_res = batch_res['bbox'][start:end] \ if 'bbox' in batch_res else None if bbox_res: proposals += bbox_res logger.info("save proposals in {}".format(self.cfg.proposals_path)) with open(self.cfg.proposals_path, 'w') as f: json.dump(proposals, f) class SemiLogPrinter(LogPrinter): def __init__(self, model): super(SemiLogPrinter, self).__init__(model) def on_step_end(self, status): if dist.get_world_size() < 2 or dist.get_rank() == 0: mode = status['mode'] if mode == 'train': epoch_id = status['epoch_id'] step_id = status['step_id'] iter_id = status['iter_id'] steps_per_epoch = status['steps_per_epoch'] training_staus = status['training_staus'] batch_time = status['batch_time'] data_time = status['data_time'] epoches = self.model.cfg.epoch batch_size = self.model.cfg['{}Reader'.format(mode.capitalize( ))]['batch_size'] iters = epoches * steps_per_epoch logs = training_staus.log() iter_space_fmt = ':' + str(len(str(iters))) + 'd' space_fmt = ':' + str(len(str(iters))) + 'd' if step_id % self.model.cfg.log_iter == 0: eta_steps = (epoches - epoch_id) * steps_per_epoch - step_id eta_sec = eta_steps * batch_time.global_avg eta_str = str(datetime.timedelta(seconds=int(eta_sec))) ips = float(batch_size) / batch_time.avg fmt = ' '.join([ '{' + 
iter_space_fmt + '}/{} iters', 'Epoch: [{}]', '[{' + space_fmt + '}/{}]', 'learning_rate: {lr:.6f}', '{meters}', 'eta: {eta}', 'batch_cost: {btime}', 'data_cost: {dtime}', 'ips: {ips:.4f} images/s', ]) fmt = fmt.format( iter_id, iters, epoch_id, step_id, steps_per_epoch, lr=status['learning_rate'], meters=logs, eta=eta_str, btime=str(batch_time), dtime=str(data_time), ips=ips) logger.info(fmt) if mode == 'eval': step_id = status['step_id'] if step_id % 100 == 0: logger.info("Eval iter: {}".format(step_id)) class SemiCheckpointer(Checkpointer): def __init__(self, model): super(SemiCheckpointer, self).__init__(model) cfg = self.model.cfg self.best_ap = 0. self.save_dir = os.path.join(self.model.cfg.save_dir, self.model.cfg.filename) if hasattr(self.model.model, 'student') and hasattr(self.model.model, 'teacher'): self.weight = (self.model.model.teacher, self.model.model.student) elif hasattr(self.model.model, 'student') or hasattr(self.model.model, 'teacher'): raise AttributeError( "model has only one of the attributes 'student' and 'teacher'; both are required") else: raise AttributeError( "model has neither attribute 'student' nor 'teacher'") def every_n_iters(self, iter_id, n): return (iter_id + 1) % n == 0 if n > 0 else False def on_step_end(self, status): # Checkpointer only performed during training mode = status['mode'] eval_interval = status['eval_interval'] save_interval = status['save_interval'] iter_id = status['iter_id'] epoch_id = status['epoch_id'] t_weight = None s_weight = None save_name = None if dist.get_world_size() < 2 or dist.get_rank() == 0: if self.every_n_iters(iter_id, save_interval) and mode == 'train': save_name = "last_epoch" # save_name = str(iter_id + 1) t_weight = self.weight[0].state_dict() s_weight = self.weight[1].state_dict() save_semi_model(t_weight, s_weight, self.model.optimizer, self.save_dir, save_name, epoch_id + 1, iter_id + 1) def on_epoch_end(self, status): # Checkpointer only performed during training mode = status['mode'] eval_interval = status['eval_interval'] save_interval = status['save_interval'] iter_id = status['iter_id'] epoch_id = status['epoch_id'] t_weight = None s_weight = None save_name = None if dist.get_world_size() < 2 or dist.get_rank() == 0: if self.every_n_iters(iter_id, eval_interval) and mode == 'eval': if 'save_best_model' in status and status['save_best_model']: for metric in self.model._metrics: map_res = metric.get_results() if 'bbox' in map_res: key = 'bbox' elif 'keypoint' in map_res: key = 'keypoint' else: key = 'mask' if key not in map_res: logger.warning("Evaluation results empty, this may be due to " \ "training iterations being too few or not " \ "loading the correct weights.") return if map_res[key][0] > self.best_ap: self.best_ap = map_res[key][0] save_name = 'best_model' t_weight = self.weight[0].state_dict() s_weight = self.weight[1].state_dict() logger.info("Best teacher test {} ap is {:0.3f}.". format(key, self.best_ap)) if t_weight and s_weight: save_semi_model(t_weight, s_weight, self.model.optimizer, self.save_dir, save_name, epoch_id + 1, iter_id + 1) ================================================ FILE: ppdet/engine/env.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import random import numpy as np import paddle from paddle.distributed import fleet __all__ = ['init_parallel_env', 'set_random_seed', 'init_fleet_env'] def init_fleet_env(find_unused_parameters=False): strategy = fleet.DistributedStrategy() strategy.find_unused_parameters = find_unused_parameters fleet.init(is_collective=True, strategy=strategy) def init_parallel_env(): env = os.environ dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env if dist: trainer_id = int(env['PADDLE_TRAINER_ID']) local_seed = (99 + trainer_id) random.seed(local_seed) np.random.seed(local_seed) paddle.distributed.init_parallel_env() def set_random_seed(seed): paddle.seed(seed) random.seed(seed) np.random.seed(seed) ================================================ FILE: ppdet/engine/export_utils.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
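# A hypothetical sketch (not part of this module) of how the TO_STATIC_SPEC
# entries defined below are consumed: paddle.jit.to_static wraps a model with a
# fixed input signature, so export needs no concrete example tensor; `model`
# stands in for any ppdet architecture.
#
#     import paddle
#     spec = [{'image': paddle.static.InputSpec(
#         name='image', shape=[-1, 3, -1, -1], dtype='float32')}]
#     # static_model = paddle.jit.to_static(model, input_spec=spec)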
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import yaml from collections import OrderedDict import paddle from ppdet.data.source.category import get_categories from ppdet.core.workspace import load_config from ppdet.utils.logger import setup_logger logger = setup_logger('ppdet.engine') # Global dictionary TRT_MIN_SUBGRAPH = { 'YOLO': 3, 'PPYOLOE': 3, 'SSD': 60, 'RCNN': 40, 'RetinaNet': 40, 'S2ANet': 80, 'EfficientDet': 40, 'Face': 3, 'TTFNet': 60, 'FCOS': 16, 'SOLOv2': 60, 'HigherHRNet': 3, 'HRNet': 3, 'DeepSORT': 3, 'ByteTrack': 10, 'CenterTrack': 5, 'JDE': 10, 'FairMOT': 5, 'GFL': 16, 'PicoDet': 3, 'CenterNet': 5, 'TOOD': 5, 'YOLOX': 8, 'YOLOF': 40, 'METRO_Body': 3, 'DETR': 3, 'CLRNet': 3 } KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet'] MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack'] LANE_ARCH = ['CLRNet'] TO_STATIC_SPEC = { 'yolov3_darknet53_270e_coco': [{ 'im_id': paddle.static.InputSpec( name='im_id', shape=[-1, 1], dtype='float32'), 'is_crowd': paddle.static.InputSpec( name='is_crowd', shape=[-1, 50], dtype='float32'), 'gt_bbox': paddle.static.InputSpec( name='gt_bbox', shape=[-1, 50, 4], dtype='float32'), 'curr_iter': paddle.static.InputSpec( name='curr_iter', shape=[-1], dtype='float32'), 'curr_epoch': paddle.static.InputSpec( name='curr_epoch', shape=[-1], dtype='int64'), 'image': paddle.static.InputSpec( name='image', shape=[-1, 3, -1, -1], dtype='float32'), 'im_shape': paddle.static.InputSpec( name='im_shape', shape=[-1, 2], dtype='float32'), 'scale_factor': paddle.static.InputSpec( name='scale_factor', shape=[-1, 2], dtype='float32'), 'target0': paddle.static.InputSpec( name='target0', shape=[-1, 3, 86, -1, -1], dtype='float32'), 'target1': paddle.static.InputSpec( name='target1', shape=[-1, 3, 86, -1, -1], dtype='float32'), 'target2': paddle.static.InputSpec( name='target2', shape=[-1, 3, 86, -1, -1], dtype='float32'), }], 'tinypose_128x96': [{ 'center': paddle.static.InputSpec( name='center', shape=[-1, 2], dtype='float32'), 'scale': paddle.static.InputSpec( name='scale', shape=[-1, 2], dtype='float32'), 'im_id': paddle.static.InputSpec( name='im_id', shape=[-1, 1], dtype='float32'), 'image': paddle.static.InputSpec( name='image', shape=[-1, 3, 128, 96], dtype='float32'), 'score': paddle.static.InputSpec( name='score', shape=[-1], dtype='float32'), 'rotate': paddle.static.InputSpec( name='rotate', shape=[-1], dtype='float32'), 'target': paddle.static.InputSpec( name='target', shape=[-1, 17, 32, 24], dtype='float32'), 'target_weight': paddle.static.InputSpec( name='target_weight', shape=[-1, 17, 1], dtype='float32'), }], 'fcos_r50_fpn_1x_coco': [{ 'im_id': paddle.static.InputSpec( name='im_id', shape=[-1, 1], dtype='float32'), 'curr_iter': paddle.static.InputSpec( name='curr_iter', shape=[-1], dtype='float32'), 'curr_epoch': paddle.static.InputSpec( name='curr_epoch', shape=[-1], dtype='int64'), 'image': paddle.static.InputSpec( name='image', shape=[-1, 3, -1, -1], dtype='float32'), 'im_shape': paddle.static.InputSpec( name='im_shape', shape=[-1, 2], dtype='float32'), 'scale_factor': paddle.static.InputSpec( name='scale_factor', shape=[-1, 2], dtype='float32'), 'reg_target0': paddle.static.InputSpec( name='reg_target0', shape=[-1, 160, 160, 4], dtype='float32'), 'labels0': paddle.static.InputSpec( name='labels0', shape=[-1, 160, 160, 1], dtype='int32'), 'centerness0': paddle.static.InputSpec( name='centerness0', shape=[-1, 160, 160, 1], dtype='float32'), 'reg_target1': 
paddle.static.InputSpec( name='reg_target1', shape=[-1, 80, 80, 4], dtype='float32'), 'labels1': paddle.static.InputSpec( name='labels1', shape=[-1, 80, 80, 1], dtype='int32'), 'centerness1': paddle.static.InputSpec( name='centerness1', shape=[-1, 80, 80, 1], dtype='float32'), 'reg_target2': paddle.static.InputSpec( name='reg_target2', shape=[-1, 40, 40, 4], dtype='float32'), 'labels2': paddle.static.InputSpec( name='labels2', shape=[-1, 40, 40, 1], dtype='int32'), 'centerness2': paddle.static.InputSpec( name='centerness2', shape=[-1, 40, 40, 1], dtype='float32'), 'reg_target3': paddle.static.InputSpec( name='reg_target3', shape=[-1, 20, 20, 4], dtype='float32'), 'labels3': paddle.static.InputSpec( name='labels3', shape=[-1, 20, 20, 1], dtype='int32'), 'centerness3': paddle.static.InputSpec( name='centerness3', shape=[-1, 20, 20, 1], dtype='float32'), 'reg_target4': paddle.static.InputSpec( name='reg_target4', shape=[-1, 10, 10, 4], dtype='float32'), 'labels4': paddle.static.InputSpec( name='labels4', shape=[-1, 10, 10, 1], dtype='int32'), 'centerness4': paddle.static.InputSpec( name='centerness4', shape=[-1, 10, 10, 1], dtype='float32'), }], 'picodet_s_320_coco_lcnet': [{ 'im_id': paddle.static.InputSpec( name='im_id', shape=[-1, 1], dtype='float32'), 'is_crowd': paddle.static.InputSpec( name='is_crowd', shape=[-1, -1, 1], dtype='float32'), 'gt_class': paddle.static.InputSpec( name='gt_class', shape=[-1, -1, 1], dtype='int32'), 'gt_bbox': paddle.static.InputSpec( name='gt_bbox', shape=[-1, -1, 4], dtype='float32'), 'curr_iter': paddle.static.InputSpec( name='curr_iter', shape=[-1], dtype='float32'), 'curr_epoch': paddle.static.InputSpec( name='curr_epoch', shape=[-1], dtype='int64'), 'image': paddle.static.InputSpec( name='image', shape=[-1, 3, -1, -1], dtype='float32'), 'im_shape': paddle.static.InputSpec( name='im_shape', shape=[-1, 2], dtype='float32'), 'scale_factor': paddle.static.InputSpec( name='scale_factor', shape=[-1, 2], dtype='float32'), 'pad_gt_mask': paddle.static.InputSpec( name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'), }], 'ppyoloe_crn_s_300e_coco': [{ 'im_id': paddle.static.InputSpec( name='im_id', shape=[-1, 1], dtype='float32'), 'is_crowd': paddle.static.InputSpec( name='is_crowd', shape=[-1, -1, 1], dtype='float32'), 'gt_class': paddle.static.InputSpec( name='gt_class', shape=[-1, -1, 1], dtype='int32'), 'gt_bbox': paddle.static.InputSpec( name='gt_bbox', shape=[-1, -1, 4], dtype='float32'), 'curr_iter': paddle.static.InputSpec( name='curr_iter', shape=[-1], dtype='float32'), 'curr_epoch': paddle.static.InputSpec( name='curr_epoch', shape=[-1], dtype='int64'), 'image': paddle.static.InputSpec( name='image', shape=[-1, 3, -1, -1], dtype='float32'), 'im_shape': paddle.static.InputSpec( name='im_shape', shape=[-1, 2], dtype='float32'), 'scale_factor': paddle.static.InputSpec( name='scale_factor', shape=[-1, 2], dtype='float32'), 'pad_gt_mask': paddle.static.InputSpec( name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'), }], } def apply_to_static(config, model): filename = config.get('filename', None) spec = TO_STATIC_SPEC.get(filename, None) model = paddle.jit.to_static(model, input_spec=spec) logger.info("Successfully to apply @to_static with specs: {}".format(spec)) return model def _prune_input_spec(input_spec, program, targets): # try to prune static program to figure out pruned input spec # so we perform following operations in static mode device = paddle.get_device() paddle.enable_static() paddle.set_device(device) pruned_input_spec = [{}] program = 
program.clone()
    program = program._prune(targets=targets)
    global_block = program.global_block()
    pir_value_set = set()
    if paddle.framework.use_pir_api():
        for op in global_block.ops:
            if op.name() == 'pd_op.data':
                pir_value_set.add(op.attrs()["name"])
    for name, spec in input_spec[0].items():
        if paddle.framework.use_pir_api():
            if name in pir_value_set:
                pruned_input_spec[0][name] = spec
        else:
            try:
                v = global_block.var(name)
                pruned_input_spec[0][name] = spec
            except Exception:
                pass
    paddle.disable_static(place=device)
    return pruned_input_spec


def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape):
    preprocess_list = []
    label_list = []
    if arch != "lane_arch":
        anno_file = dataset_cfg.get_anno()
        clsid2catid, catid2name = get_categories(metric, anno_file, arch)
        label_list = [str(cat) for cat in catid2name.values()]

    fuse_normalize = reader_cfg.get('fuse_normalize', False)
    sample_transforms = reader_cfg['sample_transforms']
    hpi_dynamic_shape = None
    for st in sample_transforms[1:]:
        for key, value in st.items():
            p = {'type': key}
            if key == 'Resize':
                if int(image_shape[1]) != -1:
                    value['target_size'] = image_shape[1:]
                    hpi_dynamic_shape = image_shape[1:]
                value['interp'] = value.get('interp', 1)  # cv2.INTER_LINEAR
            if fuse_normalize and key == 'NormalizeImage':
                continue
            p.update(value)
            preprocess_list.append(p)
    batch_transforms = reader_cfg.get('batch_transforms', None)
    if batch_transforms:
        for bt in batch_transforms:
            for key, value in bt.items():
                # for deploy/infer, use PadStride(stride) instead of PadBatch(pad_to_stride)
                if key == 'PadBatch':
                    preprocess_list.append({
                        'type': 'PadStride',
                        'stride': value['pad_to_stride']
                    })
                    break
                elif key == "CULaneResize":
                    # cut and resize
                    p = {'type': key}
                    p.update(value)
                    p.update({"cut_height": dataset_cfg.cut_height})
                    preprocess_list.append(p)
                    break

    return preprocess_list, label_list, hpi_dynamic_shape


def _parse_tracker(tracker_cfg):
    tracker_params = {}
    for k, v in tracker_cfg.items():
        tracker_params.update({k: v})
    return tracker_params


def _dump_infer_config(config, path, image_shape, model):
    arch_state = False
    from ppdet.core.config.yaml_helpers import setup_orderdict
    setup_orderdict()
    use_dynamic_shape = True if image_shape[2] == -1 else False
    infer_cfg = OrderedDict({
        'mode': 'paddle',
        'draw_threshold': 0.5,
        'metric': config['metric'],
        'use_dynamic_shape': use_dynamic_shape
    })
    if config.get('pdx_model_name', None):
        infer_cfg["Global"] = {"model_name": config["pdx_model_name"]}
    export_onnx = config.get('export_onnx', False)
    export_eb = config.get('export_eb', False)

    infer_arch = config['architecture']
    if 'RCNN' in infer_arch and export_onnx:
        logger.warning(
            "Exporting RCNN model to ONNX only support batch_size = 1")
        infer_cfg['export_onnx'] = True
        infer_cfg['export_eb'] = export_eb

    if infer_arch in MOT_ARCH:
        if infer_arch == 'DeepSORT':
            tracker_cfg = config['DeepSORTTracker']
        elif infer_arch == 'CenterTrack':
            tracker_cfg = config['CenterTracker']
        else:
            tracker_cfg = config['JDETracker']
        infer_cfg['tracker'] = _parse_tracker(tracker_cfg)

    for arch, min_subgraph_size in TRT_MIN_SUBGRAPH.items():
        if arch in infer_arch:
            infer_cfg['arch'] = arch
            infer_cfg['min_subgraph_size'] = min_subgraph_size
            arch_state = True
            break

    if infer_arch == 'PPYOLOEWithAuxHead':
        infer_arch = 'PPYOLOE'  # exported as plain PPYOLOE for inference

    if infer_arch in ['PPYOLOE', 'YOLOX', 'YOLOF']:
        infer_cfg['arch'] = infer_arch
        infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch]
        arch_state = True

    if infer_arch == 'DETR' and config.get('with_mask', False):
        infer_cfg['mask'] = True

    if not arch_state:
        logger.error(
'Architecture: {} is not supported for exporting model now.\n'. format(infer_arch) + 'Please set TRT_MIN_SUBGRAPH in ppdet/engine/export_utils.py') os._exit(0) if 'mask_head' in config[config['architecture']] and config[config[ 'architecture']]['mask_head']: infer_cfg['mask'] = True if 'with_mask' in config[config['architecture']] and config[config[ 'architecture']]['with_mask']: infer_cfg['mask'] = True label_arch = 'detection_arch' if infer_arch in KEYPOINT_ARCH: label_arch = 'keypoint_arch' if infer_arch in LANE_ARCH: infer_cfg['arch'] = infer_arch infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch] infer_cfg['img_w'] = config['img_w'] infer_cfg['ori_img_h'] = config['ori_img_h'] infer_cfg['cut_height'] = config['cut_height'] label_arch = 'lane_arch' head_name = "CLRHead" infer_cfg['conf_threshold'] = config[head_name]['conf_threshold'] infer_cfg['nms_thres'] = config[head_name]['nms_thres'] infer_cfg['max_lanes'] = config[head_name]['max_lanes'] infer_cfg['num_points'] = config[head_name]['num_points'] arch_state = True if infer_arch in MOT_ARCH: if config['metric'] in ['COCO', 'VOC']: # MOT model run as Detector reader_cfg = config['TestReader'] dataset_cfg = config['TestDataset'] else: # 'metric' in ['MOT', 'MCMOT', 'KITTI'] label_arch = 'mot_arch' reader_cfg = config['TestMOTReader'] dataset_cfg = config['TestMOTDataset'] else: reader_cfg = config['TestReader'] dataset_cfg = config['TestDataset'] infer_cfg['Preprocess'], infer_cfg['label_list'], hpi_dynamic_shape = _parse_reader( reader_cfg, dataset_cfg, config['metric'], label_arch, image_shape[1:]) if config.get("uniform_output_enabled", None): def get_dynamic_shapes(hpi_shape): return [[1, 3] + hpi_shape, [1, 3] + hpi_shape, [8, 3] + hpi_shape] dynamic_shapes = get_dynamic_shapes(hpi_dynamic_shape) if hpi_dynamic_shape else [ [1, 3, 320, 320], [1, 3, 640, 640], [8, 3, 1280, 1280] ] shapes = { "image": dynamic_shapes, "im_shape": [[1, 2], [1, 2], [8, 2]], "scale_factor": [[1, 2], [1, 2], [8, 2]] } trt_dynamic_shape = [ [dim for _ in range(shape[0]) for dim in shape[2:]] for shape in dynamic_shapes ] trt_dynamic_shape_input_data = { "im_shape": trt_dynamic_shape, "scale_factor": [ [2, 2], [1, 1], [0.67 for _ in range(2 * shapes["scale_factor"][-1][0])] ] } hpi_config = OrderedDict({ "backend_configs": OrderedDict({ "paddle_infer": OrderedDict({ "trt_dynamic_shapes": shapes, "trt_dynamic_shape_input_data": trt_dynamic_shape_input_data }), "tensorrt": OrderedDict({ "dynamic_shapes": shapes }) }) }) infer_cfg["Hpi"] = hpi_config if infer_arch == 'PicoDet': if hasattr(config, 'export') and config['export'].get( 'post_process', False) and not config['export'].get('benchmark', False): infer_cfg['arch'] = 'GFL' head_name = 'PicoHeadV2' if config['PicoHeadV2'] else 'PicoHead' infer_cfg['NMS'] = config[head_name]['nms'] # In order to speed up the prediction, the threshold of nms # is adjusted here, which can be changed in infer_cfg.yml config[head_name]['nms']["score_threshold"] = 0.3 config[head_name]['nms']["nms_threshold"] = 0.5 infer_cfg['fpn_stride'] = config[head_name]['fpn_stride'] yaml.dump(infer_cfg, open(path, 'w')) logger.info("Export inference config file to {}".format(os.path.join(path))) ================================================ FILE: ppdet/engine/naive_sync_bn.py ================================================ import paddle.distributed as dist import math import paddle import paddle.nn as nn class _AllReduce(paddle.autograd.PyLayer): @staticmethod def forward(ctx, input): input_list = 
[paddle.zeros_like(input) for k in range(dist.get_world_size())]
        # Use allgather instead of allreduce since I don't trust in-place
        # operations.
        dist.all_gather(input_list, input, sync_op=True)
        inputs = paddle.stack(input_list, axis=0)
        return paddle.sum(inputs, axis=0)

    @staticmethod
    def backward(ctx, grad_output):
        dist.all_reduce(grad_output, sync_op=True)
        return grad_output


def differentiable_all_reduce(input):
    """
    Differentiable counterpart of `dist.all_reduce`.
    """
    if (not dist.is_available() or not dist.is_initialized() or
            dist.get_world_size() == 1):
        return input
    return _AllReduce.apply(input)


class NaiveSyncBatchNorm(nn.BatchNorm2D):
    def __init__(self, *args, stats_mode="", **kwargs):
        super().__init__(*args, **kwargs)
        assert stats_mode in ["", "N"]
        self._stats_mode = stats_mode

    def forward(self, input):
        if dist.get_world_size() == 1 or not self.training:
            return super(NaiveSyncBatchNorm, self).forward(input)

        B, C = input.shape[0], input.shape[1]

        mean = paddle.mean(input, axis=[0, 2, 3])
        meansqr = paddle.mean(input * input, axis=[0, 2, 3])

        if self._stats_mode == "":
            assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.'
            vec = paddle.concat([mean, meansqr], axis=0)
            vec = differentiable_all_reduce(vec) * (
                1.0 / dist.get_world_size())
            mean, meansqr = paddle.split(vec, [C, C])
            momentum = 1 - self._momentum  # NOTE: paddle has a reversed momentum definition
        else:
            if B == 0:
                vec = paddle.zeros([2 * C + 1], dtype=mean.dtype)
                vec = vec + input.sum()  # make sure there is gradient w.r.t input
            else:
                vec = paddle.concat(
                    [
                        mean,
                        meansqr,
                        paddle.ones([1], dtype=mean.dtype),
                    ],
                    axis=0, )
            vec = differentiable_all_reduce(vec * B)

            total_batch = vec[-1].detach()
            momentum = total_batch.clip(max=1) * (
                1 - self._momentum)  # no update if total_batch is 0
            mean, meansqr, _ = paddle.split(
                vec / total_batch.clip(min=1),
                [C, C, int(vec.shape[0] - 2 * C)])  # avoid div-by-zero

        var = meansqr - mean * mean
        invstd = paddle.rsqrt(var + self._epsilon)
        scale = self.weight * invstd
        bias = self.bias - mean * scale
        scale = scale.reshape([1, -1, 1, 1])
        bias = bias.reshape([1, -1, 1, 1])

        tmp_mean = self._mean + momentum * (mean.detach() - self._mean)
        self._mean.set_value(tmp_mean)
        tmp_variance = self._variance + (momentum *
                                         (var.detach() - self._variance))
        self._variance.set_value(tmp_variance)
        ret = input * scale + bias
        return ret


def convert_syncbn(model):
    for n, m in model.named_children():
        if isinstance(m, nn.layer.norm._BatchNormBase):
            syncbn = NaiveSyncBatchNorm(m._num_features, m._momentum,
                                        m._epsilon, m._weight_attr,
                                        m._bias_attr)
            setattr(model, n, syncbn)
        else:
            convert_syncbn(m)


================================================
FILE: ppdet/engine/tracker.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
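# A self-contained numpy check (illustrative, not ppdet code) of the moment
# trick NaiveSyncBatchNorm uses above: each rank all-reduces per-channel
# E[x] and E[x^2]; averaging those across equally sized per-rank batches and
# applying var = E[x^2] - E[x]^2 recovers the exact global batch statistics
# without ever gathering the activations themselves.
import numpy as np

rank_batches = [np.random.randn(4, 3, 8, 8) for _ in range(2)]  # two "ranks"

# What each rank contributes to the all-reduce: concat([mean, meansqr])
# computed over the N, H, W axes.
vecs = [
    np.concatenate([x.mean(axis=(0, 2, 3)), (x * x).mean(axis=(0, 2, 3))])
    for x in rank_batches
]
mean, meansqr = np.split(np.mean(vecs, axis=0), 2)
var = meansqr - mean * mean

# Reference: statistics of the full (virtual) global batch.
full = np.concatenate(rank_batches, axis=0)
assert np.allclose(mean, full.mean(axis=(0, 2, 3)))
assert np.allclose(var, full.var(axis=(0, 2, 3)))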
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import glob import re import paddle import paddle.nn as nn import numpy as np from tqdm import tqdm from collections import defaultdict from ppdet.core.workspace import create from ppdet.utils.checkpoint import load_weight, load_pretrain_weight from ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box from ppdet.modeling.mot.utils import MOTTimer, load_det_results, write_mot_results, save_vis_results from ppdet.modeling.mot.tracker import JDETracker, CenterTracker from ppdet.modeling.mot.tracker import DeepSORTTracker, OCSORTTracker, BOTSORTTracker from ppdet.modeling.architectures import YOLOX from ppdet.metrics import Metric, MOTMetric, KITTIMOTMetric, MCMOTMetric from ppdet.data.source.category import get_categories import ppdet.utils.stats as stats from .callbacks import Callback, ComposeCallback from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack'] MOT_ARCH_JDE = MOT_ARCH[:2] MOT_ARCH_SDE = MOT_ARCH[2:4] MOT_DATA_TYPE = ['mot', 'mcmot', 'kitti'] __all__ = ['Tracker'] class Tracker(object): def __init__(self, cfg, mode='eval'): self.cfg = cfg assert mode.lower() in ['test', 'eval'], \ "mode should be 'test' or 'eval'" self.mode = mode.lower() self.optimizer = None # build MOT data loader self.dataset = cfg['{}MOTDataset'.format(self.mode.capitalize())] # build model self.model = create(cfg.architecture) if isinstance(self.model.detector, YOLOX): for k, m in self.model.named_sublayers(): if isinstance(m, nn.BatchNorm2D): m._epsilon = 1e-3 # for amp(fp16) m._momentum = 0.97 # 0.03 in pytorch anno_file = self.dataset.get_anno() clsid2catid, catid2name = get_categories( self.cfg.metric, anno_file=anno_file) self.ids2names = [] for k, v in catid2name.items(): self.ids2names.append(v) self.status = {} self.start_epoch = 0 # initial default callbacks self._init_callbacks() # initial default metrics self._init_metrics() self._reset_metrics() def _init_callbacks(self): self._callbacks = [] self._compose_callback = None def _init_metrics(self): if self.mode in ['test']: self._metrics = [] return if self.cfg.metric == 'MOT': self._metrics = [MOTMetric(), ] elif self.cfg.metric == 'MCMOT': self._metrics = [MCMOTMetric(self.cfg.num_classes), ] elif self.cfg.metric == 'KITTI': self._metrics = [KITTIMOTMetric(), ] else: logger.warning("Metric not support for metric type {}".format( self.cfg.metric)) self._metrics = [] def _reset_metrics(self): for metric in self._metrics: metric.reset() def register_callbacks(self, callbacks): callbacks = [h for h in list(callbacks) if h is not None] for c in callbacks: assert isinstance(c, Callback), \ "metrics shoule be instances of subclass of Metric" self._callbacks.extend(callbacks) self._compose_callback = ComposeCallback(self._callbacks) def register_metrics(self, metrics): metrics = [m for m in list(metrics) if m is not None] for m in metrics: assert isinstance(m, Metric), \ "metrics shoule be instances of subclass of Metric" self._metrics.extend(metrics) def load_weights_jde(self, weights): load_weight(self.model, weights, self.optimizer) def load_weights_sde(self, det_weights, reid_weights): with_detector = self.model.detector is not None with_reid = self.model.reid is not None if with_detector: load_weight(self.model.detector, det_weights) if with_reid: load_weight(self.model.reid, reid_weights) else: 
load_weight(self.model.reid, reid_weights) def _eval_seq_centertrack(self, dataloader, save_dir=None, show_image=False, frame_rate=30, draw_threshold=0): assert isinstance(self.model.tracker, CenterTracker) if save_dir: if not os.path.exists(save_dir): os.makedirs(save_dir) tracker = self.model.tracker timer = MOTTimer() frame_id = 0 self.status['mode'] = 'track' self.model.eval() results = defaultdict(list) # only support single class now for step_id, data in enumerate(tqdm(dataloader)): self.status['step_id'] = step_id if step_id == 0: self.model.reset_tracking() # forward timer.tic() pred_ret = self.model(data) online_targets = tracker.update(pred_ret) online_tlwhs, online_scores, online_ids = [], [], [] for t in online_targets: bbox = t['bbox'] tlwh = [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]] tscore = float(t['score']) tid = int(t['tracking_id']) if tlwh[2] * tlwh[3] > 0: online_tlwhs.append(tlwh) online_ids.append(tid) online_scores.append(tscore) timer.toc() # save results results[0].append( (frame_id + 1, online_tlwhs, online_scores, online_ids)) save_vis_results(data, frame_id, online_ids, online_tlwhs, online_scores, timer.average_time, show_image, save_dir, self.cfg.num_classes, self.ids2names) frame_id += 1 return results, frame_id, timer.average_time, timer.calls def _eval_seq_jde(self, dataloader, save_dir=None, show_image=False, frame_rate=30, draw_threshold=0): if save_dir: if not os.path.exists(save_dir): os.makedirs(save_dir) tracker = self.model.tracker tracker.max_time_lost = int(frame_rate / 30.0 * tracker.track_buffer) timer = MOTTimer() frame_id = 0 self.status['mode'] = 'track' self.model.eval() results = defaultdict(list) # support single class and multi classes for step_id, data in enumerate(tqdm(dataloader)): self.status['step_id'] = step_id # forward timer.tic() pred_dets, pred_embs = self.model(data) pred_dets, pred_embs = pred_dets.numpy(), pred_embs.numpy() online_targets_dict = self.model.tracker.update(pred_dets, pred_embs) online_tlwhs = defaultdict(list) online_scores = defaultdict(list) online_ids = defaultdict(list) for cls_id in range(self.cfg.num_classes): online_targets = online_targets_dict[cls_id] for t in online_targets: tlwh = t.tlwh tid = t.track_id tscore = t.score if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[ 3] > tracker.vertical_ratio: continue online_tlwhs[cls_id].append(tlwh) online_ids[cls_id].append(tid) online_scores[cls_id].append(tscore) # save results results[cls_id].append( (frame_id + 1, online_tlwhs[cls_id], online_scores[cls_id], online_ids[cls_id])) timer.toc() save_vis_results(data, frame_id, online_ids, online_tlwhs, online_scores, timer.average_time, show_image, save_dir, self.cfg.num_classes, self.ids2names) frame_id += 1 return results, frame_id, timer.average_time, timer.calls def _eval_seq_sde(self, dataloader, save_dir=None, show_image=False, frame_rate=30, seq_name='', scaled=False, det_file='', draw_threshold=0): if save_dir: if not os.path.exists(save_dir): os.makedirs(save_dir) use_detector = False if not self.model.detector else True use_reid = hasattr(self.model, 'reid') if use_reid and self.model.reid is not None: use_reid = True else: use_reid = False timer = MOTTimer() results = defaultdict(list) frame_id = 0 self.status['mode'] = 'track' self.model.eval() if use_reid: self.model.reid.eval() if not use_detector: dets_list = load_det_results(det_file, len(dataloader)) logger.info('Finish loading detection results file {}.'.format( det_file)) 
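            # For reference, based on how load_det_results parses det_file:
            # the file is plain comma-separated text, one detection per line,
            # seven fields per row:
            #
            #     frame_id, x0, y0, w, h, score, class_id
            #     e.g. 1,503.5,158.9,55.1,166.3,0.97,0
            #
            # dets_list[frame_id]['bbox'] therefore holds tlwh boxes, which is
            # why they are converted to xyxy corners in the loop below.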
        tracker = self.model.tracker
        for step_id, data in enumerate(tqdm(dataloader)):
            self.status['step_id'] = step_id
            ori_image = data['ori_image']  # [bs, H, W, 3]
            ori_image_shape = data['ori_image'].shape[1:3]
            # ori_image_shape: [H, W]
            input_shape = data['image'].shape[2:]
            # input_shape: [h, w], before data transforms, set in model config
            im_shape = data['im_shape'][0].numpy()
            # im_shape: [new_h, new_w], after data transforms
            scale_factor = data['scale_factor'][0].numpy()
            empty_detections = False
            # when there are no detected bboxes, the ReID model will not be
            # run, and the original image is used for visualization instead

            # forward
            timer.tic()
            if not use_detector:
                dets = dets_list[frame_id]
                bbox_tlwh = np.array(dets['bbox'], dtype='float32')
                if bbox_tlwh.shape[0] > 0:
                    # detector outputs: pred_cls_ids, pred_scores, pred_bboxes
                    pred_cls_ids = np.array(dets['cls_id'], dtype='float32')
                    pred_scores = np.array(dets['score'], dtype='float32')
                    pred_bboxes = np.concatenate(
                        (bbox_tlwh[:, 0:2],
                         bbox_tlwh[:, 2:4] + bbox_tlwh[:, 0:2]),
                        axis=1)
                else:
                    logger.warning(
                        'Frame {} has no detected object, try to modify score threshold.'.
                        format(frame_id))
                    empty_detections = True
            else:
                outs = self.model.detector(data)
                outs['bbox'] = outs['bbox'].numpy()
                outs['bbox_num'] = outs['bbox_num'].numpy()

                if len(outs['bbox']) > 0 and not empty_detections:
                    # detector outputs: pred_cls_ids, pred_scores, pred_bboxes
                    pred_cls_ids = outs['bbox'][:, 0:1]
                    pred_scores = outs['bbox'][:, 1:2]
                    if not scaled:
                        # Note: scaled=False only in JDE YOLOv3 or other detectors
                        # with LetterBoxResize and JDEBBoxPostProcess.
                        #
                        # 'scaled' means whether the coords after detector outputs
                        # have been scaled back to the original image, set True
                        # in general detector, set False in JDE YOLOv3.
                        pred_bboxes = scale_coords(outs['bbox'][:, 2:],
                                                   input_shape, im_shape,
                                                   scale_factor)
                    else:
                        pred_bboxes = outs['bbox'][:, 2:]
                    pred_dets_old = np.concatenate(
                        (pred_cls_ids, pred_scores, pred_bboxes), axis=1)
                else:
                    logger.warning(
                        'Frame {} has no detected object, try to modify score threshold.'.
                        format(frame_id))
                    empty_detections = True

            if not empty_detections:
                pred_xyxys, keep_idx = clip_box(pred_bboxes, ori_image_shape)
                if len(keep_idx[0]) == 0:
                    logger.warning(
                        'Frame {} has no detected object left after clip_box.'.
format(frame_id)) empty_detections = True if empty_detections: timer.toc() # if visualize, use original image instead online_ids, online_tlwhs, online_scores = None, None, None save_vis_results(data, frame_id, online_ids, online_tlwhs, online_scores, timer.average_time, show_image, save_dir, self.cfg.num_classes, self.ids2names) frame_id += 1 # thus will not inference reid model continue pred_cls_ids = pred_cls_ids[keep_idx[0]] pred_scores = pred_scores[keep_idx[0]] pred_dets = np.concatenate( (pred_cls_ids, pred_scores, pred_xyxys), axis=1) if use_reid: crops = get_crops( pred_xyxys, ori_image, w=tracker.input_size[0], h=tracker.input_size[1]) crops = paddle.to_tensor(crops) data.update({'crops': crops}) pred_embs = self.model(data)['embeddings'].numpy() else: pred_embs = None if isinstance(tracker, DeepSORTTracker): online_tlwhs, online_scores, online_ids = [], [], [] tracker.predict() online_targets = tracker.update(pred_dets, pred_embs) for t in online_targets: if not t.is_confirmed() or t.time_since_update > 1: continue tlwh = t.to_tlwh() tscore = t.score tid = t.track_id if tscore < draw_threshold: continue if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[ 3] > tracker.vertical_ratio: continue online_tlwhs.append(tlwh) online_scores.append(tscore) online_ids.append(tid) timer.toc() # save results results[0].append( (frame_id + 1, online_tlwhs, online_scores, online_ids)) save_vis_results(data, frame_id, online_ids, online_tlwhs, online_scores, timer.average_time, show_image, save_dir, self.cfg.num_classes, self.ids2names) elif isinstance(tracker, JDETracker): # trick hyperparams only used for MOTChallenge (MOT17, MOT20) Test-set tracker.track_buffer, tracker.conf_thres = get_trick_hyperparams( seq_name, tracker.track_buffer, tracker.conf_thres) online_targets_dict = tracker.update(pred_dets_old, pred_embs) online_tlwhs = defaultdict(list) online_scores = defaultdict(list) online_ids = defaultdict(list) for cls_id in range(self.cfg.num_classes): online_targets = online_targets_dict[cls_id] for t in online_targets: tlwh = t.tlwh tid = t.track_id tscore = t.score if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[ 3] > tracker.vertical_ratio: continue online_tlwhs[cls_id].append(tlwh) online_ids[cls_id].append(tid) online_scores[cls_id].append(tscore) # save results results[cls_id].append( (frame_id + 1, online_tlwhs[cls_id], online_scores[cls_id], online_ids[cls_id])) timer.toc() save_vis_results(data, frame_id, online_ids, online_tlwhs, online_scores, timer.average_time, show_image, save_dir, self.cfg.num_classes, self.ids2names) elif isinstance(tracker, OCSORTTracker): # OC_SORT Tracker online_targets = tracker.update(pred_dets_old, pred_embs) online_tlwhs = [] online_ids = [] online_scores = [] for t in online_targets: tlwh = [t[0], t[1], t[2] - t[0], t[3] - t[1]] tscore = float(t[4]) tid = int(t[5]) if tlwh[2] * tlwh[3] > 0: online_tlwhs.append(tlwh) online_ids.append(tid) online_scores.append(tscore) timer.toc() # save results results[0].append( (frame_id + 1, online_tlwhs, online_scores, online_ids)) save_vis_results(data, frame_id, online_ids, online_tlwhs, online_scores, timer.average_time, show_image, save_dir, self.cfg.num_classes, self.ids2names) elif isinstance(tracker, BOTSORTTracker): # BOTSORT Tracker online_targets = tracker.update( pred_dets_old, img=ori_image.numpy()) online_tlwhs = [] online_ids = [] online_scores = [] for t in online_targets: tlwh = t.tlwh tid 
= t.track_id tscore = t.score if tlwh[2] * tlwh[3] > 0: online_tlwhs.append(tlwh) online_ids.append(tid) online_scores.append(tscore) timer.toc() # save results results[0].append( (frame_id + 1, online_tlwhs, online_scores, online_ids)) save_vis_results(data, frame_id, online_ids, online_tlwhs, online_scores, timer.average_time, show_image, save_dir, self.cfg.num_classes, self.ids2names) else: raise ValueError(tracker) frame_id += 1 return results, frame_id, timer.average_time, timer.calls def mot_evaluate(self, data_root, seqs, output_dir, data_type='mot', model_type='JDE', save_images=False, save_videos=False, show_image=False, scaled=False, det_results_dir=''): if not os.path.exists(output_dir): os.makedirs(output_dir) result_root = os.path.join(output_dir, 'mot_results') if not os.path.exists(result_root): os.makedirs(result_root) assert data_type in MOT_DATA_TYPE, \ "data_type should be 'mot', 'mcmot' or 'kitti'" assert model_type in MOT_ARCH, \ "model_type should be 'JDE', 'DeepSORT', 'FairMOT' or 'ByteTrack'" # run tracking n_frame = 0 timer_avgs, timer_calls = [], [] for seq in seqs: infer_dir = os.path.join(data_root, seq) if not os.path.exists(infer_dir) or not os.path.isdir(infer_dir): logger.warning("Seq {} error, {} has no images.".format( seq, infer_dir)) continue if os.path.exists(os.path.join(infer_dir, 'img1')): infer_dir = os.path.join(infer_dir, 'img1') frame_rate = 30 seqinfo = os.path.join(data_root, seq, 'seqinfo.ini') if os.path.exists(seqinfo): meta_info = open(seqinfo).read() frame_rate = int(meta_info[meta_info.find('frameRate') + 10: meta_info.find('\nseqLength')]) save_dir = os.path.join(output_dir, 'mot_outputs', seq) if save_images or save_videos else None logger.info('Evaluate seq: {}'.format(seq)) self.dataset.set_images(self.get_infer_images(infer_dir)) dataloader = create('EvalMOTReader')(self.dataset, 0) result_filename = os.path.join(result_root, '{}.txt'.format(seq)) with paddle.no_grad(): if model_type in MOT_ARCH_JDE: results, nf, ta, tc = self._eval_seq_jde( dataloader, save_dir=save_dir, show_image=show_image, frame_rate=frame_rate) elif model_type in MOT_ARCH_SDE: results, nf, ta, tc = self._eval_seq_sde( dataloader, save_dir=save_dir, show_image=show_image, frame_rate=frame_rate, seq_name=seq, scaled=scaled, det_file=os.path.join(det_results_dir, '{}.txt'.format(seq))) elif model_type == 'CenterTrack': results, nf, ta, tc = self._eval_seq_centertrack( dataloader, save_dir=save_dir, show_image=show_image, frame_rate=frame_rate) else: raise ValueError(model_type) write_mot_results(result_filename, results, data_type, self.cfg.num_classes) n_frame += nf timer_avgs.append(ta) timer_calls.append(tc) if save_videos: output_video_path = os.path.join(save_dir, '..', '{}_vis.mp4'.format(seq)) cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg {}'.format( save_dir, output_video_path) os.system(cmd_str) logger.info('Save video in {}.'.format(output_video_path)) # update metrics for metric in self._metrics: metric.update(data_root, seq, data_type, result_root, result_filename) timer_avgs = np.asarray(timer_avgs) timer_calls = np.asarray(timer_calls) all_time = np.dot(timer_avgs, timer_calls) avg_time = all_time / np.sum(timer_calls) logger.info('Time elapsed: {:.2f} seconds, FPS: {:.2f}'.format( all_time, 1.0 / avg_time)) # accumulate metric to log out for metric in self._metrics: metric.accumulate() metric.log() # reset metric states for metric may performed multiple times self._reset_metrics() def get_infer_images(self, infer_dir): assert infer_dir is None or 
os.path.isdir(infer_dir), \ "{} is not a directory".format(infer_dir) images = set() assert os.path.isdir(infer_dir), \ "infer_dir {} is not a directory".format(infer_dir) exts = ['jpg', 'jpeg', 'png', 'bmp'] exts += [ext.upper() for ext in exts] for ext in exts: images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) images = list(images) images.sort() assert len(images) > 0, "no image found in {}".format(infer_dir) logger.info("Found {} inference images in total.".format(len(images))) return images def mot_predict_seq(self, video_file, frame_rate, image_dir, output_dir, data_type='mot', model_type='JDE', save_images=False, save_videos=True, show_image=False, scaled=False, det_results_dir='', draw_threshold=0.5): assert video_file is not None or image_dir is not None, \ "--video_file or --image_dir should be set." assert video_file is None or os.path.isfile(video_file), \ "{} is not a file".format(video_file) assert image_dir is None or os.path.isdir(image_dir), \ "{} is not a directory".format(image_dir) if not os.path.exists(output_dir): os.makedirs(output_dir) result_root = os.path.join(output_dir, 'mot_results') if not os.path.exists(result_root): os.makedirs(result_root) assert data_type in MOT_DATA_TYPE, \ "data_type should be 'mot', 'mcmot' or 'kitti'" assert model_type in MOT_ARCH, \ "model_type should be 'JDE', 'DeepSORT', 'FairMOT' or 'ByteTrack'" # run tracking if video_file: seq = video_file.split('/')[-1].split('.')[0] self.dataset.set_video(video_file, frame_rate) logger.info('Starting tracking video {}'.format(video_file)) elif image_dir: seq = image_dir.split('/')[-1].split('.')[0] if os.path.exists(os.path.join(image_dir, 'img1')): image_dir = os.path.join(image_dir, 'img1') images = [ '{}/{}'.format(image_dir, x) for x in os.listdir(image_dir) ] images.sort() self.dataset.set_images(images) logger.info('Starting tracking folder {}, found {} images'.format( image_dir, len(images))) else: raise ValueError('--video_file or --image_dir should be set.') save_dir = os.path.join(output_dir, 'mot_outputs', seq) if save_images or save_videos else None dataloader = create('TestMOTReader')(self.dataset, 0) result_filename = os.path.join(result_root, '{}.txt'.format(seq)) if frame_rate == -1: frame_rate = self.dataset.frame_rate with paddle.no_grad(): if model_type in MOT_ARCH_JDE: results, nf, ta, tc = self._eval_seq_jde( dataloader, save_dir=save_dir, show_image=show_image, frame_rate=frame_rate, draw_threshold=draw_threshold) elif model_type in MOT_ARCH_SDE: results, nf, ta, tc = self._eval_seq_sde( dataloader, save_dir=save_dir, show_image=show_image, frame_rate=frame_rate, seq_name=seq, scaled=scaled, det_file=os.path.join(det_results_dir, '{}.txt'.format(seq)), draw_threshold=draw_threshold) elif model_type == 'CenterTrack': results, nf, ta, tc = self._eval_seq_centertrack( dataloader, save_dir=save_dir, show_image=show_image, frame_rate=frame_rate) else: raise ValueError(model_type) if save_videos: output_video_path = os.path.join(save_dir, '..', '{}_vis.mp4'.format(seq)) cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg {}'.format( save_dir, output_video_path) os.system(cmd_str) logger.info('Save video in {}'.format(output_video_path)) write_mot_results(result_filename, results, data_type, self.cfg.num_classes) def get_trick_hyperparams(video_name, ori_buffer, ori_thresh): if video_name[:3] != 'MOT': # only used for MOTChallenge (MOT17, MOT20) Test-set return ori_buffer, ori_thresh video_name = video_name[:8] if 'MOT17-05' in video_name: track_buffer = 14 elif 'MOT17-13' in 
video_name:
        track_buffer = 25
    else:
        track_buffer = ori_buffer

    if 'MOT17-01' in video_name:
        track_thresh = 0.65
    elif 'MOT17-06' in video_name:
        track_thresh = 0.65
    elif 'MOT17-12' in video_name:
        track_thresh = 0.7
    elif 'MOT17-14' in video_name:
        track_thresh = 0.67
    else:
        track_thresh = ori_thresh

    # MOT20-06 and MOT20-08 use a lower threshold; do not reset track_thresh
    # for the other sequences here, or the MOT17 settings above are lost.
    if 'MOT20-06' in video_name or 'MOT20-08' in video_name:
        track_thresh = 0.3

    return track_buffer, track_thresh


================================================
FILE: ppdet/engine/trainer.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import copy
import time
import yaml
from tqdm import tqdm

import numpy as np
import typing
from PIL import Image, ImageOps, ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True

import paddle
import paddle.nn as nn
import paddle.distributed as dist
from paddle.distributed import fleet
from paddle.static import InputSpec
from ppdet.optimizer import ModelEMA

from ppdet.core.workspace import create
from ppdet.utils.checkpoint import load_weight, load_pretrain_weight, convert_to_dict
from ppdet.utils.visualizer import visualize_results, save_result
from ppdet.metrics import get_infer_results, KeyPointTopDownCOCOEval, KeyPointTopDownCOCOWholeBadyHandEval, KeyPointTopDownMPIIEval, Pose3DEval
from ppdet.metrics import Metric, COCOMetric, LVISMetric, VOCMetric, WiderFaceMetric, RBoxMetric, JDEDetMetric, SNIPERCOCOMetric, CULaneMetric
from ppdet.data.source.sniper_coco import SniperCOCODataSet
from ppdet.data.source.category import get_categories
import ppdet.utils.stats as stats
from ppdet.utils.fuse_utils import fuse_conv_bn
from ppdet.utils import profiler
from ppdet.modeling.post_process import multiclass_nms
from ppdet.modeling.lane_utils import imshow_lanes

from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, WiferFaceEval, VisualDLWriter, SniperProposalsGenerator, WandbCallback, SemiCheckpointer, SemiLogPrinter
from .export_utils import _dump_infer_config, _prune_input_spec, apply_to_static
from .naive_sync_bn import convert_syncbn

from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients

from ppdet.utils.logger import setup_logger
logger = setup_logger('ppdet.engine')

__all__ = ['Trainer']

MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']


class Trainer(object):
    def __init__(self, cfg, mode='train'):
        self.cfg = cfg.copy()
        assert mode.lower() in ['train', 'eval', 'test'], \
            "mode should be 'train', 'eval' or 'test'"
        self.mode = mode.lower()
        self.optimizer = None
        self.is_loaded_weights = False
        self.use_amp = self.cfg.get('amp', False)
        self.amp_level = self.cfg.get('amp_level', 'O1')
        self.custom_white_list = self.cfg.get('custom_white_list', None)
        self.custom_black_list = self.cfg.get('custom_black_list', None)
        self.use_master_grad =
self.cfg.get('master_grad', False) self.uniform_output_enabled = self.cfg.get('uniform_output_enabled', False) if ('slim' in cfg and cfg['slim_type'] == 'PTQ') or self.uniform_output_enabled: self.cfg['TestDataset'] = create('TestDataset')() log_ranks = cfg.get('log_ranks', '0') if isinstance(log_ranks, str): self.log_ranks = [int(i) for i in log_ranks.split(',')] elif isinstance(log_ranks, int): self.log_ranks = [log_ranks] train_results_path = os.path.abspath(os.path.join(self.cfg.save_dir, "train_result.json")) if self.uniform_output_enabled: if os.path.exists(train_results_path) and self.mode == 'train': try: os.remove(train_results_path) except: pass if not os.path.exists(self.cfg.save_dir): os.mkdir(self.cfg.save_dir) with open(os.path.join(self.cfg.save_dir, "config.yaml"), "w") as f: config_dict = convert_to_dict(self.cfg) config_dict = {k: v for k, v in config_dict.items() if v != {}} yaml.dump(config_dict, f) # build data loader capital_mode = self.mode.capitalize() if cfg.architecture in MOT_ARCH and self.mode in [ 'eval', 'test' ] and cfg.metric not in ['COCO', 'VOC']: self.dataset = self.cfg['{}MOTDataset'.format( capital_mode)] = create('{}MOTDataset'.format(capital_mode))() else: self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( '{}Dataset'.format(capital_mode))() if cfg.architecture == 'DeepSORT' and self.mode == 'train': logger.error('DeepSORT has no need of training on mot dataset.') sys.exit(1) if cfg.architecture == 'FairMOT' and self.mode == 'eval': images = self.parse_mot_images(cfg) self.dataset.set_images(images) if self.mode == 'train': self.loader = create('{}Reader'.format(capital_mode))( self.dataset, cfg.worker_num) if cfg.architecture == 'JDE' and self.mode == 'train': self.cfg['JDEEmbeddingHead'][ 'num_identities'] = self.dataset.num_identities_dict[0] # JDE only support single class MOT now. if cfg.architecture == 'FairMOT' and self.mode == 'train': self.cfg['FairMOTEmbeddingHead'][ 'num_identities_dict'] = self.dataset.num_identities_dict # FairMOT support single class and multi-class MOT now. 
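        # For reference: `log_ranks` selects which distributed ranks print
        # training logs. It accepts either an int or a comma-separated
        # string (the values below are illustrative, not defaults):
        #
        #     log_ranks: 0      ->  self.log_ranks == [0]     only rank 0 logs
        #     log_ranks: '0,4'  ->  self.log_ranks == [0, 4]  one rank per
        #                                                     node logs
        #
        # train() later checks `self._local_rank in self.log_ranks` before
        # updating the smoothed training status.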
# build model if 'model' not in self.cfg: self.model = create(cfg.architecture) else: self.model = self.cfg.model self.is_loaded_weights = True if cfg.architecture == 'YOLOX': for k, m in self.model.named_sublayers(): if isinstance(m, nn.BatchNorm2D): m._epsilon = 1e-3 # for amp(fp16) m._momentum = 0.97 # 0.03 in pytorch # reset norm param attr for setting them in optimizer if 'reset_norm_param_attr' in cfg and cfg['reset_norm_param_attr']: self.model = self.reset_norm_param_attr( self.model, weight_attr=None, bias_attr=None) # normalize params for deploy if 'slim' in cfg and cfg['slim_type'] == 'OFA': self.model.model.load_meanstd(cfg['TestReader'][ 'sample_transforms']) elif 'slim' in cfg and cfg['slim_type'] == 'Distill': self.model.student_model.load_meanstd(cfg['TestReader'][ 'sample_transforms']) elif 'slim' in cfg and cfg[ 'slim_type'] == 'DistillPrune' and self.mode == 'train': self.model.student_model.load_meanstd(cfg['TestReader'][ 'sample_transforms']) else: self.model.load_meanstd(cfg['TestReader']['sample_transforms']) # EvalDataset build with BatchSampler to evaluate in single device # TODO: multi-device evaluate if self.mode == 'eval': if cfg.architecture == 'FairMOT': self.loader = create('EvalMOTReader')(self.dataset, 0) elif cfg.architecture == "METRO_Body": reader_name = '{}Reader'.format(self.mode.capitalize()) self.loader = create(reader_name)(self.dataset, cfg.worker_num) else: self._eval_batch_sampler = paddle.io.BatchSampler( self.dataset, batch_size=self.cfg.EvalReader['batch_size']) reader_name = '{}Reader'.format(self.mode.capitalize()) # If metric is VOC, need to be set collate_batch=False. if cfg.metric == 'VOC': self.cfg[reader_name]['collate_batch'] = False self.loader = create(reader_name)(self.dataset, cfg.worker_num, self._eval_batch_sampler) # TestDataset build after user set images, skip loader creation here # get Params print_params = self.cfg.get('print_params', False) if print_params: params = sum([ p.numel() for n, p in self.model.named_parameters() if all([x not in n for x in ['_mean', '_variance', 'aux_']]) ]) # exclude BatchNorm running status logger.info('Model Params : {} M.'.format((params / 1e6).numpy()[ 0])) # build optimizer in train mode if self.mode == 'train': steps_per_epoch = len(self.loader) if steps_per_epoch < 1: logger.warning( "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader." ) self.lr = create('LearningRate')(steps_per_epoch) self.optimizer = create('OptimizerBuilder')(self.lr, self.model) # Unstructured pruner is only enabled in the train mode. 
if self.cfg.get('unstructured_prune'): self.pruner = create('UnstructuredPruner')(self.model, steps_per_epoch) if self.use_amp and self.amp_level == 'O2': paddle_version = paddle.__version__[:3] # paddle version >= 2.5.0 or develop if paddle_version in ["2.5", "0.0"]: self.model, self.optimizer = paddle.amp.decorate( models=self.model, optimizers=self.optimizer, level=self.amp_level, master_grad=self.use_master_grad) else: self.model, self.optimizer = paddle.amp.decorate( models=self.model, optimizers=self.optimizer, level=self.amp_level) # support sync_bn for npu/xpu if (paddle.get_device()[:3]=='npu' or paddle.get_device()[:3]=='xpu'): use_npu = ('use_npu' in cfg and cfg['use_npu']) use_xpu = ('use_xpu' in cfg and cfg['use_xpu']) use_mlu = ('use_mlu' in cfg and cfg['use_mlu']) norm_type = ('norm_type' in cfg and cfg['norm_type']) if norm_type == 'sync_bn' and (use_npu or use_xpu or use_mlu) and dist.get_world_size() > 1: convert_syncbn(self.model) self.use_ema = ('use_ema' in cfg and cfg['use_ema']) if self.use_ema: ema_decay = self.cfg.get('ema_decay', 0.9998) ema_decay_type = self.cfg.get('ema_decay_type', 'threshold') cycle_epoch = self.cfg.get('cycle_epoch', -1) ema_black_list = self.cfg.get('ema_black_list', None) ema_filter_no_grad = self.cfg.get('ema_filter_no_grad', False) self.ema = ModelEMA( self.model, decay=ema_decay, ema_decay_type=ema_decay_type, cycle_epoch=cycle_epoch, ema_black_list=ema_black_list, ema_filter_no_grad=ema_filter_no_grad) self._nranks = dist.get_world_size() self._local_rank = dist.get_rank() self.status = {} self.start_epoch = 0 self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch # initial default callbacks self._init_callbacks() # initial default metrics self._init_metrics() self._reset_metrics() def _init_callbacks(self): if self.mode == 'train': if self.cfg.get('ssod_method', False) and self.cfg['ssod_method'] == 'Semi_RTDETR': self._callbacks = [SemiLogPrinter(self), SemiCheckpointer(self)] else: self._callbacks = [LogPrinter(self), Checkpointer(self)] if self.cfg.get('use_vdl', False): self._callbacks.append(VisualDLWriter(self)) if self.cfg.get('save_proposals', False): self._callbacks.append(SniperProposalsGenerator(self)) if self.cfg.get('use_wandb', False) or 'wandb' in self.cfg: self._callbacks.append(WandbCallback(self)) self._compose_callback = ComposeCallback(self._callbacks) elif self.mode == 'eval': self._callbacks = [LogPrinter(self)] # if self.cfg.metric == 'WiderFace': # self._callbacks.append(WiferFaceEval(self)) self._compose_callback = ComposeCallback(self._callbacks) elif self.mode == 'test' and self.cfg.get('use_vdl', False): self._callbacks = [VisualDLWriter(self)] self._compose_callback = ComposeCallback(self._callbacks) else: self._callbacks = [] self._compose_callback = None def _init_metrics(self, validate=False): if self.mode == 'test' or (self.mode == 'train' and not validate): self._metrics = [] return classwise = self.cfg['classwise'] if 'classwise' in self.cfg else False if self.cfg.metric == 'COCO' or self.cfg.metric == "SNIPERCOCO" or self.cfg.metric == 'LVIS': # TODO: bias should be unified bias = 1 if self.cfg.get('bias', False) else 0 output_eval = self.cfg['output_eval'] \ if 'output_eval' in self.cfg else None save_prediction_only = self.cfg.get('save_prediction_only', False) # pass clsid2catid info to metric instance to avoid multiple loading # annotation file clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} \ if self.mode == 'eval' else None save_threshold = self.cfg.get('save_threshold', 0) # 
when do validation in train, annotation file should be get from # EvalReader instead of self.dataset(which is TrainReader) if self.mode == 'train' and validate: eval_dataset = self.cfg['EvalDataset'] eval_dataset.check_or_download_dataset() anno_file = eval_dataset.get_anno() dataset = eval_dataset else: dataset = self.dataset anno_file = dataset.get_anno() IouType = self.cfg['IouType'] if 'IouType' in self.cfg else 'bbox' if self.cfg.metric == "COCO": self._metrics = [ COCOMetric( anno_file=anno_file, clsid2catid=clsid2catid, classwise=classwise, output_eval=output_eval, bias=bias, IouType=IouType, save_prediction_only=save_prediction_only, save_threshold=save_threshold) ] elif self.cfg.metric == "LVIS": self._metrics = [ LVISMetric( anno_file=anno_file, clsid2catid=clsid2catid, classwise=classwise, output_eval=output_eval, bias=bias, IouType=IouType, save_prediction_only=save_prediction_only) ] elif self.cfg.metric == "SNIPERCOCO": # sniper self._metrics = [ SNIPERCOCOMetric( anno_file=anno_file, dataset=dataset, clsid2catid=clsid2catid, classwise=classwise, output_eval=output_eval, bias=bias, IouType=IouType, save_prediction_only=save_prediction_only) ] elif self.cfg.metric == 'RBOX': # TODO: bias should be unified bias = self.cfg['bias'] if 'bias' in self.cfg else 0 output_eval = self.cfg['output_eval'] \ if 'output_eval' in self.cfg else None save_prediction_only = self.cfg.get('save_prediction_only', False) imid2path = self.cfg.get('imid2path', None) # when do validation in train, annotation file should be get from # EvalReader instead of self.dataset(which is TrainReader) anno_file = self.dataset.get_anno() if self.mode == 'train' and validate: eval_dataset = self.cfg['EvalDataset'] eval_dataset.check_or_download_dataset() anno_file = eval_dataset.get_anno() self._metrics = [ RBoxMetric( anno_file=anno_file, classwise=classwise, output_eval=output_eval, bias=bias, save_prediction_only=save_prediction_only, imid2path=imid2path) ] elif self.cfg.metric == 'VOC': output_eval = self.cfg['output_eval'] \ if 'output_eval' in self.cfg else None save_prediction_only = self.cfg.get('save_prediction_only', False) self._metrics = [ VOCMetric( label_list=self.dataset.get_label_list(), class_num=self.cfg.num_classes, map_type=self.cfg.map_type, classwise=classwise, output_eval=output_eval, save_prediction_only=save_prediction_only) ] elif self.cfg.metric == 'WiderFace': self._metrics = [ WiderFaceMetric() ] elif self.cfg.metric == 'KeyPointTopDownCOCOEval': eval_dataset = self.cfg['EvalDataset'] eval_dataset.check_or_download_dataset() anno_file = eval_dataset.get_anno() save_prediction_only = self.cfg.get('save_prediction_only', False) self._metrics = [ KeyPointTopDownCOCOEval( anno_file, len(eval_dataset), self.cfg.num_joints, self.cfg.save_dir, save_prediction_only=save_prediction_only) ] elif self.cfg.metric == 'KeyPointTopDownCOCOWholeBadyHandEval': eval_dataset = self.cfg['EvalDataset'] eval_dataset.check_or_download_dataset() anno_file = eval_dataset.get_anno() save_prediction_only = self.cfg.get('save_prediction_only', False) self._metrics = [ KeyPointTopDownCOCOWholeBadyHandEval( anno_file, len(eval_dataset), self.cfg.num_joints, self.cfg.save_dir, save_prediction_only=save_prediction_only) ] elif self.cfg.metric == 'KeyPointTopDownMPIIEval': eval_dataset = self.cfg['EvalDataset'] eval_dataset.check_or_download_dataset() anno_file = eval_dataset.get_anno() save_prediction_only = self.cfg.get('save_prediction_only', False) self._metrics = [ KeyPointTopDownMPIIEval( anno_file, 
len(eval_dataset), self.cfg.num_joints, self.cfg.save_dir, save_prediction_only=save_prediction_only) ] elif self.cfg.metric == 'Pose3DEval': save_prediction_only = self.cfg.get('save_prediction_only', False) self._metrics = [ Pose3DEval( self.cfg.save_dir, save_prediction_only=save_prediction_only) ] elif self.cfg.metric == 'MOTDet': self._metrics = [JDEDetMetric(), ] elif self.cfg.metric == 'CULaneMetric': output_eval = self.cfg.get('output_eval', None) self._metrics = [ CULaneMetric( cfg=self.cfg, output_eval=output_eval, split=self.dataset.split, dataset_dir=self.cfg.dataset_dir) ] else: logger.warning("Metric not support for metric type {}".format( self.cfg.metric)) self._metrics = [] def _reset_metrics(self): for metric in self._metrics: metric.reset() def register_callbacks(self, callbacks): callbacks = [c for c in list(callbacks) if c is not None] for c in callbacks: assert isinstance(c, Callback), \ "metrics shoule be instances of subclass of Metric" self._callbacks.extend(callbacks) self._compose_callback = ComposeCallback(self._callbacks) def register_metrics(self, metrics): metrics = [m for m in list(metrics) if m is not None] for m in metrics: assert isinstance(m, Metric), \ "metrics shoule be instances of subclass of Metric" self._metrics.extend(metrics) def load_weights(self, weights, ARSL_eval=False): if self.is_loaded_weights: return self.start_epoch = 0 load_pretrain_weight(self.model, weights, ARSL_eval) logger.debug("Load weights {} to start training".format(weights)) def load_weights_sde(self, det_weights, reid_weights): if self.model.detector: load_weight(self.model.detector, det_weights) if self.model.reid: load_weight(self.model.reid, reid_weights) else: load_weight(self.model.reid, reid_weights) def resume_weights(self, weights): # support Distill resume weights if hasattr(self.model, 'student_model'): self.start_epoch = load_weight(self.model.student_model, weights, self.optimizer) else: self.start_epoch = load_weight(self.model, weights, self.optimizer, self.ema if self.use_ema else None) logger.debug("Resume weights of epoch {}".format(self.start_epoch)) def train(self, validate=False): assert self.mode == 'train', "Model not in 'train' mode" Init_mark = False if validate: self.cfg['EvalDataset'] = self.cfg.EvalDataset = create( "EvalDataset")() model = self.model if self.cfg.get('to_static', False): model = apply_to_static(self.cfg, model) sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and self.cfg.use_gpu and self._nranks > 1) if sync_bn: model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) # enabel auto mixed precision mode if self.use_amp: scaler = paddle.amp.GradScaler( enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu, init_loss_scaling=self.cfg.get('init_loss_scaling', 1024)) # get distributed model if self.cfg.get('fleet', False): model = fleet.distributed_model(model) self.optimizer = fleet.distributed_optimizer(self.optimizer) elif self._nranks > 1: find_unused_parameters = self.cfg[ 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False model = paddle.DataParallel( model, find_unused_parameters=find_unused_parameters) self.status.update({ 'epoch_id': self.start_epoch, 'step_id': 0, 'steps_per_epoch': len(self.loader) }) self.status['batch_time'] = stats.SmoothedValue( self.cfg.log_iter, fmt='{avg:.4f}') self.status['data_time'] = stats.SmoothedValue( self.cfg.log_iter, fmt='{avg:.4f}') self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) if self.cfg.get('print_flops', False): 
flops_loader = create('{}Reader'.format(self.mode.capitalize()))( self.dataset, self.cfg.worker_num) self._flops(flops_loader) profiler_options = self.cfg.get('profiler_options', None) self._compose_callback.on_train_begin(self.status) use_fused_allreduce_gradients = self.cfg[ 'use_fused_allreduce_gradients'] if 'use_fused_allreduce_gradients' in self.cfg else False for epoch_id in range(self.start_epoch, self.cfg.epoch): self.status['mode'] = 'train' self.status['epoch_id'] = epoch_id self._compose_callback.on_epoch_begin(self.status) self.loader.dataset.set_epoch(epoch_id) model.train() iter_tic = time.time() for step_id, data in enumerate(self.loader): def deep_pin(blob, blocking): if isinstance(blob, paddle.Tensor): return blob.cuda(blocking=blocking) elif isinstance(blob, dict): return {k: deep_pin(v, blocking) for k, v in blob.items()} elif isinstance(blob, (list, tuple)): return type(blob)([deep_pin(x, blocking) for x in blob]) else: return blob # if paddle.base.core.is_compiled_with_cuda(): # data = deep_pin(data, False) self.status['data_time'].update(time.time() - iter_tic) self.status['step_id'] = step_id profiler.add_profiler_step(profiler_options) self._compose_callback.on_step_begin(self.status) data['epoch_id'] = epoch_id if self.cfg.get('to_static', False) and 'image_file' in data.keys(): data.pop('image_file') if self.use_amp: if isinstance( model, paddle. DataParallel) and use_fused_allreduce_gradients: with model.no_sync(): with paddle.amp.auto_cast( enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu, custom_white_list=self.custom_white_list, custom_black_list=self.custom_black_list, level=self.amp_level): # model forward outputs = model(data) loss = outputs['loss'] # model backward scaled_loss = scaler.scale(loss) scaled_loss.backward() fused_allreduce_gradients( list(model.parameters()), None) else: with paddle.amp.auto_cast( enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu, custom_white_list=self.custom_white_list, custom_black_list=self.custom_black_list, level=self.amp_level): # model forward outputs = model(data) loss = outputs['loss'] # model backward scaled_loss = scaler.scale(loss) scaled_loss.backward() # in dygraph mode, optimizer.minimize is equal to optimizer.step scaler.minimize(self.optimizer, scaled_loss) else: if isinstance( model, paddle. 
DataParallel) and use_fused_allreduce_gradients: with model.no_sync(): # model forward outputs = model(data) loss = outputs['loss'] # model backward loss.backward() fused_allreduce_gradients( list(model.parameters()), None) else: # model forward outputs = model(data) loss = outputs['loss'] # model backward loss.backward() self.optimizer.step() curr_lr = self.optimizer.get_lr() self.lr.step() if self.cfg.get('unstructured_prune'): self.pruner.step() self.optimizer.clear_grad() self.status['learning_rate'] = curr_lr if self._nranks < 2 or self._local_rank in self.log_ranks: self.status['training_staus'].update(outputs) self.status['batch_time'].update(time.time() - iter_tic) self._compose_callback.on_step_end(self.status) if self.use_ema: self.ema.update() iter_tic = time.time() if self.cfg.get('unstructured_prune'): self.pruner.update_params() is_snapshot = (self._nranks < 2 or (self._local_rank == 0 or self.cfg.metric == "Pose3DEval")) \ and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1) if is_snapshot and self.use_ema: # apply ema weight on model weight = copy.deepcopy(self.model.state_dict()) self.model.set_dict(self.ema.apply()) self.status['weight'] = weight self._compose_callback.on_epoch_end(self.status) if validate and is_snapshot: if not hasattr(self, '_eval_loader'): # build evaluation dataset and loader self._eval_dataset = self.cfg.EvalDataset self._eval_batch_sampler = \ paddle.io.BatchSampler( self._eval_dataset, batch_size=self.cfg.EvalReader['batch_size']) # If metric is VOC, need to be set collate_batch=False. if self.cfg.metric == 'VOC': self.cfg['EvalReader']['collate_batch'] = False if self.cfg.metric == "Pose3DEval": self._eval_loader = create('EvalReader')( self._eval_dataset, self.cfg.worker_num) else: self._eval_loader = create('EvalReader')( self._eval_dataset, self.cfg.worker_num, batch_sampler=self._eval_batch_sampler) # if validation in training is enabled, metrics should be re-init # Init_mark makes sure this code will only execute once if validate and Init_mark == False: Init_mark = True self._init_metrics(validate=validate) self._reset_metrics() with paddle.no_grad(): self.status['save_best_model'] = True self._eval_with_loader(self._eval_loader) if is_snapshot and self.use_ema: # reset original weight self.model.set_dict(weight) self.status.pop('weight') self._compose_callback.on_train_end(self.status) def _eval_with_loader(self, loader): sample_num = 0 tic = time.time() self._compose_callback.on_epoch_begin(self.status) self.status['mode'] = 'eval' self.model.eval() if self.cfg.get('print_flops', False): flops_loader = create('{}Reader'.format(self.mode.capitalize()))( self.dataset, self.cfg.worker_num, self._eval_batch_sampler) self._flops(flops_loader) for step_id, data in enumerate(loader): self.status['step_id'] = step_id self._compose_callback.on_step_begin(self.status) # forward if self.use_amp: with paddle.amp.auto_cast( enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu, custom_white_list=self.custom_white_list, custom_black_list=self.custom_black_list, level=self.amp_level): outs = self.model(data) else: outs = self.model(data) # update metrics for metric in self._metrics: metric.update(data, outs) # multi-scale inputs: all inputs have same im_id if isinstance(data, typing.Sequence): sample_num += data[0]['im_id'].numpy().shape[0] else: sample_num += data['im_id'].numpy().shape[0] self._compose_callback.on_step_end(self.status) self.status['sample_num'] = sample_num self.status['cost_time'] = 
time.time() - tic # accumulate metric to log out for metric in self._metrics: metric.accumulate() metric.log() self._compose_callback.on_epoch_end(self.status) # reset metric states for metric may performed multiple times self._reset_metrics() def evaluate(self): # get distributed model if self.cfg.get('fleet', False): self.model = fleet.distributed_model(self.model) self.optimizer = fleet.distributed_optimizer(self.optimizer) elif self._nranks > 1: find_unused_parameters = self.cfg[ 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False self.model = paddle.DataParallel( self.model, find_unused_parameters=find_unused_parameters) with paddle.no_grad(): self._eval_with_loader(self.loader) def _eval_with_loader_slice(self, loader, slice_size=[640, 640], overlap_ratio=[0.25, 0.25], combine_method='nms', match_threshold=0.6, match_metric='iou'): sample_num = 0 tic = time.time() self._compose_callback.on_epoch_begin(self.status) self.status['mode'] = 'eval' self.model.eval() if self.cfg.get('print_flops', False): flops_loader = create('{}Reader'.format(self.mode.capitalize()))( self.dataset, self.cfg.worker_num, self._eval_batch_sampler) self._flops(flops_loader) merged_bboxs = [] for step_id, data in enumerate(loader): self.status['step_id'] = step_id self._compose_callback.on_step_begin(self.status) # forward if self.use_amp: with paddle.amp.auto_cast( enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu, custom_white_list=self.custom_white_list, custom_black_list=self.custom_black_list, level=self.amp_level): outs = self.model(data) else: outs = self.model(data) shift_amount = data['st_pix'] outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount merged_bboxs.append(outs['bbox']) if data['is_last'] > 0: # merge matching predictions merged_results = {'bbox': []} if combine_method == 'nms': final_boxes = multiclass_nms( np.concatenate(merged_bboxs), self.cfg.num_classes, match_threshold, match_metric) merged_results['bbox'] = np.concatenate(final_boxes) elif combine_method == 'concat': merged_results['bbox'] = np.concatenate(merged_bboxs) else: raise ValueError( "Now only support 'nms' or 'concat' to fuse detection results." 
) merged_results['im_id'] = np.array([[0]]) merged_results['bbox_num'] = np.array( [len(merged_results['bbox'])]) merged_bboxs = [] data['im_id'] = data['ori_im_id'] # update metrics for metric in self._metrics: metric.update(data, merged_results) # multi-scale inputs: all inputs have same im_id if isinstance(data, typing.Sequence): sample_num += data[0]['im_id'].numpy().shape[0] else: sample_num += data['im_id'].numpy().shape[0] self._compose_callback.on_step_end(self.status) self.status['sample_num'] = sample_num self.status['cost_time'] = time.time() - tic # accumulate metric to log out for metric in self._metrics: metric.accumulate() metric.log() self._compose_callback.on_epoch_end(self.status) # reset metric states for metric may performed multiple times self._reset_metrics() def evaluate_slice(self, slice_size=[640, 640], overlap_ratio=[0.25, 0.25], combine_method='nms', match_threshold=0.6, match_metric='iou'): with paddle.no_grad(): self._eval_with_loader_slice(self.loader, slice_size, overlap_ratio, combine_method, match_threshold, match_metric) def slice_predict(self, images, slice_size=[640, 640], overlap_ratio=[0.25, 0.25], combine_method='nms', match_threshold=0.6, match_metric='iou', draw_threshold=0.5, output_dir='output', save_results=False, visualize=True): if not os.path.exists(output_dir): os.makedirs(output_dir) self.dataset.set_slice_images(images, slice_size, overlap_ratio) loader = create('TestReader')(self.dataset, 0) imid2path = self.dataset.get_imid2path() def setup_metrics_for_loader(): # mem metrics = copy.deepcopy(self._metrics) mode = self.mode save_prediction_only = self.cfg[ 'save_prediction_only'] if 'save_prediction_only' in self.cfg else None output_eval = self.cfg[ 'output_eval'] if 'output_eval' in self.cfg else None # modify self.mode = '_test' self.cfg['save_prediction_only'] = True self.cfg['output_eval'] = output_dir self.cfg['imid2path'] = imid2path self._init_metrics() # restore self.mode = mode self.cfg.pop('save_prediction_only') if save_prediction_only is not None: self.cfg['save_prediction_only'] = save_prediction_only self.cfg.pop('output_eval') if output_eval is not None: self.cfg['output_eval'] = output_eval self.cfg.pop('imid2path') _metrics = copy.deepcopy(self._metrics) self._metrics = metrics return _metrics if save_results: metrics = setup_metrics_for_loader() else: metrics = [] anno_file = self.dataset.get_anno() clsid2catid, catid2name = get_categories( self.cfg.metric, anno_file=anno_file) # Run Infer self.status['mode'] = 'test' self.model.eval() if self.cfg.get('print_flops', False): flops_loader = create('TestReader')(self.dataset, 0) self._flops(flops_loader) results = [] # all images merged_bboxs = [] # single image for step_id, data in enumerate(tqdm(loader)): self.status['step_id'] = step_id # forward outs = self.model(data) outs['bbox'] = outs['bbox'].numpy() # only in test mode shift_amount = data['st_pix'] outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount.numpy() outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount.numpy() merged_bboxs.append(outs['bbox']) if data['is_last'] > 0: # merge matching predictions merged_results = {'bbox': []} if combine_method == 'nms': final_boxes = multiclass_nms( np.concatenate(merged_bboxs), self.cfg.num_classes, match_threshold, match_metric) merged_results['bbox'] = np.concatenate(final_boxes) elif combine_method == 'concat': merged_results['bbox'] = np.concatenate(merged_bboxs) else: raise ValueError( "Now only support 'nms' or 'concat' to fuse detection results." 
) merged_results['im_id'] = np.array([[0]]) merged_results['bbox_num'] = np.array( [len(merged_results['bbox'])]) merged_bboxs = [] data['im_id'] = data['ori_im_id'] for _m in metrics: _m.update(data, merged_results) for key in ['im_shape', 'scale_factor', 'im_id']: if isinstance(data, typing.Sequence): merged_results[key] = data[0][key] else: merged_results[key] = data[key] for key, value in merged_results.items(): if hasattr(value, 'numpy'): merged_results[key] = value.numpy() results.append(merged_results) for _m in metrics: _m.accumulate() _m.reset() if visualize: for outs in results: batch_res = get_infer_results(outs, clsid2catid) bbox_num = outs['bbox_num'] start = 0 for i, im_id in enumerate(outs['im_id']): image_path = imid2path[int(im_id)] image = Image.open(image_path).convert('RGB') image = ImageOps.exif_transpose(image) self.status['original_image'] = np.array(image.copy()) end = start + bbox_num[i] bbox_res = batch_res['bbox'][start:end] \ if 'bbox' in batch_res else None mask_res = batch_res['mask'][start:end] \ if 'mask' in batch_res else None segm_res = batch_res['segm'][start:end] \ if 'segm' in batch_res else None keypoint_res = batch_res['keypoint'][start:end] \ if 'keypoint' in batch_res else None pose3d_res = batch_res['pose3d'][start:end] \ if 'pose3d' in batch_res else None image = visualize_results( image, bbox_res, mask_res, segm_res, keypoint_res, pose3d_res, int(im_id), catid2name, draw_threshold) self.status['result_image'] = np.array(image.copy()) if self._compose_callback: self._compose_callback.on_step_end(self.status) # save image with detection save_name = self._get_save_image_name(output_dir, image_path) logger.info("Detection bbox results save in {}".format( save_name)) image.save(save_name, quality=95) start = end def predict(self, images, draw_threshold=0.5, output_dir='output', save_results=False, visualize=True, save_threshold=0, do_eval=False): if not os.path.exists(output_dir): os.makedirs(output_dir) if do_eval: save_threshold = 0.0 self.dataset.set_images(images, do_eval=do_eval) loader = create('TestReader')(self.dataset, 0) imid2path = self.dataset.get_imid2path() def setup_metrics_for_loader(): # mem metrics = copy.deepcopy(self._metrics) mode = self.mode save_prediction_only = self.cfg[ 'save_prediction_only'] if 'save_prediction_only' in self.cfg else None output_eval = self.cfg[ 'output_eval'] if 'output_eval' in self.cfg else None # modify self.mode = '_test' self.cfg['save_prediction_only'] = True self.cfg['output_eval'] = output_dir self.cfg['imid2path'] = imid2path self.cfg['save_threshold'] = save_threshold self._init_metrics() # restore self.mode = mode self.cfg.pop('save_prediction_only') if save_prediction_only is not None: self.cfg['save_prediction_only'] = save_prediction_only self.cfg.pop('output_eval') if output_eval is not None: self.cfg['output_eval'] = output_eval self.cfg.pop('imid2path') _metrics = copy.deepcopy(self._metrics) self._metrics = metrics return _metrics if save_results: metrics = setup_metrics_for_loader() else: metrics = [] anno_file = self.dataset.get_anno() clsid2catid, catid2name = get_categories( self.cfg.metric, anno_file=anno_file) # Run Infer self.status['mode'] = 'test' self.model.eval() if self.cfg.get('print_flops', False): flops_loader = create('TestReader')(self.dataset, 0) self._flops(flops_loader) results = [] for step_id, data in enumerate(tqdm(loader)): self.status['step_id'] = step_id # forward if hasattr(self.model, 'modelTeacher'): outs = self.model.modelTeacher(data) else: outs = 
self.model(data) for _m in metrics: _m.update(data, outs) for key in ['im_shape', 'scale_factor', 'im_id']: if isinstance(data, typing.Sequence): outs[key] = data[0][key] else: outs[key] = data[key] for key, value in outs.items(): if hasattr(value, 'numpy'): outs[key] = value.numpy() results.append(outs) # sniper if type(self.dataset) == SniperCOCODataSet: results = self.dataset.anno_cropper.aggregate_chips_detections( results) for _m in metrics: _m.accumulate() _m.reset() if visualize: for outs in results: batch_res = get_infer_results(outs, clsid2catid) bbox_num = outs['bbox_num'] start = 0 for i, im_id in enumerate(outs['im_id']): image_path = imid2path[int(im_id)] image = Image.open(image_path).convert('RGB') image = ImageOps.exif_transpose(image) self.status['original_image'] = np.array(image.copy()) end = start + bbox_num[i] bbox_res = batch_res['bbox'][start:end] \ if 'bbox' in batch_res else None mask_res = batch_res['mask'][start:end] \ if 'mask' in batch_res else None segm_res = batch_res['segm'][start:end] \ if 'segm' in batch_res else None keypoint_res = batch_res['keypoint'][start:end] \ if 'keypoint' in batch_res else None pose3d_res = batch_res['pose3d'][start:end] \ if 'pose3d' in batch_res else None image = visualize_results( image, bbox_res, mask_res, segm_res, keypoint_res, pose3d_res, int(im_id), catid2name, draw_threshold) self.status['result_image'] = np.array(image.copy()) if self._compose_callback: self._compose_callback.on_step_end(self.status) # save image with detection save_name = self._get_save_image_name(output_dir, image_path) logger.info("Detection bbox results save in {}".format( save_name)) image.save(save_name, quality=95) start = end return results def _get_save_image_name(self, output_dir, image_path): """ Get save image name from source image path. 
""" image_name = os.path.split(image_path)[-1] name, ext = os.path.splitext(image_name) return os.path.join(output_dir, "{}".format(name)) + ext def _get_infer_cfg_and_input_spec(self, save_dir, prune_input=True, kl_quant=False, yaml_name=None, model=None): if yaml_name is None: yaml_name = 'infer_cfg.yml' if model is None: model = self.model image_shape = None im_shape = [None, 2] scale_factor = [None, 2] if self.cfg.architecture in MOT_ARCH: test_reader_name = 'TestMOTReader' else: test_reader_name = 'TestReader' if 'inputs_def' in self.cfg[test_reader_name]: inputs_def = self.cfg[test_reader_name]['inputs_def'] image_shape = inputs_def.get('image_shape', None) # set image_shape=[None, 3, -1, -1] as default if image_shape is None: image_shape = [None, 3, -1, -1] if len(image_shape) == 3: image_shape = [None] + image_shape else: im_shape = [image_shape[0], 2] scale_factor = [image_shape[0], 2] if hasattr(model, 'deploy'): model.deploy = True if 'slim' not in self.cfg: for layer in model.sublayers(): if hasattr(layer, 'convert_to_deploy'): layer.convert_to_deploy() if hasattr(self.cfg, 'export') and 'fuse_conv_bn' in self.cfg[ 'export'] and self.cfg['export']['fuse_conv_bn']: model = fuse_conv_bn(model) export_post_process = self.cfg['export'].get( 'post_process', False) if hasattr(self.cfg, 'export') else True export_nms = self.cfg['export'].get('nms', False) if hasattr( self.cfg, 'export') else True export_benchmark = self.cfg['export'].get( 'benchmark', False) if hasattr(self.cfg, 'export') else False if hasattr(model, 'fuse_norm'): model.fuse_norm = self.cfg['TestReader'].get('fuse_normalize', False) if hasattr(model, 'export_post_process'): model.export_post_process = export_post_process if not export_benchmark else False if hasattr(model, 'export_nms'): model.export_nms = export_nms if not export_benchmark else False if export_post_process and not export_benchmark: image_shape = [None] + image_shape[1:] # Save infer cfg _dump_infer_config(self.cfg, os.path.join(save_dir, yaml_name), image_shape, model) input_spec = [{ "image": InputSpec( shape=image_shape, name='image'), "im_shape": InputSpec( shape=im_shape, name='im_shape'), "scale_factor": InputSpec( shape=scale_factor, name='scale_factor') }] if self.cfg.architecture == 'DeepSORT': input_spec[0].update({ "crops": InputSpec( shape=[None, 3, 192, 64], name='crops') }) if self.cfg.architecture == 'CLRNet': input_spec[0].update({ "full_img_path": str, "img_name": str, }) if prune_input: static_model = paddle.jit.to_static( model, input_spec=input_spec, full_graph=True) # NOTE: dy2st do not pruned program, but jit.save will prune program # input spec, prune input spec here and save with pruned input spec pruned_input_spec = _prune_input_spec( input_spec, static_model.forward.main_program, static_model.forward.outputs) else: static_model = None pruned_input_spec = input_spec # TODO: Hard code, delete it when support prune input_spec. 
if self.cfg.architecture == 'PicoDet' and not export_post_process: pruned_input_spec = [{ "image": InputSpec( shape=image_shape, name='image') }] if kl_quant: if self.cfg.architecture == 'PicoDet' or 'ppyoloe' in self.cfg.weights: pruned_input_spec = [{ "image": InputSpec( shape=image_shape, name='image'), "scale_factor": InputSpec( shape=scale_factor, name='scale_factor') }] elif 'tinypose' in self.cfg.weights: pruned_input_spec = [{ "image": InputSpec( shape=image_shape, name='image') }] return static_model, pruned_input_spec def export(self, output_dir='output_inference', for_fd=False): if hasattr(self.model, 'aux_neck'): self.model.__delattr__('aux_neck') if hasattr(self.model, 'aux_head'): self.model.__delattr__('aux_head') self.model.eval() model = copy.deepcopy(self.model) model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] if for_fd: save_dir = output_dir save_name = 'inference' yaml_name = 'inference.yml' else: save_dir = os.path.join(output_dir, model_name) save_name = 'model' yaml_name = None if not os.path.exists(save_dir): os.makedirs(save_dir) static_model, pruned_input_spec = self._get_infer_cfg_and_input_spec( save_dir, yaml_name=yaml_name, model=model) # dy2st and save model if 'slim' not in self.cfg or 'QAT' not in self.cfg['slim_type']: paddle.jit.save( static_model, os.path.join(save_dir, save_name), input_spec=pruned_input_spec) else: self.cfg.slim.save_quantized_model( self.model, os.path.join(save_dir, save_name), input_spec=pruned_input_spec) logger.info("Export model and saved in {}".format(save_dir)) def post_quant(self, output_dir='output_inference'): model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] save_dir = os.path.join(output_dir, model_name) if not os.path.exists(save_dir): os.makedirs(save_dir) for idx, data in enumerate(self.loader): self.model(data) if idx == int(self.cfg.get('quant_batch_num', 10)): break # TODO: support prune input_spec kl_quant = True if hasattr(self.cfg.slim, 'ptq') else False _, pruned_input_spec = self._get_infer_cfg_and_input_spec( save_dir, prune_input=False, kl_quant=kl_quant) self.cfg.slim.save_quantized_model( self.model, os.path.join(save_dir, 'model'), input_spec=pruned_input_spec) logger.info("Export Post-Quant model and saved in {}".format(save_dir)) def _flops(self, loader): if hasattr(self.model, 'aux_neck'): self.model.__delattr__('aux_neck') if hasattr(self.model, 'aux_head'): self.model.__delattr__('aux_head') self.model.eval() try: import paddleslim except Exception as e: logger.warning( 'Unable to calculate flops, please install paddleslim, for example: `pip install paddleslim`' ) return from paddleslim.analysis import dygraph_flops as flops input_data = None for data in loader: input_data = data break input_spec = [{ "image": input_data['image'][0].unsqueeze(0), "im_shape": input_data['im_shape'][0].unsqueeze(0), "scale_factor": input_data['scale_factor'][0].unsqueeze(0) }] flops = flops(self.model, input_spec) / (1000**3) logger.info(" Model FLOPs : {:.6f}G. 
(image shape is {})".format( flops, input_data['image'][0].unsqueeze(0).shape)) def parse_mot_images(self, cfg): import glob # for quant dataset_dir = cfg['EvalMOTDataset'].dataset_dir data_root = cfg['EvalMOTDataset'].data_root data_root = '{}/{}'.format(dataset_dir, data_root) seqs = os.listdir(data_root) seqs.sort() all_images = [] for seq in seqs: infer_dir = os.path.join(data_root, seq) assert infer_dir is None or os.path.isdir(infer_dir), \ "{} is not a directory".format(infer_dir) images = set() exts = ['jpg', 'jpeg', 'png', 'bmp'] exts += [ext.upper() for ext in exts] for ext in exts: images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) images = list(images) images.sort() assert len(images) > 0, "no image found in {}".format(infer_dir) all_images.extend(images) logger.info("Found {} inference images in total.".format( len(images))) return all_images def predict_culane(self, images, output_dir='output', save_results=False, visualize=True): if not os.path.exists(output_dir): os.makedirs(output_dir) self.dataset.set_images(images) loader = create('TestReader')(self.dataset, 0) imid2path = self.dataset.get_imid2path() def setup_metrics_for_loader(): # mem metrics = copy.deepcopy(self._metrics) mode = self.mode save_prediction_only = self.cfg[ 'save_prediction_only'] if 'save_prediction_only' in self.cfg else None output_eval = self.cfg[ 'output_eval'] if 'output_eval' in self.cfg else None # modify self.mode = '_test' self.cfg['save_prediction_only'] = True self.cfg['output_eval'] = output_dir self.cfg['imid2path'] = imid2path self._init_metrics() # restore self.mode = mode self.cfg.pop('save_prediction_only') if save_prediction_only is not None: self.cfg['save_prediction_only'] = save_prediction_only self.cfg.pop('output_eval') if output_eval is not None: self.cfg['output_eval'] = output_eval self.cfg.pop('imid2path') _metrics = copy.deepcopy(self._metrics) self._metrics = metrics return _metrics if save_results: metrics = setup_metrics_for_loader() else: metrics = [] # Run Infer self.status['mode'] = 'test' self.model.eval() if self.cfg.get('print_flops', False): flops_loader = create('TestReader')(self.dataset, 0) self._flops(flops_loader) results = [] for step_id, data in enumerate(tqdm(loader)): self.status['step_id'] = step_id # forward outs = self.model(data) for _m in metrics: _m.update(data, outs) for key in ['im_shape', 'scale_factor', 'im_id']: if isinstance(data, typing.Sequence): outs[key] = data[0][key] else: outs[key] = data[key] for key, value in outs.items(): if hasattr(value, 'numpy'): outs[key] = value.numpy() results.append(outs) for _m in metrics: _m.accumulate() _m.reset() if visualize: import cv2 for outs in results: for i in range(len(outs['img_path'])): lanes = outs['lanes'][i] img_path = outs['img_path'][i] img = cv2.imread(img_path) out_file = os.path.join(output_dir, os.path.basename(img_path)) lanes = [ lane.to_array( sample_y_range=[ self.cfg['sample_y']['start'], self.cfg['sample_y']['end'], self.cfg['sample_y']['step'] ], img_w=self.cfg.ori_img_w, img_h=self.cfg.ori_img_h) for lane in lanes ] imshow_lanes(img, lanes, out_file=out_file) return results def reset_norm_param_attr(self, layer, **kwargs): if isinstance(layer, (nn.BatchNorm2D, nn.LayerNorm, nn.GroupNorm)): src_state_dict = layer.state_dict() if isinstance(layer, nn.BatchNorm2D): layer = nn.BatchNorm2D( num_features=layer._num_features, momentum=layer._momentum, epsilon=layer._epsilon, **kwargs) elif isinstance(layer, nn.LayerNorm): layer = nn.LayerNorm( 
normalized_shape=layer._normalized_shape, epsilon=layer._epsilon, **kwargs) else: layer = nn.GroupNorm( num_groups=layer._num_groups, num_channels=layer._num_channels, epsilon=layer._epsilon, **kwargs) layer.set_state_dict(src_state_dict) else: for name, sublayer in layer.named_children(): new_sublayer = self.reset_norm_param_attr(sublayer, **kwargs) if new_sublayer is not sublayer: setattr(layer, name, new_sublayer) return layer ================================================ FILE: ppdet/engine/trainer_cot.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from ppdet.core.workspace import create from ppdet.utils.logger import setup_logger logger = setup_logger('ppdet.engine') from . import Trainer __all__ = ['TrainerCot'] class TrainerCot(Trainer): """ Trainer for label co-tuning; computes the relationship between base_classes and novel_classes """ def __init__(self, cfg, mode='train'): super(TrainerCot, self).__init__(cfg, mode) self.cotuning_init() def cotuning_init(self): num_classes_novel = self.cfg['num_classes'] self.load_weights(self.cfg.pretrain_weights) self.model.eval() relationship = self.model.relationship_learning(self.loader, num_classes_novel) self.model.init_cot_head(relationship) self.optimizer = create('OptimizerBuilder')(self.lr, self.model) ================================================ FILE: ppdet/engine/trainer_ssod.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
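# ---------------------------------------------------------------------------
# [Editor's usage sketch] `reset_norm_param_attr`, defined at the end of
# trainer.py above, rebuilds BatchNorm2D/LayerNorm/GroupNorm layers with new
# ParamAttr kwargs while restoring their trained state. One plausible use is
# disabling weight decay on norm parameters; `trainer` below is assumed to be
# an already-constructed Trainer with a built model:
import paddle

no_decay = paddle.ParamAttr(regularizer=paddle.regularizer.L2Decay(0.0))
trainer.model = trainer.reset_norm_param_attr(
    trainer.model, weight_attr=no_decay, bias_attr=no_decay)
# ---------------------------------------------------------------------------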
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import copy import time import typing import numpy as np import paddle import paddle.nn as nn import paddle.distributed as dist from paddle.distributed import fleet from ppdet.optimizer import ModelEMA, SimpleModelEMA from ppdet.core.workspace import create from ppdet.utils.checkpoint import load_weight, load_pretrain_weight, save_model import ppdet.utils.stats as stats from ppdet.utils import profiler from ppdet.modeling.ssod.utils import align_weak_strong_shape from .trainer import Trainer from ppdet.utils.logger import setup_logger from paddle.static import InputSpec from ppdet.engine.export_utils import _dump_infer_config, _prune_input_spec MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack'] logger = setup_logger('ppdet.engine') __all__ = ['Trainer_DenseTeacher', 'Trainer_ARSL', 'Trainer_Semi_RTDETR'] class Trainer_DenseTeacher(Trainer): def __init__(self, cfg, mode='train'): self.cfg = cfg assert mode.lower() in ['train', 'eval', 'test'], \ "mode should be 'train', 'eval' or 'test'" self.mode = mode.lower() self.optimizer = None self.is_loaded_weights = False self.use_amp = self.cfg.get('amp', False) self.amp_level = self.cfg.get('amp_level', 'O1') self.custom_white_list = self.cfg.get('custom_white_list', None) self.custom_black_list = self.cfg.get('custom_black_list', None) # build data loader capital_mode = self.mode.capitalize() self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( '{}Dataset'.format(capital_mode))() if self.mode == 'train': self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create( 'UnsupTrainDataset') self.loader = create('SemiTrainReader')( self.dataset, self.dataset_unlabel, cfg.worker_num) # build model if 'model' not in self.cfg: self.model = create(cfg.architecture) else: self.model = self.cfg.model self.is_loaded_weights = True # EvalDataset build with BatchSampler to evaluate in single device # TODO: multi-device evaluate if self.mode == 'eval': self._eval_batch_sampler = paddle.io.BatchSampler( self.dataset, batch_size=self.cfg.EvalReader['batch_size']) # If metric is VOC, need to be set collate_batch=False. if cfg.metric == 'VOC': cfg['EvalReader']['collate_batch'] = False self.loader = create('EvalReader')(self.dataset, cfg.worker_num, self._eval_batch_sampler) # TestDataset build after user set images, skip loader creation here # build optimizer in train mode if self.mode == 'train': steps_per_epoch = len(self.loader) if steps_per_epoch < 1: logger.warning( "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader." ) self.lr = create('LearningRate')(steps_per_epoch) self.optimizer = create('OptimizerBuilder')(self.lr, self.model) # Unstructured pruner is only enabled in the train mode. 
if self.cfg.get('unstructured_prune'): self.pruner = create('UnstructuredPruner')(self.model, steps_per_epoch) if self.use_amp and self.amp_level == 'O2': self.model, self.optimizer = paddle.amp.decorate( models=self.model, optimizers=self.optimizer, level=self.amp_level) self.use_ema = ('use_ema' in cfg and cfg['use_ema']) if self.use_ema: ema_decay = self.cfg.get('ema_decay', 0.9998) ema_decay_type = self.cfg.get('ema_decay_type', 'threshold') cycle_epoch = self.cfg.get('cycle_epoch', -1) ema_black_list = self.cfg.get('ema_black_list', None) self.ema = ModelEMA( self.model, decay=ema_decay, ema_decay_type=ema_decay_type, cycle_epoch=cycle_epoch, ema_black_list=ema_black_list) self.ema_start_iters = self.cfg.get('ema_start_iters', 0) # simple_ema for SSOD self.use_simple_ema = ('use_simple_ema' in cfg and cfg['use_simple_ema']) if self.use_simple_ema: self.use_ema = True ema_decay = self.cfg.get('ema_decay', 0.9996) self.ema = SimpleModelEMA(self.model, decay=ema_decay) self.ema_start_iters = self.cfg.get('ema_start_iters', 0) self._nranks = dist.get_world_size() self._local_rank = dist.get_rank() self.status = {} self.start_epoch = 0 self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch # initial default callbacks self._init_callbacks() # initial default metrics self._init_metrics() self._reset_metrics() def load_weights(self, weights): if self.is_loaded_weights: return self.start_epoch = 0 load_pretrain_weight(self.model, weights) load_pretrain_weight(self.ema.model, weights) logger.info("Load weights {} to start training for teacher and student". format(weights)) def resume_weights(self, weights, exchange=True): # support Distill resume weights if hasattr(self.model, 'student_model'): self.start_epoch = load_weight(self.model.student_model, weights, self.optimizer, exchange) else: self.start_epoch = load_weight(self.model, weights, self.optimizer, self.ema if self.use_ema else None, exchange) logger.debug("Resume weights of epoch {}".format(self.start_epoch)) def train(self, validate=False): self.semi_start_iters = self.cfg.get('semi_start_iters', 5000) Init_mark = False if validate: self.cfg['EvalDataset'] = self.cfg.EvalDataset = create( "EvalDataset")() sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and self.cfg.use_gpu and self._nranks > 1) if sync_bn: self.model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( self.model) if self.cfg.get('fleet', False): self.model = fleet.distributed_model(self.model) self.optimizer = fleet.distributed_optimizer(self.optimizer) elif self._nranks > 1: find_unused_parameters = self.cfg[ 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False self.model = paddle.DataParallel( self.model, find_unused_parameters=find_unused_parameters) self.ema.model = paddle.DataParallel( self.ema.model, find_unused_parameters=find_unused_parameters) self.status.update({ 'epoch_id': self.start_epoch, 'step_id': 0, 'steps_per_epoch': len(self.loader), 'exchange_save_model': True, }) # Note: exchange_save_model # in DenseTeacher SSOD, the teacher model will be higher, so exchange when saving pdparams self.status['batch_time'] = stats.SmoothedValue( self.cfg.log_iter, fmt='{avg:.4f}') self.status['data_time'] = stats.SmoothedValue( self.cfg.log_iter, fmt='{avg:.4f}') self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) profiler_options = self.cfg.get('profiler_options', None) self._compose_callback.on_train_begin(self.status) train_cfg = self.cfg.DenseTeacher['train_cfg'] concat_sup_data = 
train_cfg.get('concat_sup_data', True) for param in self.ema.model.parameters(): param.stop_gradient = True for epoch_id in range(self.start_epoch, self.cfg.epoch): self.status['mode'] = 'train' self.status['epoch_id'] = epoch_id self._compose_callback.on_epoch_begin(self.status) self.loader.dataset_label.set_epoch(epoch_id) self.loader.dataset_unlabel.set_epoch(epoch_id) iter_tic = time.time() loss_dict = { 'loss': paddle.to_tensor([0]), 'loss_sup_sum': paddle.to_tensor([0]), 'loss_unsup_sum': paddle.to_tensor([0]), 'fg_sum': paddle.to_tensor([0]), } if self._nranks > 1: for k in self.model._layers.get_loss_keys(): loss_dict.update({k: paddle.to_tensor([0.])}) for k in self.model._layers.get_loss_keys(): loss_dict.update({'distill_' + k: paddle.to_tensor([0.])}) else: for k in self.model.get_loss_keys(): loss_dict.update({k: paddle.to_tensor([0.])}) for k in self.model.get_loss_keys(): loss_dict.update({'distill_' + k: paddle.to_tensor([0.])}) # Note: for step_id, data in enumerate(self.loader): # enumerate bug for step_id in range(len(self.loader)): data = next(self.loader) self.model.train() self.ema.model.eval() data_sup_w, data_sup_s, data_unsup_w, data_unsup_s = data self.status['data_time'].update(time.time() - iter_tic) self.status['step_id'] = step_id profiler.add_profiler_step(profiler_options) self._compose_callback.on_step_begin(self.status) if data_sup_w['image'].shape != data_sup_s['image'].shape: data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w, data_sup_s) data_sup_w['epoch_id'] = epoch_id data_sup_s['epoch_id'] = epoch_id if concat_sup_data: for k, v in data_sup_s.items(): if k in ['epoch_id']: continue data_sup_s[k] = paddle.concat([v, data_sup_w[k]]) loss_dict_sup = self.model(data_sup_s) else: loss_dict_sup_w = self.model(data_sup_w) loss_dict_sup = self.model(data_sup_s) for k, v in loss_dict_sup_w.items(): loss_dict_sup[k] = (loss_dict_sup[k] + v) * 0.5 losses_sup = loss_dict_sup['loss'] * train_cfg['sup_weight'] losses_sup.backward() losses = losses_sup.detach() loss_dict.update(loss_dict_sup) loss_dict.update({'loss_sup_sum': loss_dict['loss']}) curr_iter = len(self.loader) * epoch_id + step_id st_iter = self.semi_start_iters if curr_iter == st_iter: logger.info("***" * 30) logger.info('Semi starting ...') logger.info("***" * 30) if curr_iter > st_iter: unsup_weight = train_cfg['unsup_weight'] if train_cfg['suppress'] == 'linear': tar_iter = st_iter * 2 if curr_iter <= tar_iter: unsup_weight *= (curr_iter - st_iter) / st_iter elif train_cfg['suppress'] == 'exp': tar_iter = st_iter + 2000 if curr_iter <= tar_iter: scale = np.exp((curr_iter - tar_iter) / 1000) unsup_weight *= scale elif train_cfg['suppress'] == 'step': tar_iter = st_iter * 2 if curr_iter <= tar_iter: unsup_weight *= 0.25 else: raise ValueError if data_unsup_w['image'].shape != data_unsup_s[ 'image'].shape: data_unsup_w, data_unsup_s = align_weak_strong_shape( data_unsup_w, data_unsup_s) data_unsup_w['epoch_id'] = epoch_id data_unsup_s['epoch_id'] = epoch_id data_unsup_s['get_data'] = True student_preds = self.model(data_unsup_s) with paddle.no_grad(): data_unsup_w['is_teacher'] = True teacher_preds = self.ema.model(data_unsup_w) train_cfg['curr_iter'] = curr_iter train_cfg['st_iter'] = st_iter if self._nranks > 1: loss_dict_unsup = self.model._layers.get_ssod_loss( student_preds, teacher_preds, train_cfg) else: loss_dict_unsup = self.model.get_ssod_loss( student_preds, teacher_preds, train_cfg) fg_num = loss_dict_unsup["fg_sum"] del loss_dict_unsup["fg_sum"] distill_weights = 
train_cfg['loss_weight'] loss_dict_unsup = { k: v * distill_weights[k] for k, v in loss_dict_unsup.items() } losses_unsup = sum([ metrics_value for metrics_value in loss_dict_unsup.values() ]) * unsup_weight losses_unsup.backward() loss_dict.update(loss_dict_unsup) loss_dict.update({'loss_unsup_sum': losses_unsup}) losses += losses_unsup.detach() loss_dict.update({"fg_sum": fg_num}) loss_dict['loss'] = losses self.optimizer.step() curr_lr = self.optimizer.get_lr() self.lr.step() self.optimizer.clear_grad() self.status['learning_rate'] = curr_lr if self._nranks < 2 or self._local_rank == 0: self.status['training_staus'].update(loss_dict) self.status['batch_time'].update(time.time() - iter_tic) self._compose_callback.on_step_end(self.status) # Note: ema_start_iters if self.use_ema and curr_iter == self.ema_start_iters: logger.info("***" * 30) logger.info('EMA starting ...') logger.info("***" * 30) self.ema.update(self.model, decay=0) elif self.use_ema and curr_iter > self.ema_start_iters: self.ema.update(self.model) iter_tic = time.time() is_snapshot = (self._nranks < 2 or self._local_rank == 0) \ and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1) if is_snapshot and self.use_ema: # apply ema weight on model weight = copy.deepcopy(self.ema.model.state_dict()) for k, v in weight.items(): if paddle.is_floating_point(v): weight[k].stop_gradient = True self.status['weight'] = weight self._compose_callback.on_epoch_end(self.status) if validate and is_snapshot: if not hasattr(self, '_eval_loader'): # build evaluation dataset and loader self._eval_dataset = self.cfg.EvalDataset self._eval_batch_sampler = \ paddle.io.BatchSampler( self._eval_dataset, batch_size=self.cfg.EvalReader['batch_size']) # If metric is VOC, need to be set collate_batch=False. 
if self.cfg.metric == 'VOC': self.cfg['EvalReader']['collate_batch'] = False self._eval_loader = create('EvalReader')( self._eval_dataset, self.cfg.worker_num, batch_sampler=self._eval_batch_sampler) # if validation in training is enabled, metrics should be re-init # Init_mark makes sure this code will only execute once if validate and Init_mark == False: Init_mark = True self._init_metrics(validate=validate) self._reset_metrics() with paddle.no_grad(): self.status['save_best_model'] = True self._eval_with_loader(self._eval_loader) if is_snapshot and self.use_ema: self.status.pop('weight') self._compose_callback.on_train_end(self.status) def evaluate(self): # get distributed model if self.cfg.get('fleet', False): self.model = fleet.distributed_model(self.model) self.optimizer = fleet.distributed_optimizer(self.optimizer) elif self._nranks > 1: find_unused_parameters = self.cfg[ 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False self.model = paddle.DataParallel( self.model, find_unused_parameters=find_unused_parameters) with paddle.no_grad(): self._eval_with_loader(self.loader) def _eval_with_loader(self, loader): sample_num = 0 tic = time.time() self._compose_callback.on_epoch_begin(self.status) self.status['mode'] = 'eval' test_cfg = self.cfg.DenseTeacher['test_cfg'] if test_cfg['inference_on'] == 'teacher': logger.info("***** teacher model evaluating *****") eval_model = self.ema.model else: logger.info("***** student model evaluating *****") eval_model = self.model eval_model.eval() if self.cfg.get('print_flops', False): flops_loader = create('{}Reader'.format(self.mode.capitalize()))( self.dataset, self.cfg.worker_num, self._eval_batch_sampler) self._flops(flops_loader) for step_id, data in enumerate(loader): self.status['step_id'] = step_id self._compose_callback.on_step_begin(self.status) # forward if self.use_amp: with paddle.amp.auto_cast( enable=self.cfg.use_gpu or self.cfg.use_mlu, custom_white_list=self.custom_white_list, custom_black_list=self.custom_black_list, level=self.amp_level): outs = eval_model(data) else: outs = eval_model(data) # update metrics for metric in self._metrics: metric.update(data, outs) # multi-scale inputs: all inputs have same im_id if isinstance(data, typing.Sequence): sample_num += data[0]['im_id'].numpy().shape[0] else: sample_num += data['im_id'].numpy().shape[0] self._compose_callback.on_step_end(self.status) self.status['sample_num'] = sample_num self.status['cost_time'] = time.time() - tic # accumulate metric to log out for metric in self._metrics: metric.accumulate() metric.log() self._compose_callback.on_epoch_end(self.status) self._reset_metrics() class Trainer_ARSL(Trainer): def __init__(self, cfg, mode='train'): self.cfg = cfg assert mode.lower() in ['train', 'eval', 'test'], \ "mode should be 'train', 'eval' or 'test'" self.mode = mode.lower() self.optimizer = None self.is_loaded_weights = False capital_mode = self.mode.capitalize() self.use_ema = False self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( '{}Dataset'.format(capital_mode))() if self.mode == 'train': self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create( 'UnsupTrainDataset') self.loader = create('SemiTrainReader')( self.dataset, self.dataset_unlabel, cfg.worker_num) # build model if 'model' not in self.cfg: self.student_model = create(cfg.architecture) self.teacher_model = create(cfg.architecture) self.model = EnsembleTSModel(self.teacher_model, self.student_model) else: self.model = self.cfg.model self.is_loaded_weights = True 
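# ---------------------------------------------------------------------------
# [Editor's sketch] The `suppress` branch in Trainer_DenseTeacher.train above
# ramps the unsupervised loss weight once curr_iter passes semi_start_iters;
# the three schedules reduce to this pure function (variable names follow
# that loop; the function itself is illustrative, not part of the file):
import numpy as np

def suppressed_unsup_weight(unsup_weight, curr_iter, st_iter, mode='linear'):
    if mode == 'linear' and curr_iter <= st_iter * 2:
        unsup_weight *= (curr_iter - st_iter) / st_iter  # ramps 0 -> 1
    elif mode == 'exp' and curr_iter <= st_iter + 2000:
        unsup_weight *= np.exp((curr_iter - (st_iter + 2000)) / 1000.)
    elif mode == 'step' and curr_iter <= st_iter * 2:
        unsup_weight *= 0.25  # constant damping until 2 * st_iter
    return unsup_weight
# ---------------------------------------------------------------------------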
# save path for burn-in model self.base_path = cfg.get('weights') self.base_path = os.path.dirname(self.base_path) # EvalDataset build with BatchSampler to evaluate in single device # TODO: multi-device evaluate if self.mode == 'eval': self._eval_batch_sampler = paddle.io.BatchSampler( self.dataset, batch_size=self.cfg.EvalReader['batch_size']) self.loader = create('{}Reader'.format(self.mode.capitalize()))( self.dataset, cfg.worker_num, self._eval_batch_sampler) # TestDataset build after user set images, skip loader creation here self.start_epoch = 0 self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch self.epoch_iter = self.cfg.epoch_iter # set fixed iter in each epoch to control checkpoint # build optimizer in train mode if self.mode == 'train': steps_per_epoch = self.epoch_iter self.lr = create('LearningRate')(steps_per_epoch) self.optimizer = create('OptimizerBuilder')(self.lr, self.model.modelStudent) self._nranks = dist.get_world_size() self._local_rank = dist.get_rank() self.status = {} # initial default callbacks self._init_callbacks() # initial default metrics self._init_metrics() self._reset_metrics() self.iter = 0 def resume_weights(self, weights): # support Distill resume weights if hasattr(self.model, 'student_model'): self.start_epoch = load_weight(self.model.student_model, weights, self.optimizer) else: self.start_epoch = load_weight(self.model, weights, self.optimizer) logger.debug("Resume weights of epoch {}".format(self.start_epoch)) def train(self, validate=False): assert self.mode == 'train', "Model not in 'train' mode" Init_mark = False # if validation in training is enabled, metrics should be re-init if validate: self._init_metrics(validate=validate) self._reset_metrics() if self.cfg.get('fleet', False): self.model.modelStudent = fleet.distributed_model( self.model.modelStudent) self.optimizer = fleet.distributed_optimizer(self.optimizer) elif self._nranks > 1: find_unused_parameters = self.cfg[ 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False self.model.modelStudent = paddle.DataParallel( self.model.modelStudent, find_unused_parameters=find_unused_parameters) # set fixed iter in each epoch to control checkpoint self.status.update({ 'epoch_id': self.start_epoch, 'step_id': 0, 'steps_per_epoch': self.epoch_iter }) print('338 Len of DataLoader: {}'.format(len(self.loader))) self.status['batch_time'] = stats.SmoothedValue( self.cfg.log_iter, fmt='{avg:.4f}') self.status['data_time'] = stats.SmoothedValue( self.cfg.log_iter, fmt='{avg:.4f}') self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) self._compose_callback.on_train_begin(self.status) epoch_id = self.start_epoch self.iter = self.start_epoch * self.epoch_iter # use iter rather than epoch to control training schedule while self.iter < self.cfg.max_iter: # epoch loop self.status['mode'] = 'train' self.status['epoch_id'] = epoch_id self._compose_callback.on_epoch_begin(self.status) self.loader.dataset_label.set_epoch(epoch_id) self.loader.dataset_unlabel.set_epoch(epoch_id) paddle.device.cuda.empty_cache() # clear GPU memory # set model status self.model.modelStudent.train() self.model.modelTeacher.eval() iter_tic = time.time() # iter loop in each eopch for step_id in range(self.epoch_iter): data = next(self.loader) self.status['data_time'].update(time.time() - iter_tic) self.status['step_id'] = step_id # profiler.add_profiler_step(profiler_options) self._compose_callback.on_step_begin(self.status) # model forward and calculate loss loss_dict = 
self.run_step_full_semisup(data) if (step_id + 1) % self.cfg.optimize_rate == 0: self.optimizer.step() self.optimizer.clear_grad() curr_lr = self.optimizer.get_lr() self.lr.step() # update log status self.status['learning_rate'] = curr_lr if self._nranks < 2 or self._local_rank == 0: self.status['training_staus'].update(loss_dict) self.status['batch_time'].update(time.time() - iter_tic) self._compose_callback.on_step_end(self.status) self.iter += 1 iter_tic = time.time() self._compose_callback.on_epoch_end(self.status) if validate and (self._nranks < 2 or self._local_rank == 0) \ and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 \ or epoch_id == self.end_epoch - 1): if not hasattr(self, '_eval_loader'): # build evaluation dataset and loader self._eval_dataset = self.cfg.EvalDataset self._eval_batch_sampler = \ paddle.io.BatchSampler( self._eval_dataset, batch_size=self.cfg.EvalReader['batch_size']) self._eval_loader = create('EvalReader')( self._eval_dataset, self.cfg.worker_num, batch_sampler=self._eval_batch_sampler) if validate and Init_mark == False: Init_mark = True self._init_metrics(validate=validate) self._reset_metrics() with paddle.no_grad(): self.status['save_best_model'] = True # before burn-in stage, eval student. after burn-in stage, eval teacher if self.iter <= self.cfg.SEMISUPNET['BURN_UP_STEP']: print("start eval student model") self._eval_with_loader( self._eval_loader, mode="student") else: print("start eval teacher model") self._eval_with_loader( self._eval_loader, mode="teacher") epoch_id += 1 self._compose_callback.on_train_end(self.status) def merge_data(self, data1, data2): data = copy.deepcopy(data1) for k, v in data1.items(): if type(v) is paddle.Tensor: data[k] = paddle.concat(x=[data[k], data2[k]], axis=0) elif type(v) is list: data[k].extend(data2[k]) return data def run_step_full_semisup(self, data): label_data_k, label_data_q, unlabel_data_k, unlabel_data_q = data data_merge = self.merge_data(label_data_k, label_data_q) loss_sup_dict = self.model.modelStudent(data_merge, branch="supervised") loss_dict = {} for key in loss_sup_dict.keys(): if key[:4] == "loss": loss_dict[key] = loss_sup_dict[key] * 1 losses_sup = paddle.add_n(list(loss_dict.values())) # norm loss when using gradient accumulation losses_sup = losses_sup / self.cfg.optimize_rate losses_sup.backward() for key in loss_sup_dict.keys(): loss_dict[key + "_pseudo"] = paddle.to_tensor([0]) loss_dict["loss_tot"] = losses_sup """ semi-supervised training after burn-in stage """ if self.iter >= self.cfg.SEMISUPNET['BURN_UP_STEP']: # init teacher model with burn-up weight if self.iter == self.cfg.SEMISUPNET['BURN_UP_STEP']: print( 'Starting semi-supervised learning and load the teacher model.' 
) self._update_teacher_model(keep_rate=0.00) # save burn-in model if dist.get_world_size() < 2 or dist.get_rank() == 0: print('saving burn-in model.') save_name = 'burnIn' epoch_id = self.iter // self.epoch_iter save_model(self.model, self.optimizer, self.base_path, save_name, epoch_id) # Update teacher model with EMA elif (self.iter + 1) % self.cfg.optimize_rate == 0: self._update_teacher_model( keep_rate=self.cfg.SEMISUPNET['EMA_KEEP_RATE']) #warm-up weight for pseudo loss pseudo_weight = self.cfg.SEMISUPNET['UNSUP_LOSS_WEIGHT'] pseudo_warmup_iter = self.cfg.SEMISUPNET['PSEUDO_WARM_UP_STEPS'] temp = self.iter - self.cfg.SEMISUPNET['BURN_UP_STEP'] if temp <= pseudo_warmup_iter: pseudo_weight *= (temp / pseudo_warmup_iter) # get teacher predictions on weak-augmented unlabeled data with paddle.no_grad(): teacher_pred = self.model.modelTeacher( unlabel_data_k, branch='semi_supervised') # calculate unsupervised loss on strong-augmented unlabeled data loss_unsup_dict = self.model.modelStudent( unlabel_data_q, branch="semi_supervised", teacher_prediction=teacher_pred, ) for key in loss_unsup_dict.keys(): if key[-6:] == "pseudo": loss_unsup_dict[key] = loss_unsup_dict[key] * pseudo_weight losses_unsup = paddle.add_n(list(loss_unsup_dict.values())) # norm loss when using gradient accumulation losses_unsup = losses_unsup / self.cfg.optimize_rate losses_unsup.backward() loss_dict.update(loss_unsup_dict) loss_dict["loss_tot"] += losses_unsup return loss_dict def export(self, output_dir='output_inference'): self.model.eval() model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] save_dir = os.path.join(output_dir, model_name) if not os.path.exists(save_dir): os.makedirs(save_dir) image_shape = None if self.cfg.architecture in MOT_ARCH: test_reader_name = 'TestMOTReader' else: test_reader_name = 'TestReader' if 'inputs_def' in self.cfg[test_reader_name]: inputs_def = self.cfg[test_reader_name]['inputs_def'] image_shape = inputs_def.get('image_shape', None) # set image_shape=[3, -1, -1] as default if image_shape is None: image_shape = [3, -1, -1] self.model.modelTeacher.eval() if hasattr(self.model.modelTeacher, 'deploy'): self.model.modelTeacher.deploy = True # Save infer cfg _dump_infer_config(self.cfg, os.path.join(save_dir, 'infer_cfg.yml'), image_shape, self.model.modelTeacher) input_spec = [{ "image": InputSpec( shape=[None] + image_shape, name='image'), "im_shape": InputSpec( shape=[None, 2], name='im_shape'), "scale_factor": InputSpec( shape=[None, 2], name='scale_factor') }] if self.cfg.architecture == 'DeepSORT': input_spec[0].update({ "crops": InputSpec( shape=[None, 3, 192, 64], name='crops') }) static_model = paddle.jit.to_static( self.model.modelTeacher, input_spec=input_spec) # NOTE: dy2st do not pruned program, but jit.save will prune program # input spec, prune input spec here and save with pruned input spec pruned_input_spec = _prune_input_spec(input_spec, static_model.forward.main_program, static_model.forward.outputs) # dy2st and save model if 'slim' not in self.cfg or self.cfg['slim_type'] != 'QAT': paddle.jit.save( static_model, os.path.join(save_dir, 'model'), input_spec=pruned_input_spec) else: self.cfg.slim.save_quantized_model( self.model.modelTeacher, os.path.join(save_dir, 'model'), input_spec=pruned_input_spec) logger.info("Export model and saved in {}".format(save_dir)) def _eval_with_loader(self, loader, mode="teacher"): sample_num = 0 tic = time.time() self._compose_callback.on_epoch_begin(self.status) self.status['mode'] = 'eval' # self.model.eval() 
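# ---------------------------------------------------------------------------
# [Editor's note] The teacher evaluated here is an exponential moving average
# of the student, maintained by _update_teacher_model defined below:
#     teacher <- keep_rate * teacher + (1 - keep_rate) * student
# so keep_rate=0.0 copies the student (the burn-in handoff above) and a
# keep_rate near 1.0 yields a slowly-moving teacher. Per-parameter sketch
# (illustrative helper, not part of the file):
import paddle

@paddle.no_grad()
def ema_update(teacher_param, student_param, keep_rate=0.996):
    paddle.assign(keep_rate * teacher_param +
                  (1.0 - keep_rate) * student_param, output=teacher_param)
# ---------------------------------------------------------------------------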
self.model.modelTeacher.eval() self.model.modelStudent.eval() for step_id, data in enumerate(loader): self.status['step_id'] = step_id self._compose_callback.on_step_begin(self.status) if mode == "teacher": outs = self.model.modelTeacher(data) else: outs = self.model.modelStudent(data) # update metrics for metric in self._metrics: metric.update(data, outs) sample_num += data['im_id'].numpy().shape[0] self._compose_callback.on_step_end(self.status) self.status['sample_num'] = sample_num self.status['cost_time'] = time.time() - tic # accumulate metric to log out for metric in self._metrics: metric.accumulate() metric.log() self._compose_callback.on_epoch_end(self.status) # reset metric states for metric may performed multiple times self._reset_metrics() def evaluate(self): with paddle.no_grad(): self._eval_with_loader(self.loader) @paddle.no_grad() def _update_teacher_model(self, keep_rate=0.996): student_model_dict = copy.deepcopy(self.model.modelStudent.state_dict()) new_teacher_dict = dict() for key, value in self.model.modelTeacher.state_dict().items(): if key in student_model_dict.keys(): v = student_model_dict[key] * (1 - keep_rate ) + value * keep_rate v.stop_gradient = True new_teacher_dict[key] = v else: raise Exception("{} is not found in student model".format(key)) self.model.modelTeacher.set_dict(new_teacher_dict) class EnsembleTSModel(nn.Layer): def __init__(self, modelTeacher, modelStudent): super(EnsembleTSModel, self).__init__() self.modelTeacher = modelTeacher self.modelStudent = modelStudent class Trainer_Semi_RTDETR(Trainer): def __init__(self, cfg, mode='train'): self.cfg = cfg assert mode.lower() in ['train', 'eval', 'test'], \ "mode should be 'train', 'eval' or 'test'" self.mode = mode.lower() self.optimizer = None self.is_loaded_weights = False self.use_amp = self.cfg.get('amp', False) self.amp_level = self.cfg.get('amp_level', 'O1') self.custom_white_list = self.cfg.get('custom_white_list', None) self.custom_black_list = self.cfg.get('custom_black_list', None) # build data loader capital_mode = self.mode.capitalize() self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( '{}Dataset'.format(capital_mode))() if self.mode == 'train': self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create( 'UnsupTrainDataset') self.loader = create('SemiTrainReader')( self.dataset, self.dataset_unlabel, cfg.worker_num) # build model if 'model' not in self.cfg: self.model = create(cfg.SSOD) else: self.model = self.cfg.model self.is_loaded_weights = True # EvalDataset build with BatchSampler to evaluate in single device # TODO: multi-device evaluate if self.mode == 'eval': self._eval_batch_sampler = paddle.io.BatchSampler( self.dataset, batch_size=self.cfg.EvalReader['batch_size']) # If metric is VOC, need to be set collate_batch=False. if cfg.metric == 'VOC': cfg['EvalReader']['collate_batch'] = False self.loader = create('EvalReader')(self.dataset, cfg.worker_num, self._eval_batch_sampler) # TestDataset build after user set images, skip loader creation here # build optimizer in train mode if self.mode == 'train': steps_per_epoch = len(self.loader) if steps_per_epoch < 1: logger.warning( "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader." ) self.lr = create('LearningRate')(steps_per_epoch) self.optimizer = create('OptimizerBuilder')(self.lr, self.model) # Unstructured pruner is only enabled in the train mode. 
if self.cfg.get('unstructured_prune'): self.pruner = create('UnstructuredPruner')(self.model, steps_per_epoch) if self.use_amp and self.amp_level == 'O2': self.model, self.optimizer = paddle.amp.decorate( models=self.model, optimizers=self.optimizer, level=self.amp_level) self._nranks = dist.get_world_size() self._local_rank = dist.get_rank() self.status = {} self.start_epoch = 0 self.start_iter = 0 self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch # initial default callbacks self._init_callbacks() # initial default metrics self._init_metrics() self._reset_metrics() def load_semi_weights(self, t_weights, s_weights): if self.is_loaded_weights: return self.start_epoch = 0 load_pretrain_weight(self.model.teacher, t_weights) load_pretrain_weight(self.model.student, s_weights) logger.info("Load teacher weights {} to start training".format( t_weights)) logger.info("Load student weights {} to start training".format( s_weights)) def resume_weights(self, weights, exchange=True): # support Distill resume weights if hasattr(self.model, 'student_model'): self.start_epoch = load_weight(self.model.student_model, weights, self.optimizer, exchange) else: self.start_iter, self.start_epoch = load_weight( self.model, weights, self.optimizer, self.ema if self.use_ema else None, exchange) logger.debug("Resume weights of epoch {}".format(self.start_epoch)) logger.debug("Resume weights of iter {}".format(self.start_iter)) def train(self, validate=False): assert self.mode == 'train', "Model not in 'train' mode" Init_mark = False if validate: self.cfg.EvalDataset = create("EvalDataset")() model = self.model sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and self.cfg.use_gpu and self._nranks > 1) if sync_bn: # self.model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( # self.model) model.teacher = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( model.teacher) model.student = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( self.model.student) if self.cfg.get('fleet', False): # model = fleet.distributed_model(model) model = fleet.distributed_model(model) self.optimizer = fleet.distributed_optimizer(self.optimizer) elif self._nranks > 1: find_unused_parameters = self.cfg[ 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False model = paddle.DataParallel( model, find_unused_parameters=find_unused_parameters) if self.cfg.get('amp', False): scaler = amp.GradScaler( enable=self.cfg.use_gpu or self.cfg.use_npu, init_loss_scaling=1024) self.status.update({ 'epoch_id': self.start_epoch, 'iter_id': self.start_iter, # 'step_id': self.start_step, 'steps_per_epoch': len(self.loader), }) self.status['batch_time'] = stats.SmoothedValue( self.cfg.log_iter, fmt='{avg:.4f}') self.status['data_time'] = stats.SmoothedValue( self.cfg.log_iter, fmt='{avg:.4f}') self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) if self.cfg.get('print_flops', False): flops_loader = create('{}Reader'.format(self.mode.capitalize()))( self.dataset, self.cfg.worker_num) self._flops(flops_loader) profiler_options = self.cfg.get('profiler_options', None) self._compose_callback.on_train_begin(self.status) iter_id = self.start_iter self.status['iter_id'] = iter_id self.status['eval_interval'] = self.cfg.eval_interval self.status['save_interval'] = self.cfg.save_interval for epoch_id in range(self.start_epoch, self.cfg.epoch): self.status['mode'] = 'train' self.status['epoch_id'] = epoch_id self._compose_callback.on_epoch_begin(self.status) self.loader.dataset_label.set_epoch(epoch_id) 
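# ---------------------------------------------------------------------------
# [Editor's annotation] SemiTrainReader draws from two datasets, so the epoch
# counter must be advanced on both: the labeled split on the line above and
# the unlabeled split on the line below. Each batch it yields then unpacks
# into weak/strong views of the labeled ("sup") and unlabeled ("unsup")
# samples, as the loop body that follows shows.
# ---------------------------------------------------------------------------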
self.loader.dataset_unlabel.set_epoch(epoch_id) iter_tic = time.time() if self._nranks > 1: # print(model) model._layers.teacher.eval() model._layers.student.train() else: model.teacher.eval() model.student.train() iter_tic = time.time() for step_id in range(len(self.loader)): data = next(self.loader) data_sup_w, data_sup_s, data_unsup_w, data_unsup_s = data data_sup_w['epoch_id'] = epoch_id data_sup_s['epoch_id'] = epoch_id data_unsup_w['epoch_id'] = epoch_id data_unsup_s['epoch_id'] = epoch_id data = [data_sup_w, data_sup_s, data_unsup_w, data_unsup_s] iter_id += 1 self.status['data_time'].update(time.time() - iter_tic) self.status['step_id'] = step_id self.status['iter_id'] = iter_id data.append(iter_id) profiler.add_profiler_step(profiler_options) self._compose_callback.on_step_begin(self.status) if self.cfg.get('amp', False): with amp.auto_cast(enable=self.cfg.use_gpu): # model forward if self._nranks > 1: outputs = model._layers(data) else: outputs = model(data) loss = outputs['loss'] scaled_loss = scaler.scale(loss) scaled_loss.backward() scaler.minimize(self.optimizer, scaled_loss) else: outputs = model(data) loss = outputs['loss'] # model backward loss.backward() self.optimizer.step() curr_lr = self.optimizer.get_lr() self.lr.step() if self.cfg.get('unstructured_prune'): self.pruner.step() self.optimizer.clear_grad() # print(outputs) # outputs=reduce_dict(outputs) # if self.model.debug: # check_gradient(model) # self.check_gradient() self.status['learning_rate'] = curr_lr if self._nranks < 2 or self._local_rank == 0: self.status['training_staus'].update(outputs) self.status['batch_time'].update(time.time() - iter_tic) if validate and (self._nranks < 2 or self._local_rank == 0) and \ ((iter_id + 1) % self.cfg.eval_interval == 0): if not hasattr(self, '_eval_loader'): # build evaluation dataset and loader self._eval_dataset = self.cfg.EvalDataset self._eval_batch_sampler = \ paddle.io.BatchSampler( self._eval_dataset, batch_size=self.cfg.EvalReader['batch_size']) # If metric is VOC, need to be set collate_batch=False. 
                        if self.cfg.metric == 'VOC':
                            self.cfg['EvalReader']['collate_batch'] = False
                        self._eval_loader = create('EvalReader')(
                            self._eval_dataset,
                            self.cfg.worker_num,
                            batch_sampler=self._eval_batch_sampler)
                    # if validation in training is enabled, metrics should be re-initialized
                    # Init_mark makes sure this code will only execute once
                    if validate and Init_mark == False:
                        Init_mark = True
                        self._init_metrics(validate=validate)
                        self._reset_metrics()
                    with paddle.no_grad():
                        self.status['save_best_model'] = True
                        self._eval_with_loader(self._eval_loader)
                    model._layers.student.train()
                self._compose_callback.on_step_end(self.status)
                iter_tic = time.time()

            if self.cfg.get('unstructured_prune'):
                self.pruner.update_params()
            self._compose_callback.on_epoch_end(self.status)
        self._compose_callback.on_train_end(self.status)

    def _eval_with_loader(self, loader):
        sample_num = 0
        tic = time.time()
        self._compose_callback.on_epoch_begin(self.status)
        self.status['mode'] = 'eval'
        self.model.eval()
        if self.cfg.get('print_flops', False):
            flops_loader = create('{}Reader'.format(self.mode.capitalize()))(
                self.dataset, self.cfg.worker_num, self._eval_batch_sampler)
            self._flops(flops_loader)

        print("*****teacher evaluate*****")
        for step_id, data in enumerate(loader):
            self.status['step_id'] = step_id
            self._compose_callback.on_step_begin(self.status)
            # forward
            outs = self.model.teacher(data)

            # update metrics
            for metric in self._metrics:
                metric.update(data, outs)

            # multi-scale inputs: all inputs have same im_id
            if isinstance(data, typing.Sequence):
                sample_num += data[0]['im_id'].numpy().shape[0]
            else:
                sample_num += data['im_id'].numpy().shape[0]
            self._compose_callback.on_step_end(self.status)

        self.status['sample_num'] = sample_num
        self.status['cost_time'] = time.time() - tic

        # accumulate metric to log out
        for metric in self._metrics:
            metric.accumulate()
            metric.log()
        self._compose_callback.on_epoch_end(self.status)
        # reset metric states, as metrics may be evaluated multiple times
        self._reset_metrics()

        print("*****student evaluate*****")
        for step_id, data in enumerate(loader):
            self.status['step_id'] = step_id
            self._compose_callback.on_step_begin(self.status)
            # forward
            outs = self.model.student(data)

            # update metrics
            for metric in self._metrics:
                metric.update(data, outs)

            # multi-scale inputs: all inputs have same im_id
            if isinstance(data, typing.Sequence):
                sample_num += data[0]['im_id'].numpy().shape[0]
            else:
                sample_num += data['im_id'].numpy().shape[0]
            self._compose_callback.on_step_end(self.status)

        self.status['sample_num'] = sample_num
        self.status['cost_time'] = time.time() - tic

        # accumulate metric to log out
        for metric in self._metrics:
            metric.accumulate()
            metric.log()
        # reset metric states, as metrics may be evaluated multiple times
        self._reset_metrics()
        self.status['mode'] = 'train'

    def evaluate(self):
        with paddle.no_grad():
            self._eval_with_loader(self.loader)


================================================
FILE: ppdet/ext_op/README.md
================================================
# Building the Custom OPs

The rotated-box IoU OPs are implemented following the PaddlePaddle guide on
[custom external operators](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html).

## 1. Dependencies
- Paddle >= 2.0.1
- gcc 8.2

## 2. Installation
```
python setup.py install
```

Once compiled, the OPs can be used directly. A usage example for `rbox_iou`:
```
# import the custom op
import numpy as np
import paddle
from ext_op import rbox_iou

paddle.set_device('gpu:0')
paddle.disable_static()

rbox1 = np.random.rand(13000, 5)
rbox2 = np.random.rand(7, 5)
pd_rbox1 = paddle.to_tensor(rbox1)
pd_rbox2 = paddle.to_tensor(rbox2)

iou = rbox_iou(pd_rbox1, pd_rbox2)
print('iou', iou)
```

## 3. Unit Tests
Run the unit tests to verify that the custom operators behave correctly, for example:
```
python unittest/test_matched_rbox_iou.py
```

================================================
FILE: ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cc
================================================
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/

#include "../rbox_iou/rbox_iou_utils.h"
#include "paddle/extension.h"

template <typename T>
void matched_rbox_iou_cpu_kernel(const int rbox_num, const T *rbox1_data_ptr,
                                 const T *rbox2_data_ptr, T *output_data_ptr) {
  int i;
  for (i = 0; i < rbox_num; i++) {
    output_data_ptr[i] = rbox_iou_single<T>(rbox1_data_ptr + i * 5,
                                            rbox2_data_ptr + i * 5);
  }
}

#define CHECK_INPUT_CPU(x)                                                     \
  PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")

std::vector<paddle::Tensor>
MatchedRboxIouCPUForward(const paddle::Tensor &rbox1,
                         const paddle::Tensor &rbox2) {
  CHECK_INPUT_CPU(rbox1);
  CHECK_INPUT_CPU(rbox2);
  PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must be same dim");

  auto rbox_num = rbox1.shape()[0];
  auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::CPUPlace());

  PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "matched_rbox_iou_cpu_kernel", ([&] {
                               matched_rbox_iou_cpu_kernel<data_t>(
                                   rbox_num, rbox1.data<data_t>(),
                                   rbox2.data<data_t>(),
                                   output.data<data_t>());
                             }));

  return {output};
}

#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor>
MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
                          const paddle::Tensor &rbox2);
#endif

#define CHECK_INPUT_SAME(x1, x2)                                               \
  PD_CHECK(x1.place() == x2.place(), "inputs must be in the same place.")

std::vector<paddle::Tensor> MatchedRboxIouForward(const paddle::Tensor &rbox1,
                                                  const paddle::Tensor &rbox2) {
  CHECK_INPUT_SAME(rbox1, rbox2);
  if (rbox1.is_cpu()) {
    return MatchedRboxIouCPUForward(rbox1, rbox2);
#ifdef PADDLE_WITH_CUDA
  } else if (rbox1.is_gpu()) {
    return MatchedRboxIouCUDAForward(rbox1, rbox2);
#endif
  }
}

std::vector<std::vector<int64_t>>
MatchedRboxIouInferShape(std::vector<int64_t> rbox1_shape,
                         std::vector<int64_t> rbox2_shape) {
  return {{rbox1_shape[0]}};
}

std::vector<paddle::DataType> MatchedRboxIouInferDtype(paddle::DataType t1,
                                                       paddle::DataType t2) {
  return {t1};
}

PD_BUILD_OP(matched_rbox_iou)
    .Inputs({"RBOX1", "RBOX2"})
    .Outputs({"Output"})
    .SetKernelFn(PD_KERNEL(MatchedRboxIouForward))
    .SetInferShapeFn(PD_INFER_SHAPE(MatchedRboxIouInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(MatchedRboxIouInferDtype));
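For the matched variant just defined, usage mirrors the README example for `rbox_iou`, except that both inputs must have the same first dimension. A minimal sketch, assuming the extension was installed with `python setup.py install` (shapes are illustrative):

```
# element-wise IoU over two equal-length sets of rotated boxes
# box layout: [x_ctr, y_ctr, w, h, angle]
import numpy as np
import paddle
from ext_op import matched_rbox_iou

boxes_a = paddle.to_tensor(np.random.rand(1000, 5))
boxes_b = paddle.to_tensor(np.random.rand(1000, 5))  # same first dim as boxes_a

iou = matched_rbox_iou(boxes_a, boxes_b)  # shape [1000]; iou[i] pairs row i with row i
print(iou.shape)
```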
================================================
FILE: ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cu
================================================
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/

#include "../rbox_iou/rbox_iou_utils.h"
#include "paddle/extension.h"

template <typename T>
__global__ void matched_rbox_iou_cuda_kernel(const int rbox_num,
                                             const T *rbox1_data_ptr,
                                             const T *rbox2_data_ptr,
                                             T *output_data_ptr) {
  for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < rbox_num;
       tid += blockDim.x * gridDim.x) {
    output_data_ptr[tid] = rbox_iou_single<T>(rbox1_data_ptr + tid * 5,
                                              rbox2_data_ptr + tid * 5);
  }
}

#define CHECK_INPUT_GPU(x)                                                     \
  PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")

std::vector<paddle::Tensor>
MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
                          const paddle::Tensor &rbox2) {
  CHECK_INPUT_GPU(rbox1);
  CHECK_INPUT_GPU(rbox2);
  PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must be same dim");

  auto rbox_num = rbox1.shape()[0];
  auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::GPUPlace());

  const int thread_per_block = 512;
  const int block_per_grid = CeilDiv(rbox_num, thread_per_block);

  PD_DISPATCH_FLOATING_TYPES(
      rbox1.type(), "matched_rbox_iou_cuda_kernel", ([&] {
        matched_rbox_iou_cuda_kernel<data_t>
            <<<block_per_grid, thread_per_block, 0, rbox1.stream()>>>(
                rbox_num, rbox1.data<data_t>(), rbox2.data<data_t>(),
                output.data<data_t>());
      }));

  return {output};
}
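The nms_rotated sources that follow implement greedy, score-ordered non-maximum suppression over rotated boxes. As a reference for what they compute, here is a hedged NumPy sketch of the same scheme; `pairwise_iou` is an assumed stand-in for something like the `rbox_iou` op above, not part of the extension:

```
import numpy as np

def greedy_nms_rotated_sketch(boxes, scores, pairwise_iou, threshold=0.5):
    """boxes: [N, 5] rotated boxes, scores: [N]; returns kept indices."""
    order = np.argsort(-scores)            # visit boxes from highest score down
    ious = pairwise_iou(boxes, boxes)      # [N, N] matrix, e.g. rbox_iou(...).numpy()
    suppressed = np.zeros(len(boxes), dtype=bool)
    keep = []
    for i in order:
        if suppressed[i]:
            continue
        keep.append(i)                     # the highest-scoring survivor is kept
        suppressed |= ious[i] > threshold  # and silences everything it overlaps
    return np.array(keep, dtype=np.int64)
```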
================================================
FILE: ppdet/ext_op/csrc/nms_rotated/nms_rotated.cc
================================================
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "../rbox_iou/rbox_iou_utils.h"
#include "paddle/extension.h"

template <typename T>
void nms_rotated_cpu_kernel(const T *boxes_data, const float threshold,
                            const int64_t num_boxes, int64_t *num_keep_boxes,
                            int64_t *output_data) {
  int num_masks = CeilDiv(num_boxes, 64);
  std::vector<int64_t> masks(num_masks, 0);
  for (int64_t i = 0; i < num_boxes; ++i) {
    if (masks[i / 64] & 1ULL << (i % 64))
      continue;
    T box_1[5];
    for (int k = 0; k < 5; ++k) {
      box_1[k] = boxes_data[i * 5 + k];
    }
    for (int64_t j = i + 1; j < num_boxes; ++j) {
      if (masks[j / 64] & 1ULL << (j % 64))
        continue;
      T box_2[5];
      for (int k = 0; k < 5; ++k) {
        box_2[k] = boxes_data[j * 5 + k];
      }
      if (rbox_iou_single<T>(box_1, box_2) > threshold) {
        masks[j / 64] |= 1ULL << (j % 64);
      }
    }
  }
  int64_t output_data_idx = 0;
  for (int64_t i = 0; i < num_boxes; ++i) {
    if (masks[i / 64] & 1ULL << (i % 64))
      continue;
    output_data[output_data_idx++] = i;
  }
  *num_keep_boxes = output_data_idx;
  for (; output_data_idx < num_boxes; ++output_data_idx) {
    output_data[output_data_idx] = 0;
  }
}

#define CHECK_INPUT_CPU(x)                                                     \
  PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")

std::vector<paddle::Tensor> NMSRotatedCPUForward(const paddle::Tensor &boxes,
                                                 const paddle::Tensor &scores,
                                                 float threshold) {
  CHECK_INPUT_CPU(boxes);
  CHECK_INPUT_CPU(scores);

  auto num_boxes = boxes.shape()[0];

  auto order_t =
      std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true));
  auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0);

  auto keep =
      paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace());
  int64_t num_keep_boxes = 0;

  PD_DISPATCH_FLOATING_TYPES(boxes.type(), "nms_rotated_cpu_kernel", ([&] {
                               nms_rotated_cpu_kernel<data_t>(
                                   boxes_sorted.data<data_t>(), threshold,
                                   num_boxes, &num_keep_boxes,
                                   keep.data<int64_t>());
                             }));

  keep = keep.slice(0, num_keep_boxes);
  return {paddle::gather(order_t, keep, /* axis=*/0)};
}

#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor> NMSRotatedCUDAForward(const paddle::Tensor &boxes,
                                                  const paddle::Tensor &scores,
                                                  float threshold);
#endif

std::vector<paddle::Tensor> NMSRotatedForward(const paddle::Tensor &boxes,
                                              const paddle::Tensor &scores,
                                              float threshold) {
  if (boxes.is_cpu()) {
    return NMSRotatedCPUForward(boxes, scores, threshold);
#ifdef PADDLE_WITH_CUDA
  } else if (boxes.is_gpu()) {
    return NMSRotatedCUDAForward(boxes, scores, threshold);
#endif
  }
}

std::vector<std::vector<int64_t>>
NMSRotatedInferShape(std::vector<int64_t> boxes_shape,
                     std::vector<int64_t> scores_shape) {
  return {{-1}};
}

std::vector<paddle::DataType> NMSRotatedInferDtype(paddle::DataType t1,
                                                   paddle::DataType t2) {
  return {paddle::DataType::INT64};
}

PD_BUILD_OP(nms_rotated)
    .Inputs({"Boxes", "Scores"})
    .Outputs({"Output"})
    .Attrs({"threshold: float"})
    .SetKernelFn(PD_KERNEL(NMSRotatedForward))
    .SetInferShapeFn(PD_INFER_SHAPE(NMSRotatedInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(NMSRotatedInferDtype));

================================================
FILE: ppdet/ext_op/csrc/nms_rotated/nms_rotated.cu
================================================
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
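//
// This CUDA implementation mirrors the greedy CPU suppression above: boxes
// are tiled into 64-wide chunks, each thread computes a 64-bit overlap mask
// of one box against one chunk, and the per-box masks are reduced into the
// final keep list on the host.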
#include "../rbox_iou/rbox_iou_utils.h" #include "paddle/extension.h" static const int64_t threadsPerBlock = sizeof(int64_t) * 8; template __global__ void nms_rotated_cuda_kernel(const T *boxes_data, const float threshold, const int64_t num_boxes, int64_t *masks) { auto raw_start = blockIdx.y; auto col_start = blockIdx.x; if (raw_start > col_start) return; const int raw_last_storage = min(num_boxes - raw_start * threadsPerBlock, threadsPerBlock); const int col_last_storage = min(num_boxes - col_start * threadsPerBlock, threadsPerBlock); if (threadIdx.x < raw_last_storage) { int64_t mask = 0; auto current_box_idx = raw_start * threadsPerBlock + threadIdx.x; const T *current_box = boxes_data + current_box_idx * 5; for (int i = 0; i < col_last_storage; ++i) { const T *target_box = boxes_data + (col_start * threadsPerBlock + i) * 5; if (rbox_iou_single(current_box, target_box) > threshold) { mask |= 1ULL << i; } } const int blocks_per_line = CeilDiv(num_boxes, threadsPerBlock); masks[current_box_idx * blocks_per_line + col_start] = mask; } } #define CHECK_INPUT_GPU(x) \ PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") std::vector NMSRotatedCUDAForward(const paddle::Tensor &boxes, const paddle::Tensor &scores, float threshold) { CHECK_INPUT_GPU(boxes); CHECK_INPUT_GPU(scores); auto num_boxes = boxes.shape()[0]; auto order_t = std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true)); auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0); const auto blocks_per_line = CeilDiv(num_boxes, threadsPerBlock); dim3 block(threadsPerBlock); dim3 grid(blocks_per_line, blocks_per_line); auto mask_dev = paddle::empty({num_boxes * blocks_per_line}, paddle::DataType::INT64, paddle::GPUPlace()); PD_DISPATCH_FLOATING_TYPES( boxes.type(), "nms_rotated_cuda_kernel", ([&] { nms_rotated_cuda_kernel<<>>( boxes_sorted.data(), threshold, num_boxes, mask_dev.data()); })); auto mask_host = mask_dev.copy_to(paddle::CPUPlace(), true); auto keep_host = paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace()); int64_t *keep_host_ptr = keep_host.data(); int64_t *mask_host_ptr = mask_host.data(); std::vector remv(blocks_per_line); int64_t last_box_num = 0; for (int64_t i = 0; i < num_boxes; ++i) { auto remv_element_id = i / threadsPerBlock; auto remv_bit_id = i % threadsPerBlock; if (!(remv[remv_element_id] & 1ULL << remv_bit_id)) { keep_host_ptr[last_box_num++] = i; int64_t *current_mask = mask_host_ptr + i * blocks_per_line; for (auto j = remv_element_id; j < blocks_per_line; ++j) { remv[j] |= current_mask[j]; } } } keep_host = keep_host.slice(0, last_box_num); auto keep_dev = keep_host.copy_to(paddle::GPUPlace(), true); return {paddle::gather(order_t, keep_dev, /* axis=*/0)}; } ================================================ FILE: ppdet/ext_op/csrc/rbox_iou/rbox_iou.cc ================================================ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
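//
// rbox_iou computes the dense [N, M] IoU matrix between two sets of rotated
// boxes: the CPU kernel below loops over every pair, while rbox_iou.cu tiles
// the same computation over 2D thread blocks with shared-memory box caches.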
//
// The code is based on
// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/

#include "paddle/extension.h"
#include "rbox_iou_utils.h"

template <typename T>
void rbox_iou_cpu_kernel(const int rbox1_num, const int rbox2_num,
                         const T *rbox1_data_ptr, const T *rbox2_data_ptr,
                         T *output_data_ptr) {
  int i, j;
  for (i = 0; i < rbox1_num; i++) {
    for (j = 0; j < rbox2_num; j++) {
      int offset = i * rbox2_num + j;
      output_data_ptr[offset] =
          rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + j * 5);
    }
  }
}

#define CHECK_INPUT_CPU(x)                                                     \
  PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")

std::vector<paddle::Tensor> RboxIouCPUForward(const paddle::Tensor &rbox1,
                                              const paddle::Tensor &rbox2) {
  CHECK_INPUT_CPU(rbox1);
  CHECK_INPUT_CPU(rbox2);

  auto rbox1_num = rbox1.shape()[0];
  auto rbox2_num = rbox2.shape()[0];

  auto output = paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(),
                              paddle::CPUPlace());

  PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "rbox_iou_cpu_kernel", ([&] {
                               rbox_iou_cpu_kernel<data_t>(
                                   rbox1_num, rbox2_num, rbox1.data<data_t>(),
                                   rbox2.data<data_t>(),
                                   output.data<data_t>());
                             }));
  return {output};
}

#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,
                                               const paddle::Tensor &rbox2);
#endif

#define CHECK_INPUT_SAME(x1, x2)                                               \
  PD_CHECK(x1.place() == x2.place(), "inputs must be in the same place.")

std::vector<paddle::Tensor> RboxIouForward(const paddle::Tensor &rbox1,
                                           const paddle::Tensor &rbox2) {
  CHECK_INPUT_SAME(rbox1, rbox2);
  if (rbox1.is_cpu()) {
    return RboxIouCPUForward(rbox1, rbox2);
#ifdef PADDLE_WITH_CUDA
  } else if (rbox1.is_gpu()) {
    return RboxIouCUDAForward(rbox1, rbox2);
#endif
  }
}

std::vector<std::vector<int64_t>>
RboxIouInferShape(std::vector<int64_t> rbox1_shape,
                  std::vector<int64_t> rbox2_shape) {
  return {{rbox1_shape[0], rbox2_shape[0]}};
}

std::vector<paddle::DataType> RboxIouInferDtype(paddle::DataType t1,
                                                paddle::DataType t2) {
  return {t1};
}

PD_BUILD_OP(rbox_iou)
    .Inputs({"RBox1", "RBox2"})
    .Outputs({"Output"})
    .SetKernelFn(PD_KERNEL(RboxIouForward))
    .SetInferShapeFn(PD_INFER_SHAPE(RboxIouInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(RboxIouInferDtype));

================================================
FILE: ppdet/ext_op/csrc/rbox_iou/rbox_iou.cu
================================================
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// // The code is based on // https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/ #include "paddle/extension.h" #include "rbox_iou_utils.h" // 2D block with 32 * 16 = 512 threads per block const int BLOCK_DIM_X = 32; const int BLOCK_DIM_Y = 16; template __global__ void rbox_iou_cuda_kernel(const int rbox1_num, const int rbox2_num, const T *rbox1_data_ptr, const T *rbox2_data_ptr, T *output_data_ptr) { // get row_start and col_start const int rbox1_block_idx = blockIdx.x * blockDim.x; const int rbox2_block_idx = blockIdx.y * blockDim.y; const int rbox1_thread_num = min(rbox1_num - rbox1_block_idx, blockDim.x); const int rbox2_thread_num = min(rbox2_num - rbox2_block_idx, blockDim.y); __shared__ T block_boxes1[BLOCK_DIM_X * 5]; __shared__ T block_boxes2[BLOCK_DIM_Y * 5]; // It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y if (threadIdx.x < rbox1_thread_num && threadIdx.y == 0) { block_boxes1[threadIdx.x * 5 + 0] = rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 0]; block_boxes1[threadIdx.x * 5 + 1] = rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 1]; block_boxes1[threadIdx.x * 5 + 2] = rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 2]; block_boxes1[threadIdx.x * 5 + 3] = rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 3]; block_boxes1[threadIdx.x * 5 + 4] = rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 4]; } // threadIdx.x < BLOCK_DIM_Y=rbox2_thread_num, just use same condition as // above: threadIdx.y == 0 if (threadIdx.x < rbox2_thread_num && threadIdx.y == 0) { block_boxes2[threadIdx.x * 5 + 0] = rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 0]; block_boxes2[threadIdx.x * 5 + 1] = rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 1]; block_boxes2[threadIdx.x * 5 + 2] = rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 2]; block_boxes2[threadIdx.x * 5 + 3] = rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 3]; block_boxes2[threadIdx.x * 5 + 4] = rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 4]; } // sync __syncthreads(); if (threadIdx.x < rbox1_thread_num && threadIdx.y < rbox2_thread_num) { int offset = (rbox1_block_idx + threadIdx.x) * rbox2_num + rbox2_block_idx + threadIdx.y; output_data_ptr[offset] = rbox_iou_single( block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5); } } #define CHECK_INPUT_GPU(x) \ PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") std::vector RboxIouCUDAForward(const paddle::Tensor &rbox1, const paddle::Tensor &rbox2) { CHECK_INPUT_GPU(rbox1); CHECK_INPUT_GPU(rbox2); auto rbox1_num = rbox1.shape()[0]; auto rbox2_num = rbox2.shape()[0]; auto output = paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(), paddle::GPUPlace()); const int blocks_x = CeilDiv(rbox1_num, BLOCK_DIM_X); const int blocks_y = CeilDiv(rbox2_num, BLOCK_DIM_Y); dim3 blocks(blocks_x, blocks_y); dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); PD_DISPATCH_FLOATING_TYPES( rbox1.type(), "rbox_iou_cuda_kernel", ([&] { rbox_iou_cuda_kernel<<>>( rbox1_num, rbox2_num, rbox1.data(), rbox2.data(), output.data()); })); return {output}; } ================================================ FILE: ppdet/ext_op/csrc/rbox_iou/rbox_iou_utils.h ================================================ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // The code is based on // https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/ #pragma once #include #include #include #ifdef __CUDACC__ // Designates functions callable from the host (CPU) and the device (GPU) #define HOST_DEVICE __host__ __device__ #define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__ #else #include #define HOST_DEVICE #define HOST_DEVICE_INLINE HOST_DEVICE inline #endif namespace { template struct RotatedBox { T x_ctr, y_ctr, w, h, a; }; template struct Point { T x, y; HOST_DEVICE_INLINE Point(const T &px = 0, const T &py = 0) : x(px), y(py) {} HOST_DEVICE_INLINE Point operator+(const Point &p) const { return Point(x + p.x, y + p.y); } HOST_DEVICE_INLINE Point &operator+=(const Point &p) { x += p.x; y += p.y; return *this; } HOST_DEVICE_INLINE Point operator-(const Point &p) const { return Point(x - p.x, y - p.y); } HOST_DEVICE_INLINE Point operator*(const T coeff) const { return Point(x * coeff, y * coeff); } }; template HOST_DEVICE_INLINE T dot_2d(const Point &A, const Point &B) { return A.x * B.x + A.y * B.y; } template HOST_DEVICE_INLINE T cross_2d(const Point &A, const Point &B) { return A.x * B.y - B.x * A.y; } template HOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox &box, Point (&pts)[4]) { // M_PI / 180. == 0.01745329251 // double theta = box.a * 0.01745329251; // MODIFIED double theta = box.a; T cosTheta2 = (T)cos(theta) * 0.5f; T sinTheta2 = (T)sin(theta) * 0.5f; // y: top --> down; x: left --> right pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; pts[2].x = 2 * box.x_ctr - pts[0].x; pts[2].y = 2 * box.y_ctr - pts[0].y; pts[3].x = 2 * box.x_ctr - pts[1].x; pts[3].y = 2 * box.y_ctr - pts[1].y; } template HOST_DEVICE_INLINE int get_intersection_points(const Point (&pts1)[4], const Point (&pts2)[4], Point (&intersections)[24]) { // Line vector // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] Point vec1[4], vec2[4]; for (int i = 0; i < 4; i++) { vec1[i] = pts1[(i + 1) % 4] - pts1[i]; vec2[i] = pts2[(i + 1) % 4] - pts2[i]; } // Line test - test all line combos for intersection int num = 0; // number of intersections for (int i = 0; i < 4; i++) { for (int j = 0; j < 4; j++) { // Solve for 2x2 Ax=b T det = cross_2d(vec2[j], vec1[i]); // This takes care of parallel lines if (fabs(det) <= 1e-14) { continue; } auto vec12 = pts2[j] - pts1[i]; T t1 = cross_2d(vec2[j], vec12) / det; T t2 = cross_2d(vec1[i], vec12) / det; if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) { intersections[num++] = pts1[i] + vec1[i] * t1; } } } // Check for vertices of rect1 inside rect2 { const auto &AB = vec2[0]; const auto &DA = vec2[3]; auto ABdotAB = dot_2d(AB, AB); auto ADdotAD = dot_2d(DA, DA); for (int i = 0; i < 4; i++) { // assume ABCD is the rectangle, and P is the point to be judged // P is inside ABCD iff. 
P's projection on AB lies within AB // and P's projection on AD lies within AD auto AP = pts1[i] - pts2[0]; auto APdotAB = dot_2d(AP, AB); auto APdotAD = -dot_2d(AP, DA); if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) { intersections[num++] = pts1[i]; } } } // Reverse the check - check for vertices of rect2 inside rect1 { const auto &AB = vec1[0]; const auto &DA = vec1[3]; auto ABdotAB = dot_2d(AB, AB); auto ADdotAD = dot_2d(DA, DA); for (int i = 0; i < 4; i++) { auto AP = pts2[i] - pts1[0]; auto APdotAB = dot_2d(AP, AB); auto APdotAD = -dot_2d(AP, DA); if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) { intersections[num++] = pts2[i]; } } } return num; } template HOST_DEVICE_INLINE int convex_hull_graham(const Point (&p)[24], const int &num_in, Point (&q)[24], bool shift_to_zero = false) { assert(num_in >= 2); // Step 1: // Find point with minimum y // if more than 1 points have the same minimum y, // pick the one with the minimum x. int t = 0; for (int i = 1; i < num_in; i++) { if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { t = i; } } auto &start = p[t]; // starting point // Step 2: // Subtract starting point from every points (for sorting in the next step) for (int i = 0; i < num_in; i++) { q[i] = p[i] - start; } // Swap the starting point to position 0 auto tmp = q[0]; q[0] = q[t]; q[t] = tmp; // Step 3: // Sort point 1 ~ num_in according to their relative cross-product values // (essentially sorting according to angles) // If the angles are the same, sort according to their distance to origin T dist[24]; for (int i = 0; i < num_in; i++) { dist[i] = dot_2d(q[i], q[i]); } #ifdef __CUDACC__ // CUDA version // In the future, we can potentially use thrust // for sorting here to improve speed (though not guaranteed) for (int i = 1; i < num_in - 1; i++) { for (int j = i + 1; j < num_in; j++) { T crossProduct = cross_2d(q[i], q[j]); if ((crossProduct < -1e-6) || (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) { auto q_tmp = q[i]; q[i] = q[j]; q[j] = q_tmp; auto dist_tmp = dist[i]; dist[i] = dist[j]; dist[j] = dist_tmp; } } } #else // CPU version std::sort(q + 1, q + num_in, [](const Point &A, const Point &B) -> bool { T temp = cross_2d(A, B); if (fabs(temp) < 1e-6) { return dot_2d(A, A) < dot_2d(B, B); } else { return temp > 0; } }); #endif // Step 4: // Make sure there are at least 2 points (that don't overlap with each other) // in the stack int k; // index of the non-overlapped second point for (k = 1; k < num_in; k++) { if (dist[k] > 1e-8) { break; } } if (k == num_in) { // We reach the end, which means the convex hull is just one point q[0] = p[t]; return 1; } q[1] = q[k]; int m = 2; // 2 points in the stack // Step 5: // Finally we can start the scanning process. // When a non-convex relationship between the 3 points is found // (either concave shape or duplicated points), // we pop the previous point from the stack // until the 3-point relationship is convex again, or // until the stack only contains two points for (int i = k + 1; i < num_in; i++) { while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) { m--; } q[m++] = q[i]; } // Step 6 (Optional): // In general sense we need the original coordinates, so we // need to shift the points back (reverting Step 2) // But if we're only interested in getting the area/perimeter of the shape // We can simply return. 
if (!shift_to_zero) { for (int i = 0; i < m; i++) { q[i] += start; } } return m; } template HOST_DEVICE_INLINE T polygon_area(const Point (&q)[24], const int &m) { if (m <= 2) { return 0; } T area = 0; for (int i = 1; i < m - 1; i++) { area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); } return area / 2.0; } template HOST_DEVICE_INLINE T rboxes_intersection(const RotatedBox &box1, const RotatedBox &box2) { // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned // from rotated_rect_intersection_pts Point intersectPts[24], orderedPts[24]; Point pts1[4]; Point pts2[4]; get_rotated_vertices(box1, pts1); get_rotated_vertices(box2, pts2); int num = get_intersection_points(pts1, pts2, intersectPts); if (num <= 2) { return 0.0; } // Convex Hull to order the intersection points in clockwise order and find // the contour area. int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); return polygon_area(orderedPts, num_convex); } } // namespace template HOST_DEVICE_INLINE T rbox_iou_single(T const *const box1_raw, T const *const box2_raw) { // shift center to the middle point to achieve higher precision in result RotatedBox box1, box2; auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; box1.x_ctr = box1_raw[0] - center_shift_x; box1.y_ctr = box1_raw[1] - center_shift_y; box1.w = box1_raw[2]; box1.h = box1_raw[3]; box1.a = box1_raw[4]; box2.x_ctr = box2_raw[0] - center_shift_x; box2.y_ctr = box2_raw[1] - center_shift_y; box2.w = box2_raw[2]; box2.h = box2_raw[3]; box2.a = box2_raw[4]; if (box1.w < 1e-2 || box1.h < 1e-2 || box2.w < 1e-2 || box2.h < 1e-2) { return 0.f; } const T area1 = box1.w * box1.h; const T area2 = box2.w * box2.h; const T intersection = rboxes_intersection(box1, box2); const T iou = intersection / (area1 + area2 - intersection); return iou; } /** Computes ceil(a / b) */ HOST_DEVICE inline int CeilDiv(const int a, const int b) { return (a + b - 1) / b; } ================================================ FILE: ppdet/ext_op/setup.py ================================================ import os import glob import paddle from paddle.utils.cpp_extension import CppExtension, CUDAExtension, setup def get_extensions(): root_dir = os.path.dirname(os.path.abspath(__file__)) ext_root_dir = os.path.join(root_dir, 'csrc') sources = [] for ext_name in os.listdir(ext_root_dir): ext_dir = os.path.join(ext_root_dir, ext_name) source = glob.glob(os.path.join(ext_dir, '*.cc')) kwargs = dict() if paddle.device.is_compiled_with_cuda(): source += glob.glob(os.path.join(ext_dir, '*.cu')) if not source: continue sources += source if paddle.device.is_compiled_with_cuda(): extension = CUDAExtension( sources, extra_compile_args={'cxx': ['-DPADDLE_WITH_CUDA']}) else: extension = CppExtension(sources) return extension if __name__ == "__main__": setup(name='ext_op', ext_modules=get_extensions()) ================================================ FILE: ppdet/ext_op/unittest/test_matched_rbox_iou.py ================================================ import numpy as np import sys import time from shapely.geometry import Polygon import paddle import unittest from ext_op import matched_rbox_iou def rbox2poly_single(rrect, get_best_begin_point=False): """ rrect:[x_ctr,y_ctr,w,h,angle] to poly:[x0,y0,x1,y1,x2,y2,x3,y3] """ x_ctr, y_ctr, width, height, angle = rrect[:5] tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 # rect 2x4 rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) R = 
np.array([[np.cos(angle), -np.sin(angle)], [np.sin(angle), np.cos(angle)]]) # poly poly = R.dot(rect) x0, x1, x2, x3 = poly[0, :4] + x_ctr y0, y1, y2, y3 = poly[1, :4] + y_ctr poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float64) return poly def intersection(g, p): """ Intersection. """ g = g[:8].reshape((4, 2)) p = p[:8].reshape((4, 2)) a = g b = p use_filter = True if use_filter: # step1: inter_x1 = np.maximum(np.min(a[:, 0]), np.min(b[:, 0])) inter_x2 = np.minimum(np.max(a[:, 0]), np.max(b[:, 0])) inter_y1 = np.maximum(np.min(a[:, 1]), np.min(b[:, 1])) inter_y2 = np.minimum(np.max(a[:, 1]), np.max(b[:, 1])) if inter_x1 >= inter_x2 or inter_y1 >= inter_y2: return 0. x1 = np.minimum(np.min(a[:, 0]), np.min(b[:, 0])) x2 = np.maximum(np.max(a[:, 0]), np.max(b[:, 0])) y1 = np.minimum(np.min(a[:, 1]), np.min(b[:, 1])) y2 = np.maximum(np.max(a[:, 1]), np.max(b[:, 1])) if x1 >= x2 or y1 >= y2 or (x2 - x1) < 2 or (y2 - y1) < 2: return 0. g = Polygon(g) p = Polygon(p) if not g.is_valid or not p.is_valid: return 0 inter = Polygon(g).intersection(Polygon(p)).area union = g.area + p.area - inter if union == 0: return 0 else: return inter / union def matched_rbox_overlaps(anchors, gt_bboxes, use_cv2=False): """ Args: anchors: [M, 5] x1,y1,x2,y2,angle gt_bboxes: [M, 5] x1,y1,x2,y2,angle Returns: macthed_iou: [M] """ assert anchors.shape[1] == 5 assert gt_bboxes.shape[1] == 5 gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes] anchors_ploy = [rbox2poly_single(e) for e in anchors] num = len(anchors_ploy) iou = np.zeros((num, ), dtype=np.float64) start_time = time.time() for i in range(num): try: iou[i] = intersection(gt_bboxes_ploy[i], anchors_ploy[i]) except Exception as e: print('cur gt_bboxes_ploy[i]', gt_bboxes_ploy[i], 'anchors_ploy[j]', anchors_ploy[i], e) return iou def gen_sample(n): rbox = np.random.rand(n, 5) rbox[:, 0:4] = rbox[:, 0:4] * 0.45 + 0.001 rbox[:, 4] = rbox[:, 4] - 0.5 return rbox class MatchedRBoxIoUTest(unittest.TestCase): def setUp(self): self.initTestCase() self.rbox1 = gen_sample(self.n) self.rbox2 = gen_sample(self.n) def initTestCase(self): self.n = 1000 def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2): self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg) def get_places(self): places = [paddle.CPUPlace()] if paddle.device.is_compiled_with_cuda(): places.append(paddle.CUDAPlace(0)) return places def check_output(self, place): paddle.disable_static() pd_rbox1 = paddle.to_tensor(self.rbox1, place=place) pd_rbox2 = paddle.to_tensor(self.rbox2, place=place) actual_t = matched_rbox_iou(pd_rbox1, pd_rbox2).numpy() poly_rbox1 = self.rbox1 poly_rbox2 = self.rbox2 poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024 poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024 expect_t = matched_rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False) self.assertAllClose( actual_t, expect_t, msg="rbox_iou has diff at {} \nExpect {}\nBut got {}".format( str(place), str(expect_t), str(actual_t))) def test_output(self): places = self.get_places() for place in places: self.check_output(place) if __name__ == "__main__": unittest.main() ================================================ FILE: ppdet/ext_op/unittest/test_rbox_iou.py ================================================ import numpy as np import sys import time from shapely.geometry import Polygon import paddle import unittest from ext_op import rbox_iou def rbox2poly_single(rrect, get_best_begin_point=False): """ rrect:[x_ctr,y_ctr,w,h,angle] to poly:[x0,y0,x1,y1,x2,y2,x3,y3] """ x_ctr, y_ctr, width, height, angle = 
rrect[:5] tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 # rect 2x4 rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) R = np.array([[np.cos(angle), -np.sin(angle)], [np.sin(angle), np.cos(angle)]]) # poly poly = R.dot(rect) x0, x1, x2, x3 = poly[0, :4] + x_ctr y0, y1, y2, y3 = poly[1, :4] + y_ctr poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float64) return poly def intersection(g, p): """ Intersection. """ g = g[:8].reshape((4, 2)) p = p[:8].reshape((4, 2)) a = g b = p use_filter = True if use_filter: # step1: inter_x1 = np.maximum(np.min(a[:, 0]), np.min(b[:, 0])) inter_x2 = np.minimum(np.max(a[:, 0]), np.max(b[:, 0])) inter_y1 = np.maximum(np.min(a[:, 1]), np.min(b[:, 1])) inter_y2 = np.minimum(np.max(a[:, 1]), np.max(b[:, 1])) if inter_x1 >= inter_x2 or inter_y1 >= inter_y2: return 0. x1 = np.minimum(np.min(a[:, 0]), np.min(b[:, 0])) x2 = np.maximum(np.max(a[:, 0]), np.max(b[:, 0])) y1 = np.minimum(np.min(a[:, 1]), np.min(b[:, 1])) y2 = np.maximum(np.max(a[:, 1]), np.max(b[:, 1])) if x1 >= x2 or y1 >= y2 or (x2 - x1) < 2 or (y2 - y1) < 2: return 0. g = Polygon(g) p = Polygon(p) if not g.is_valid or not p.is_valid: return 0 inter = Polygon(g).intersection(Polygon(p)).area union = g.area + p.area - inter if union == 0: return 0 else: return inter / union def rbox_overlaps(anchors, gt_bboxes, use_cv2=False): """ Args: anchors: [NA, 5] x1,y1,x2,y2,angle gt_bboxes: [M, 5] x1,y1,x2,y2,angle Returns: iou: [NA, M] """ assert anchors.shape[1] == 5 assert gt_bboxes.shape[1] == 5 gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes] anchors_ploy = [rbox2poly_single(e) for e in anchors] num_gt, num_anchors = len(gt_bboxes_ploy), len(anchors_ploy) iou = np.zeros((num_anchors, num_gt), dtype=np.float64) start_time = time.time() for i in range(num_anchors): for j in range(num_gt): try: iou[i, j] = intersection(anchors_ploy[i], gt_bboxes_ploy[j]) except Exception as e: print('cur anchors_ploy[i]', anchors_ploy[i], 'gt_bboxes_ploy[j]', gt_bboxes_ploy[j], e) return iou def gen_sample(n): rbox = np.random.rand(n, 5) rbox[:, 0:4] = rbox[:, 0:4] * 0.45 + 0.001 rbox[:, 4] = rbox[:, 4] - 0.5 return rbox class RBoxIoUTest(unittest.TestCase): def setUp(self): self.initTestCase() self.rbox1 = gen_sample(self.n) self.rbox2 = gen_sample(self.m) def initTestCase(self): self.n = 13000 self.m = 7 def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2): self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg) def get_places(self): places = [paddle.CPUPlace()] if paddle.device.is_compiled_with_cuda(): places.append(paddle.CUDAPlace(0)) return places def check_output(self, place): paddle.disable_static() pd_rbox1 = paddle.to_tensor(self.rbox1, place=place) pd_rbox2 = paddle.to_tensor(self.rbox2, place=place) actual_t = rbox_iou(pd_rbox1, pd_rbox2).numpy() poly_rbox1 = self.rbox1 poly_rbox2 = self.rbox2 poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024 poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024 expect_t = rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False) self.assertAllClose( actual_t, expect_t, msg="rbox_iou has diff at {} \nExpect {}\nBut got {}".format( str(place), str(expect_t), str(actual_t))) def test_output(self): places = self.get_places() for place in places: self.check_output(place) if __name__ == "__main__": unittest.main() ================================================ FILE: ppdet/metrics/__init__.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import metrics
from . import keypoint_metrics

from .metrics import *
from .keypoint_metrics import *
from .pose3d_metrics import *

__all__ = metrics.__all__ + keypoint_metrics.__all__

from . import mot_metrics
from .mot_metrics import *
# accumulate the exported names rather than overwriting the earlier list
__all__ += mot_metrics.__all__

from . import mcmot_metrics
from .mcmot_metrics import *
__all__ += mcmot_metrics.__all__

from . import culane_metrics
from .culane_metrics import *
__all__ += culane_metrics.__all__

================================================
FILE: ppdet/metrics/coco_utils.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import numpy as np
import itertools

from ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res, get_pose3d_res
from ppdet.metrics.map_utils import draw_pr_curve

from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)


def get_infer_results(outs, catid, bias=0, save_threshold=0):
    """
    Get results at the inference stage.
    The output format is a dictionary containing the bbox or mask results.

    For example, the bbox result is a list whose elements each contain
    image_id, category_id, bbox and score.
    """
    if outs is None or len(outs) == 0:
        raise ValueError(
            'The number of valid detection results is zero. Please use a reasonable model and check the input data.'
        )

    im_id = outs['im_id']
    im_file = outs['im_file'] if 'im_file' in outs else None

    infer_res = {}
    if 'bbox' in outs:
        if len(outs['bbox']) > 0 and len(outs['bbox'][0]) > 6:
            infer_res['bbox'] = get_det_poly_res(
                outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias)
        else:
            infer_res['bbox'] = get_det_res(
                outs['bbox'],
                outs['bbox_num'],
                im_id,
                catid,
                bias=bias,
                im_file=im_file,
                save_threshold=save_threshold)

    if 'mask' in outs:
        # mask post process
        infer_res['mask'] = get_seg_res(outs['mask'], outs['bbox'],
                                        outs['bbox_num'], im_id, catid)

    if 'segm' in outs:
        infer_res['segm'] = get_solov2_segm_res(outs, im_id, catid)

    if 'keypoint' in outs:
        infer_res['keypoint'] = get_keypoint_res(outs, im_id)
        outs['bbox_num'] = [len(infer_res['keypoint'])]

    if 'pose3d' in outs:
        infer_res['pose3d'] = get_pose3d_res(outs, im_id)
        outs['bbox_num'] = [len(infer_res['pose3d'])]

    return infer_res
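A hedged usage sketch for `cocoapi_eval`, which is defined next; the result file and annotation paths below are placeholders, not files shipped with this repo:

```
# illustrative call, assuming a COCO-format results file has already been written
from ppdet.metrics.coco_utils import cocoapi_eval

stats = cocoapi_eval(
    'output/bbox.json',  # placeholder: detections in COCO result format
    'bbox',              # COCOeval style
    anno_file='dataset/coco/annotations/instances_val2017.json',  # placeholder
    classwise=True)      # also prints per-category AP and saves P-R curves
print('mAP(0.50:0.95) =', stats[0])  # COCOeval stats[0] is AP at IoU .50:.95
```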
def cocoapi_eval(jsonfile,
                 style,
                 coco_gt=None,
                 anno_file=None,
                 max_dets=(100, 300, 1000),
                 classwise=False,
                 sigmas=None,
                 use_area=True):
    """
    Args:
        jsonfile (str): Evaluation json file, eg: bbox.json, mask.json.
        style (str): COCOeval style, can be `bbox` , `segm` , `proposal`,
            `keypoints` and `keypoints_crowd`.
        coco_gt (COCO): A loaded COCO API object; if None, it is built from
            anno_file, eg: coco_gt = COCO(anno_file).
        anno_file (str): COCO annotations file.
        max_dets (tuple): COCO evaluation maxDets.
        classwise (bool): Whether to compute per-category AP and draw a P-R curve.
        sigmas (nparray): keypoint labelling sigmas.
        use_area (bool): If gt annotations (eg. CrowdPose, AIC)
            do not have 'area', please set use_area=False.
    """
    assert coco_gt != None or anno_file != None
    if style == 'keypoints_crowd':
        # please install xtcocotools==1.6
        from xtcocotools.coco import COCO
        from xtcocotools.cocoeval import COCOeval
    else:
        from pycocotools.coco import COCO
        try:
            from .fast_cocoeval import FastCOCOeval as COCOeval
        except:
            from pycocotools.cocoeval import COCOeval

    if coco_gt == None:
        coco_gt = COCO(anno_file)
    logger.info("Start evaluate...")
    coco_dt = coco_gt.loadRes(jsonfile)
    if style == 'proposal':
        coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
        coco_eval.params.useCats = 0
        coco_eval.params.maxDets = list(max_dets)
    elif style == 'keypoints_crowd':
        coco_eval = COCOeval(coco_gt, coco_dt, style, sigmas, use_area)
    else:
        coco_eval = COCOeval(coco_gt, coco_dt, style)
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    if classwise:
        # Compute per-category AP and PR curve
        try:
            from terminaltables import AsciiTable
        except Exception as e:
            logger.error(
                'terminaltables not found, please install terminaltables.
' 'for example: `pip install terminaltables`.') raise e precisions = coco_eval.eval['precision'] cat_ids = coco_gt.getCatIds() # precision: (iou, recall, cls, area range, max dets) assert len(cat_ids) == precisions.shape[2] results_per_category = [] for idx, catId in enumerate(cat_ids): # area range index 0: all area ranges # max dets index -1: typically 100 per image nm = coco_gt.loadCats(catId)[0] precision = precisions[:, :, idx, 0, -1] precision = precision[precision > -1] if precision.size: ap = np.mean(precision) else: ap = float('nan') results_per_category.append( (str(nm["name"]), '{:0.3f}'.format(float(ap)))) pr_array = precisions[0, :, idx, 0, 2] recall_array = np.arange(0.0, 1.01, 0.01) draw_pr_curve( pr_array, recall_array, out_dir=style + '_pr_curve', file_name='{}_precision_recall_curve.jpg'.format(nm["name"])) num_columns = min(6, len(results_per_category) * 2) results_flatten = list(itertools.chain(*results_per_category)) headers = ['category', 'AP'] * (num_columns // 2) results_2d = itertools.zip_longest( *[results_flatten[i::num_columns] for i in range(num_columns)]) table_data = [headers] table_data += [result for result in results_2d] table = AsciiTable(table_data) logger.info('Per-category of {} AP: \n{}'.format(style, table.table)) logger.info("per-category PR curve has output to {} folder.".format( style + '_pr_curve')) # flush coco evaluation result sys.stdout.flush() return coco_eval.stats def json_eval_results(metric, json_directory, dataset): """ cocoapi eval with already exists proposal.json, bbox.json or mask.json """ assert metric == 'COCO' anno_file = dataset.get_anno() json_file_list = ['proposal.json', 'bbox.json', 'mask.json'] if json_directory: assert os.path.exists( json_directory), "The json directory:{} does not exist".format( json_directory) for k, v in enumerate(json_file_list): json_file_list[k] = os.path.join(str(json_directory), v) coco_eval_style = ['proposal', 'bbox', 'segm'] for i, v_json in enumerate(json_file_list): if os.path.exists(v_json): cocoapi_eval(v_json, coco_eval_style[i], anno_file=anno_file) else: logger.info("{} not exists!".format(v_json)) ================================================ FILE: ppdet/metrics/culane_metrics.py ================================================ import os import cv2 import numpy as np import os.path as osp from functools import partial from .metrics import Metric from scipy.interpolate import splprep, splev from scipy.optimize import linear_sum_assignment from shapely.geometry import LineString, Polygon from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'draw_lane', 'discrete_cross_iou', 'continuous_cross_iou', 'interp', 'culane_metric', 'load_culane_img_data', 'load_culane_data', 'eval_predictions', "CULaneMetric" ] LIST_FILE = { 'train': 'list/train_gt.txt', 'val': 'list/val.txt', 'test': 'list/test.txt', } CATEGORYS = { 'normal': 'list/test_split/test0_normal.txt', 'crowd': 'list/test_split/test1_crowd.txt', 'hlight': 'list/test_split/test2_hlight.txt', 'shadow': 'list/test_split/test3_shadow.txt', 'noline': 'list/test_split/test4_noline.txt', 'arrow': 'list/test_split/test5_arrow.txt', 'curve': 'list/test_split/test6_curve.txt', 'cross': 'list/test_split/test7_cross.txt', 'night': 'list/test_split/test8_night.txt', } def draw_lane(lane, img=None, img_shape=None, width=30): if img is None: img = np.zeros(img_shape, dtype=np.uint8) lane = lane.astype(np.int32) for p1, p2 in zip(lane[:-1], lane[1:]): cv2.line( img, tuple(p1), tuple(p2), color=(255, 255, 255), 
thickness=width) return img def discrete_cross_iou(xs, ys, width=30, img_shape=(590, 1640, 3)): xs = [draw_lane(lane, img_shape=img_shape, width=width) > 0 for lane in xs] ys = [draw_lane(lane, img_shape=img_shape, width=width) > 0 for lane in ys] ious = np.zeros((len(xs), len(ys))) for i, x in enumerate(xs): for j, y in enumerate(ys): ious[i, j] = (x & y).sum() / (x | y).sum() return ious def continuous_cross_iou(xs, ys, width=30, img_shape=(590, 1640, 3)): h, w, _ = img_shape image = Polygon([(0, 0), (0, h - 1), (w - 1, h - 1), (w - 1, 0)]) xs = [ LineString(lane).buffer( distance=width / 2., cap_style=1, join_style=2).intersection(image) for lane in xs ] ys = [ LineString(lane).buffer( distance=width / 2., cap_style=1, join_style=2).intersection(image) for lane in ys ] ious = np.zeros((len(xs), len(ys))) for i, x in enumerate(xs): for j, y in enumerate(ys): ious[i, j] = x.intersection(y).area / x.union(y).area return ious def interp(points, n=50): x = [x for x, _ in points] y = [y for _, y in points] tck, u = splprep([x, y], s=0, t=n, k=min(3, len(points) - 1)) u = np.linspace(0., 1., num=(len(u) - 1) * n + 1) return np.array(splev(u, tck)).T def culane_metric(pred, anno, width=30, iou_thresholds=[0.5], official=True, img_shape=(590, 1640, 3)): _metric = {} for thr in iou_thresholds: tp = 0 fp = 0 if len(anno) != 0 else len(pred) fn = 0 if len(pred) != 0 else len(anno) _metric[thr] = [tp, fp, fn] interp_pred = np.array( [interp( pred_lane, n=5) for pred_lane in pred], dtype=object) # (4, 50, 2) interp_anno = np.array( [interp( anno_lane, n=5) for anno_lane in anno], dtype=object) # (4, 50, 2) if official: ious = discrete_cross_iou( interp_pred, interp_anno, width=width, img_shape=img_shape) else: ious = continuous_cross_iou( interp_pred, interp_anno, width=width, img_shape=img_shape) row_ind, col_ind = linear_sum_assignment(1 - ious) _metric = {} for thr in iou_thresholds: tp = int((ious[row_ind, col_ind] > thr).sum()) fp = len(pred) - tp fn = len(anno) - tp _metric[thr] = [tp, fp, fn] return _metric def load_culane_img_data(path): with open(path, 'r') as data_file: img_data = data_file.readlines() img_data = [line.split() for line in img_data] img_data = [list(map(float, lane)) for lane in img_data] img_data = [[(lane[i], lane[i + 1]) for i in range(0, len(lane), 2)] for lane in img_data] img_data = [lane for lane in img_data if len(lane) >= 2] return img_data def load_culane_data(data_dir, file_list_path): with open(file_list_path, 'r') as file_list: filepaths = [ os.path.join(data_dir, line[1 if line[0] == '/' else 0:].rstrip().replace( '.jpg', '.lines.txt')) for line in file_list.readlines() ] data = [] for path in filepaths: img_data = load_culane_img_data(path) data.append(img_data) return data def eval_predictions(pred_dir, anno_dir, list_path, iou_thresholds=[0.5], width=30, official=True, sequential=False): logger.info('Calculating metric for List: {}'.format(list_path)) predictions = load_culane_data(pred_dir, list_path) annotations = load_culane_data(anno_dir, list_path) img_shape = (590, 1640, 3) if sequential: results = map(partial( culane_metric, width=width, official=official, iou_thresholds=iou_thresholds, img_shape=img_shape), predictions, annotations) else: from multiprocessing import Pool, cpu_count from itertools import repeat with Pool(cpu_count()) as p: results = p.starmap(culane_metric, zip(predictions, annotations, repeat(width), repeat(iou_thresholds), repeat(official), repeat(img_shape))) mean_f1, mean_prec, mean_recall, total_tp, total_fp, total_fn = 0, 0, 0, 
0, 0, 0 ret = {} for thr in iou_thresholds: tp = sum(m[thr][0] for m in results) fp = sum(m[thr][1] for m in results) fn = sum(m[thr][2] for m in results) precision = float(tp) / (tp + fp) if tp != 0 else 0 recall = float(tp) / (tp + fn) if tp != 0 else 0 f1 = 2 * precision * recall / (precision + recall) if tp != 0 else 0 logger.info('iou thr: {:.2f}, tp: {}, fp: {}, fn: {},' 'precision: {}, recall: {}, f1: {}'.format( thr, tp, fp, fn, precision, recall, f1)) mean_f1 += f1 / len(iou_thresholds) mean_prec += precision / len(iou_thresholds) mean_recall += recall / len(iou_thresholds) total_tp += tp total_fp += fp total_fn += fn ret[thr] = { 'TP': tp, 'FP': fp, 'FN': fn, 'Precision': precision, 'Recall': recall, 'F1': f1 } if len(iou_thresholds) > 2: logger.info( 'mean result, total_tp: {}, total_fp: {}, total_fn: {},' 'precision: {}, recall: {}, f1: {}'.format( total_tp, total_fp, total_fn, mean_prec, mean_recall, mean_f1)) ret['mean'] = { 'TP': total_tp, 'FP': total_fp, 'FN': total_fn, 'Precision': mean_prec, 'Recall': mean_recall, 'F1': mean_f1 } return ret class CULaneMetric(Metric): def __init__(self, cfg, output_eval=None, split="test", dataset_dir="dataset/CULane/"): super(CULaneMetric, self).__init__() self.output_eval = "evaluation" if output_eval is None else output_eval self.dataset_dir = dataset_dir self.split = split self.list_path = osp.join(dataset_dir, LIST_FILE[split]) self.predictions = [] self.img_names = [] self.lanes = [] self.eval_results = {} self.cfg = cfg self.reset() def reset(self): self.predictions = [] self.img_names = [] self.lanes = [] self.eval_results = {} def get_prediction_string(self, pred): ys = np.arange(270, 590, 8) / self.cfg.ori_img_h out = [] for lane in pred: xs = lane(ys) valid_mask = (xs >= 0) & (xs < 1) xs = xs * self.cfg.ori_img_w lane_xs = xs[valid_mask] lane_ys = ys[valid_mask] * self.cfg.ori_img_h lane_xs, lane_ys = lane_xs[::-1], lane_ys[::-1] lane_str = ' '.join([ '{:.5f} {:.5f}'.format(x, y) for x, y in zip(lane_xs, lane_ys) ]) if lane_str != '': out.append(lane_str) return '\n'.join(out) def accumulate(self): loss_lines = [[], [], [], []] for idx, pred in enumerate(self.predictions): output_dir = os.path.join(self.output_eval, os.path.dirname(self.img_names[idx])) output_filename = os.path.basename(self.img_names[ idx])[:-3] + 'lines.txt' os.makedirs(output_dir, exist_ok=True) output = self.get_prediction_string(pred) # store loss lines lanes = self.lanes[idx] if len(lanes) - len(pred) in [1, 2, 3, 4]: loss_lines[len(lanes) - len(pred) - 1].append(self.img_names[ idx]) with open(os.path.join(output_dir, output_filename), 'w') as out_file: out_file.write(output) for i, names in enumerate(loss_lines): with open( os.path.join(output_dir, 'loss_{}_lines.txt'.format(i + 1)), 'w') as f: for name in names: f.write(name + '\n') for cate, cate_file in CATEGORYS.items(): result = eval_predictions( self.output_eval, self.dataset_dir, os.path.join(self.dataset_dir, cate_file), iou_thresholds=[0.5], official=True) result = eval_predictions( self.output_eval, self.dataset_dir, self.list_path, iou_thresholds=np.linspace(0.5, 0.95, 10), official=True) self.eval_results['F1@50'] = result[0.5]['F1'] self.eval_results['result'] = result def update(self, inputs, outputs): assert len(inputs['img_name']) == len(outputs['lanes']) self.predictions.extend(outputs['lanes']) self.img_names.extend(inputs['img_name']) self.lanes.extend(inputs['lane_line']) def log(self): logger.info(self.eval_results) # abstract method for getting metric results def 
================================================
FILE: ppdet/metrics/fast_cocoeval/README.md
================================================
# Compiling the COCOeval C++ extension

## Installation

```
cd ext
python setup.py install
```

================================================
FILE: ppdet/metrics/fast_cocoeval/__init__.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import fast_cocoeval
from .fast_cocoeval import *

================================================
FILE: ppdet/metrics/fast_cocoeval/ext/cocoeval.cc
================================================
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/tree/main/detectron2/layers/csrc/cocoeval/

#include "cocoeval.h"
// NOTE: the header names below were lost to angle-bracket stripping during
// extraction; restored from the detectron2 source this file is based on.
#include <time.h>
#include <algorithm>
#include <cstdint>
#include <numeric>

using namespace pybind11::literals;

// Sort detections from highest score to lowest, such that
// detection_instances[detection_sorted_indices[t]] >=
// detection_instances[detection_sorted_indices[t+1]].
Use stable_sort to match // original COCO API void SortInstancesByDetectionScore( const std::vector& detection_instances, std::vector* detection_sorted_indices) { detection_sorted_indices->resize(detection_instances.size()); std::iota( detection_sorted_indices->begin(), detection_sorted_indices->end(), 0); std::stable_sort( detection_sorted_indices->begin(), detection_sorted_indices->end(), [&detection_instances](size_t j1, size_t j2) { return detection_instances[j1].score > detection_instances[j2].score; }); } // Partition the ground truth objects based on whether or not to ignore them // based on area void SortInstancesByIgnore( const std::array& area_range, const std::vector& ground_truth_instances, std::vector* ground_truth_sorted_indices, std::vector* ignores) { ignores->clear(); ignores->reserve(ground_truth_instances.size()); for (auto o : ground_truth_instances) { ignores->push_back( o.ignore || o.area < area_range[0] || o.area > area_range[1]); } ground_truth_sorted_indices->resize(ground_truth_instances.size()); std::iota( ground_truth_sorted_indices->begin(), ground_truth_sorted_indices->end(), 0); std::stable_sort( ground_truth_sorted_indices->begin(), ground_truth_sorted_indices->end(), [&ignores](size_t j1, size_t j2) { return (int)(*ignores)[j1] < (int)(*ignores)[j2]; }); } // For each IOU threshold, greedily match each detected instance to a ground // truth instance (if possible) and store the results void MatchDetectionsToGroundTruth( const std::vector& detection_instances, const std::vector& detection_sorted_indices, const std::vector& ground_truth_instances, const std::vector& ground_truth_sorted_indices, const std::vector& ignores, const std::vector>& ious, const std::vector& iou_thresholds, const std::array& area_range, ImageEvaluation* results) { // Initialize memory to store return data matches and ignore const int num_iou_thresholds = iou_thresholds.size(); const int num_ground_truth = ground_truth_sorted_indices.size(); const int num_detections = detection_sorted_indices.size(); std::vector ground_truth_matches( num_iou_thresholds * num_ground_truth, 0); std::vector& detection_matches = results->detection_matches; std::vector& detection_ignores = results->detection_ignores; std::vector& ground_truth_ignores = results->ground_truth_ignores; detection_matches.resize(num_iou_thresholds * num_detections, 0); detection_ignores.resize(num_iou_thresholds * num_detections, false); ground_truth_ignores.resize(num_ground_truth); for (auto g = 0; g < num_ground_truth; ++g) { ground_truth_ignores[g] = ignores[ground_truth_sorted_indices[g]]; } for (auto t = 0; t < num_iou_thresholds; ++t) { for (auto d = 0; d < num_detections; ++d) { // information about best match so far (match=-1 -> unmatched) double best_iou = std::min(iou_thresholds[t], 1 - 1e-10); int match = -1; for (auto g = 0; g < num_ground_truth; ++g) { // if this ground truth instance is already matched and not a // crowd, it cannot be matched to another detection if (ground_truth_matches[t * num_ground_truth + g] > 0 && !ground_truth_instances[ground_truth_sorted_indices[g]].is_crowd) { continue; } // if detected instance matched to a regular ground truth // instance, we can break on the first ground truth instance // tagged as ignore (because they are sorted by the ignore tag) if (match >= 0 && !ground_truth_ignores[match] && ground_truth_ignores[g]) { break; } // if IOU overlap is the best so far, store the match appropriately if (ious[d][ground_truth_sorted_indices[g]] >= best_iou) { best_iou = 
ious[d][ground_truth_sorted_indices[g]]; match = g; } } // if match was made, store id of match for both detection and // ground truth if (match >= 0) { detection_ignores[t * num_detections + d] = ground_truth_ignores[match]; detection_matches[t * num_detections + d] = ground_truth_instances[ground_truth_sorted_indices[match]].id; ground_truth_matches[t * num_ground_truth + match] = detection_instances[detection_sorted_indices[d]].id; } // set unmatched detections outside of area range to ignore const InstanceAnnotation& detection = detection_instances[detection_sorted_indices[d]]; detection_ignores[t * num_detections + d] = detection_ignores[t * num_detections + d] || (detection_matches[t * num_detections + d] == 0 && (detection.area < area_range[0] || detection.area > area_range[1])); } } // store detection score results results->detection_scores.resize(detection_sorted_indices.size()); for (size_t d = 0; d < detection_sorted_indices.size(); ++d) { results->detection_scores[d] = detection_instances[detection_sorted_indices[d]].score; } } std::vector EvaluateImages( const std::vector>& area_ranges, int max_detections, const std::vector& iou_thresholds, const ImageCategoryInstances>& image_category_ious, const ImageCategoryInstances& image_category_ground_truth_instances, const ImageCategoryInstances& image_category_detection_instances) { const int num_area_ranges = area_ranges.size(); const int num_images = image_category_ground_truth_instances.size(); const int num_categories = image_category_ious.size() > 0 ? image_category_ious[0].size() : 0; std::vector detection_sorted_indices; std::vector ground_truth_sorted_indices; std::vector ignores; std::vector results_all( num_images * num_area_ranges * num_categories); // Store results for each image, category, and area range combination. Results // for each IOU threshold are packed into the same ImageEvaluation object for (auto i = 0; i < num_images; ++i) { for (auto c = 0; c < num_categories; ++c) { const std::vector& ground_truth_instances = image_category_ground_truth_instances[i][c]; const std::vector& detection_instances = image_category_detection_instances[i][c]; SortInstancesByDetectionScore( detection_instances, &detection_sorted_indices); if ((int)detection_sorted_indices.size() > max_detections) { detection_sorted_indices.resize(max_detections); } for (size_t a = 0; a < area_ranges.size(); ++a) { SortInstancesByIgnore( area_ranges[a], ground_truth_instances, &ground_truth_sorted_indices, &ignores); MatchDetectionsToGroundTruth( detection_instances, detection_sorted_indices, ground_truth_instances, ground_truth_sorted_indices, ignores, image_category_ious[i][c], iou_thresholds, area_ranges[a], &results_all [c * num_area_ranges * num_images + a * num_images + i]); } } } return results_all; } // Convert a python list to a vector template std::vector list_to_vec(const py::list& l) { std::vector v(py::len(l)); for (int i = 0; i < (int)py::len(l); ++i) { v[i] = l[i].cast(); } return v; } // Helper function to Accumulate() // Considers the evaluation results applicable to a particular category, area // range, and max_detections parameter setting, which begin at // evaluations[evaluation_index]. Extracts a sorted list of length n of all // applicable detection instances concatenated across all images in the dataset, // which are represented by the outputs evaluation_indices, detection_scores, // image_detection_indices, and detection_sorted_indices--all of which are // length n. 
evaluation_indices[i] stores the applicable index into // evaluations[] for instance i, which has detection score detection_score[i], // and is the image_detection_indices[i]'th of the list of detections // for the image containing i. detection_sorted_indices[] defines a sorted // permutation of the 3 other outputs int BuildSortedDetectionList( const std::vector& evaluations, const int64_t evaluation_index, const int64_t num_images, const int max_detections, std::vector* evaluation_indices, std::vector* detection_scores, std::vector* detection_sorted_indices, std::vector* image_detection_indices) { assert(evaluations.size() >= evaluation_index + num_images); // Extract a list of object instances of the applicable category, area // range, and max detections requirements such that they can be sorted image_detection_indices->clear(); evaluation_indices->clear(); detection_scores->clear(); image_detection_indices->reserve(num_images * max_detections); evaluation_indices->reserve(num_images * max_detections); detection_scores->reserve(num_images * max_detections); int num_valid_ground_truth = 0; for (auto i = 0; i < num_images; ++i) { const ImageEvaluation& evaluation = evaluations[evaluation_index + i]; for (int d = 0; d < (int)evaluation.detection_scores.size() && d < max_detections; ++d) { // detected instances evaluation_indices->push_back(evaluation_index + i); image_detection_indices->push_back(d); detection_scores->push_back(evaluation.detection_scores[d]); } for (auto ground_truth_ignore : evaluation.ground_truth_ignores) { if (!ground_truth_ignore) { ++num_valid_ground_truth; } } } // Sort detections by decreasing score, using stable sort to match // python implementation detection_sorted_indices->resize(detection_scores->size()); std::iota( detection_sorted_indices->begin(), detection_sorted_indices->end(), 0); std::stable_sort( detection_sorted_indices->begin(), detection_sorted_indices->end(), [&detection_scores](size_t j1, size_t j2) { return (*detection_scores)[j1] > (*detection_scores)[j2]; }); return num_valid_ground_truth; } // Helper function to Accumulate() // Compute a precision recall curve given a sorted list of detected instances // encoded in evaluations, evaluation_indices, detection_scores, // detection_sorted_indices, image_detection_indices (see // BuildSortedDetectionList()). Using vectors precisions and recalls // and temporary storage, output the results into precisions_out, recalls_out, // and scores_out, which are large buffers containing many precion/recall curves // for all possible parameter settings, with precisions_out_index and // recalls_out_index defining the applicable indices to store results. 
void ComputePrecisionRecallCurve( const int64_t precisions_out_index, const int64_t precisions_out_stride, const int64_t recalls_out_index, const std::vector& recall_thresholds, const int iou_threshold_index, const int num_iou_thresholds, const int num_valid_ground_truth, const std::vector& evaluations, const std::vector& evaluation_indices, const std::vector& detection_scores, const std::vector& detection_sorted_indices, const std::vector& image_detection_indices, std::vector* precisions, std::vector* recalls, std::vector* precisions_out, std::vector* scores_out, std::vector* recalls_out) { assert(recalls_out->size() > recalls_out_index); // Compute precision/recall for each instance in the sorted list of detections int64_t true_positives_sum = 0, false_positives_sum = 0; precisions->clear(); recalls->clear(); precisions->reserve(detection_sorted_indices.size()); recalls->reserve(detection_sorted_indices.size()); assert(!evaluations.empty() || detection_sorted_indices.empty()); for (auto detection_sorted_index : detection_sorted_indices) { const ImageEvaluation& evaluation = evaluations[evaluation_indices[detection_sorted_index]]; const auto num_detections = evaluation.detection_matches.size() / num_iou_thresholds; const auto detection_index = iou_threshold_index * num_detections + image_detection_indices[detection_sorted_index]; assert(evaluation.detection_matches.size() > detection_index); assert(evaluation.detection_ignores.size() > detection_index); const int64_t detection_match = evaluation.detection_matches[detection_index]; const bool detection_ignores = evaluation.detection_ignores[detection_index]; const auto true_positive = detection_match > 0 && !detection_ignores; const auto false_positive = detection_match == 0 && !detection_ignores; if (true_positive) { ++true_positives_sum; } if (false_positive) { ++false_positives_sum; } const double recall = static_cast(true_positives_sum) / num_valid_ground_truth; recalls->push_back(recall); const int64_t num_valid_detections = true_positives_sum + false_positives_sum; const double precision = num_valid_detections > 0 ? static_cast(true_positives_sum) / num_valid_detections : 0.0; precisions->push_back(precision); } (*recalls_out)[recalls_out_index] = !recalls->empty() ? 
recalls->back() : 0; for (int64_t i = static_cast(precisions->size()) - 1; i > 0; --i) { if ((*precisions)[i] > (*precisions)[i - 1]) { (*precisions)[i - 1] = (*precisions)[i]; } } // Sample the per instance precision/recall list at each recall threshold for (size_t r = 0; r < recall_thresholds.size(); ++r) { // first index in recalls >= recall_thresholds[r] std::vector::iterator low = std::lower_bound( recalls->begin(), recalls->end(), recall_thresholds[r]); size_t precisions_index = low - recalls->begin(); const auto results_ind = precisions_out_index + r * precisions_out_stride; assert(results_ind < precisions_out->size()); assert(results_ind < scores_out->size()); if (precisions_index < precisions->size()) { (*precisions_out)[results_ind] = (*precisions)[precisions_index]; (*scores_out)[results_ind] = detection_scores[detection_sorted_indices[precisions_index]]; } else { (*precisions_out)[results_ind] = 0; (*scores_out)[results_ind] = 0; } } } py::dict Accumulate( const py::object& params, const std::vector& evaluations) { const std::vector recall_thresholds = list_to_vec(params.attr("recThrs")); const std::vector max_detections = list_to_vec(params.attr("maxDets")); const int num_iou_thresholds = py::len(params.attr("iouThrs")); const int num_recall_thresholds = py::len(params.attr("recThrs")); const int num_categories = params.attr("useCats").cast() == 1 ? py::len(params.attr("catIds")) : 1; const int num_area_ranges = py::len(params.attr("areaRng")); const int num_max_detections = py::len(params.attr("maxDets")); const int num_images = py::len(params.attr("imgIds")); std::vector precisions_out( num_iou_thresholds * num_recall_thresholds * num_categories * num_area_ranges * num_max_detections, -1); std::vector recalls_out( num_iou_thresholds * num_categories * num_area_ranges * num_max_detections, -1); std::vector scores_out( num_iou_thresholds * num_recall_thresholds * num_categories * num_area_ranges * num_max_detections, -1); // Consider the list of all detected instances in the entire dataset in one // large list. evaluation_indices, detection_scores, // image_detection_indices, and detection_sorted_indices all have the same // length as this list, such that each entry corresponds to one detected // instance std::vector evaluation_indices; // indices into evaluations[] std::vector detection_scores; // detection scores of each instance std::vector detection_sorted_indices; // sorted indices of all // instances in the dataset std::vector image_detection_indices; // indices into the list of detected instances in // the same image as each instance std::vector precisions, recalls; for (auto c = 0; c < num_categories; ++c) { for (auto a = 0; a < num_area_ranges; ++a) { for (auto m = 0; m < num_max_detections; ++m) { // The COCO PythonAPI assumes evaluations[] (the return value of // COCOeval::EvaluateImages() is one long list storing results for each // combination of category, area range, and image id, with categories in // the outermost loop and images in the innermost loop. 
          const int64_t evaluations_index =
              c * num_area_ranges * num_images + a * num_images;
          int num_valid_ground_truth = BuildSortedDetectionList(
              evaluations,
              evaluations_index,
              num_images,
              max_detections[m],
              &evaluation_indices,
              &detection_scores,
              &detection_sorted_indices,
              &image_detection_indices);
          if (num_valid_ground_truth == 0) {
            continue;
          }

          for (auto t = 0; t < num_iou_thresholds; ++t) {
            // recalls_out is a flattened vector representing a
            // num_iou_thresholds X num_categories X num_area_ranges X
            // num_max_detections matrix
            const int64_t recalls_out_index =
                t * num_categories * num_area_ranges * num_max_detections +
                c * num_area_ranges * num_max_detections +
                a * num_max_detections + m;

            // precisions_out and scores_out are flattened vectors
            // representing a num_iou_thresholds X num_recall_thresholds X
            // num_categories X num_area_ranges X num_max_detections matrix
            const int64_t precisions_out_stride =
                num_categories * num_area_ranges * num_max_detections;
            const int64_t precisions_out_index =
                t * num_recall_thresholds * num_categories * num_area_ranges *
                    num_max_detections +
                c * num_area_ranges * num_max_detections +
                a * num_max_detections + m;

            ComputePrecisionRecallCurve(
                precisions_out_index,
                precisions_out_stride,
                recalls_out_index,
                recall_thresholds,
                t,
                num_iou_thresholds,
                num_valid_ground_truth,
                evaluations,
                evaluation_indices,
                detection_scores,
                detection_sorted_indices,
                image_detection_indices,
                &precisions,
                &recalls,
                &precisions_out,
                &scores_out,
                &recalls_out);
          }
        }
      }
    }

  time_t rawtime;
  struct tm local_time;
  std::array<char, 200> buffer;
  time(&rawtime);
#ifdef _WIN32
  localtime_s(&local_time, &rawtime);
#else
  localtime_r(&rawtime, &local_time);
#endif
  // NOTE: restored "%M" (minutes); the original format string read
  // "%H:%num_max_detections:%S", an artifact of an identifier rename.
  strftime(buffer.data(), 200, "%Y-%m-%d %H:%M:%S", &local_time);
  return py::dict(
      "params"_a = params,
      "counts"_a = std::vector<int64_t>({num_iou_thresholds,
                                         num_recall_thresholds,
                                         num_categories,
                                         num_area_ranges,
                                         num_max_detections}),
      "date"_a = buffer,
      "precision"_a = precisions_out,
      "recall"_a = recalls_out,
      "scores"_a = scores_out);
}

PYBIND11_MODULE(cocoeval_ext, m) {
  m.def("COCOevalAccumulate", &Accumulate, "Accumulate");
  m.def("COCOevalEvaluateImages", &EvaluateImages, "EvaluateImages");
  // NOTE: template arguments restored to match the constructor declared in
  // cocoeval.h; they were lost to angle-bracket stripping during extraction.
  py::class_<InstanceAnnotation>(m, "InstanceAnnotation")
      .def(py::init<uint64_t, double, double, bool, bool>());
  py::class_<ImageEvaluation>(m, "ImageEvaluation").def(py::init<>());
}
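The `counts` entry returned by `Accumulate` encodes the shapes of the flattened `precision`, `recall`, and `scores` buffers. A small sketch of the reshape that `fast_cocoeval.py` performs (the sizes here are illustrative only):

```python
import numpy as np

# counts = [num_iou_thrs, num_recall_thrs, num_cats, num_area_rngs, num_max_dets]
counts = [2, 101, 3, 4, 3]  # illustrative sizes only
T, R, K, A, M = counts
precision = np.full(T * R * K * A * M, -1.0).reshape(counts)
# recall has no recall-threshold axis
recall = np.full(T * K * A * M, -1.0).reshape(counts[:1] + counts[2:])
print(precision.shape, recall.shape)  # (2, 101, 3, 4, 3) (2, 3, 4, 3)
```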
================================================
FILE: ppdet/metrics/fast_cocoeval/ext/cocoeval.h
================================================
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/tree/main/detectron2/layers/csrc/cocoeval/

#pragma once

// NOTE: the include list was lost to angle-bracket stripping during
// extraction; restored from the detectron2 original.
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/stl_bind.h>
#include <vector>

namespace py = pybind11;

// Annotation data for a single object instance in an image
struct InstanceAnnotation {
  InstanceAnnotation(
      uint64_t id, double score, double area, bool is_crowd, bool ignore)
      : id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {}
  uint64_t id;
  double score = 0.;
  double area = 0.;
  bool is_crowd = false;
  bool ignore = false;
};

// Stores intermediate results for evaluating detection results for a single
// image that has D detected instances and G ground truth instances. This
// stores matches between detected and ground truth instances
struct ImageEvaluation {
  // For each of the D detected instances, the id of the matched ground truth
  // instance, or 0 if unmatched
  std::vector<uint64_t> detection_matches;

  // The detection score of each of the D detected instances
  std::vector<double> detection_scores;

  // Marks whether or not each of G instances was ignored from evaluation
  // (e.g., because it's outside area_range)
  std::vector<bool> ground_truth_ignores;

  // Marks whether or not each of D instances was ignored from evaluation
  // (e.g., because it's outside aRng)
  std::vector<bool> detection_ignores;
};

template <class T>
using ImageCategoryInstances = std::vector<std::vector<std::vector<T>>>;

// C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg(). For
// each combination of image, category, area range settings, and IOU
// thresholds to evaluate, it matches detected instances to ground truth
// instances and stores the results into a vector of ImageEvaluation results,
// which will be interpreted by the COCOeval::Accumulate() function to
// produce precision-recall curves. The parameters of nested vectors have
// the following semantics:
//   image_category_ious[i][c][d][g] is the intersection over union of the
//     d'th detected instance and g'th ground truth instance of
//     category category_ids[c] in image image_ids[i]
//   image_category_ground_truth_instances[i][c] is a vector of ground truth
//     instances in image image_ids[i] of category category_ids[c]
//   image_category_detection_instances[i][c] is a vector of detected
//     instances in image image_ids[i] of category category_ids[c]
std::vector<ImageEvaluation> EvaluateImages(
    const std::vector<std::array<double, 2>>& area_ranges,  // vector of 2-tuples
    int max_detections,
    const std::vector<double>& iou_thresholds,
    const ImageCategoryInstances<std::vector<double>>& image_category_ious,
    const ImageCategoryInstances<InstanceAnnotation>&
        image_category_ground_truth_instances,
    const ImageCategoryInstances<InstanceAnnotation>&
        image_category_detection_instances);

// C++ implementation of COCOeval.accumulate(), which generates precision
// recall curves for each set of category, IOU threshold, detection area
// range, and max number of detections parameters. It is assumed that the
// parameter evaluations is the return value of the function
// COCOeval::EvaluateImages(), which was called with the same parameter
// settings params
py::dict Accumulate(
    const py::object& params,
    const std::vector<ImageEvaluation>& evaluations);
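A quick smoke test for the compiled module (assuming `python setup.py install` with the `setup.py` below succeeded); the names mirror the `PYBIND11_MODULE` bindings in cocoeval.cc above:

```python
# Constructor order follows py::init<uint64_t, double, double, bool, bool>:
# id, score, area, is_crowd, ignore.
from cocoeval_ext import InstanceAnnotation, ImageEvaluation

ann = InstanceAnnotation(1, 0.9, 100.0, False, False)
ev = ImageEvaluation()
print(type(ann).__name__, type(ev).__name__)
```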
================================================
FILE: ppdet/metrics/fast_cocoeval/ext/setup.py
================================================
from pybind11.setup_helpers import Pybind11Extension, build_ext
from setuptools import setup

ext_modules = [Pybind11Extension("cocoeval_ext", ["cocoeval.cc"])]

setup(
    name="cocoeval_ext",
    version="0.0.0",
    ext_modules=ext_modules,
    cmdclass={"build_ext": build_ext},
)
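`FastCOCOeval`, defined in the next file, keeps the `pycocotools` `COCOeval` interface, so existing evaluation code only needs the import swapped. A minimal usage sketch; the annotation and result file names are placeholders:

```python
from pycocotools.coco import COCO
from ppdet.metrics.fast_cocoeval import FastCOCOeval

coco_gt = COCO('annotations/instances_val2017.json')  # placeholder path
coco_dt = coco_gt.loadRes('bbox.json')                # placeholder path
coco_eval = FastCOCOeval(coco_gt, coco_dt, iouType='bbox')
coco_eval.evaluate()    # per-image matching runs in C++
coco_eval.accumulate()  # PR-curve accumulation runs in C++
coco_eval.summarize()   # unchanged pycocotools summary
```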
================================================
FILE: ppdet/metrics/fast_cocoeval/fast_cocoeval.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The code is based on
# https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/fast_eval_api.py

import copy
import time
import numpy as np
from cocoeval_ext import InstanceAnnotation, COCOevalEvaluateImages, COCOevalAccumulate
from pycocotools.cocoeval import COCOeval

__all__ = ['FastCOCOeval']


class FastCOCOeval(COCOeval):
    """
    This is a slightly modified version of the original COCO API, where the
    functions evaluateImg() and accumulate() are implemented in C++ to
    speed up evaluation.
    """

    def evaluate(self):
        """
        Run per image evaluation on given images and store results in
        self._evalImgs_cpp, a datastructure that isn't readable from Python
        but is used by a C++ implementation of accumulate(). Unlike the
        original COCO PythonAPI, we don't populate the datastructure
        self.evalImgs because this datastructure is a computational
        bottleneck.
        :return: None
        """
        tic = time.time()
        print('Running per image evaluation...')
        p = self.params
        # add backward compatibility if useSegm is specified in params
        if p.useSegm is not None:
            p.iouType = "segm" if p.useSegm == 1 else "bbox"
            print('useSegm (deprecated) is not None. '
                  'Running {} evaluation'.format(p.iouType))
        print('Evaluate annotation type *{}*'.format(p.iouType))
        p.imgIds = list(np.unique(p.imgIds))
        if p.useCats:
            p.catIds = list(np.unique(p.catIds))
        p.maxDets = sorted(p.maxDets)
        self.params = p

        self._prepare()  # bottleneck

        # loop through images, area range, max detection number
        catIds = p.catIds if p.useCats else [-1]

        if p.iouType == "segm" or p.iouType == "bbox":
            computeIoU = self.computeIoU
        elif p.iouType == "keypoints":
            computeIoU = self.computeOks
        self.ious = {
            (imgId, catId): computeIoU(imgId, catId)
            for imgId in p.imgIds for catId in catIds
        }  # bottleneck

        maxDet = p.maxDets[-1]

        # <<<< Beginning of code differences with original COCO API
        def convert_instances_to_cpp(instances, is_det=False):
            # Convert annotations for a list of instances in an image to a
            # format that's fast to access in C++
            instances_cpp = []
            for instance in instances:
                instance_cpp = InstanceAnnotation(
                    int(instance["id"]),
                    instance["score"] if is_det else instance.get("score", 0.0),
                    instance["area"],
                    bool(instance.get("iscrowd", 0)),
                    bool(instance.get("ignore", 0)), )
                instances_cpp.append(instance_cpp)
            return instances_cpp

        # Convert GT annotations, detections, and IOUs to a format that's
        # fast to access in C++
        ground_truth_instances = [
            [convert_instances_to_cpp(self._gts[imgId, catId])
             for catId in p.catIds] for imgId in p.imgIds
        ]
        detected_instances = [
            [convert_instances_to_cpp(self._dts[imgId, catId], is_det=True)
             for catId in p.catIds] for imgId in p.imgIds
        ]
        ious = [[self.ious[imgId, catId] for catId in catIds]
                for imgId in p.imgIds]

        if not p.useCats:
            # For each image, flatten per-category lists into a single list
            ground_truth_instances = [[[o for c in i for o in c]]
                                      for i in ground_truth_instances]
            detected_instances = [[[o for c in i for o in c]]
                                  for i in detected_instances]

        # Call C++ implementation of self.evaluateImgs()
        self._evalImgs_cpp = COCOevalEvaluateImages(
            p.areaRng, maxDet, p.iouThrs, ious, ground_truth_instances,
            detected_instances)
        self._evalImgs = None

        self._paramsEval = copy.deepcopy(self.params)
        toc = time.time()
        print('DONE (t={:0.2f}s).'.format(toc - tic))
        # >>>> End of code differences with original COCO API

    def accumulate(self, p=None):
        """
        Accumulate per image evaluation results and store the result in
        self.eval. Does not support changing parameter settings from those
        used by self.evaluate()
        """
        print('Accumulating evaluation results...')
        tic = time.time()
        assert hasattr(
            self, "_evalImgs_cpp"
        ), "evaluate() must be called before accumulate() is called."
        self.eval = COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp)

        # recall is num_iou_thresholds X num_categories X num_area_ranges X
        # num_max_detections
        self.eval["recall"] = np.array(self.eval["recall"]).reshape(
            self.eval["counts"][:1] + self.eval["counts"][2:])

        # precision and scores are num_iou_thresholds X num_recall_thresholds
        # X num_categories X num_area_ranges X num_max_detections
        self.eval["precision"] = np.array(self.eval["precision"]).reshape(
            self.eval["counts"])
        self.eval["scores"] = np.array(self.eval["scores"]).reshape(
            self.eval["counts"])
        toc = time.time()
        print('DONE (t={:0.2f}s).'.format(toc - tic))

================================================
FILE: ppdet/metrics/json_results.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import six import numpy as np def get_det_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0, im_file=None, save_threshold=0): det_res = [] k = 0 for i in range(len(bbox_nums)): cur_image_id = int(image_id[i][0]) det_nums = bbox_nums[i] for j in range(det_nums): dt = bboxes[k] k = k + 1 num_id, score, xmin, ymin, xmax, ymax = dt.tolist() if int(num_id) < 0 or score < save_threshold: continue category_id = label_to_cat_id_map[int(num_id)] w = xmax - xmin + bias h = ymax - ymin + bias bbox = [xmin, ymin, w, h] dt_res = { 'image_id': cur_image_id, 'category_id': category_id, 'bbox': bbox, 'score': score } if im_file: dt_res['im_file'] = im_file det_res.append(dt_res) return det_res def get_det_poly_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0): det_res = [] k = 0 for i in range(len(bbox_nums)): cur_image_id = int(image_id[i][0]) det_nums = bbox_nums[i] for j in range(det_nums): dt = bboxes[k] k = k + 1 num_id, score, x1, y1, x2, y2, x3, y3, x4, y4 = dt.tolist() if int(num_id) < 0: continue category_id = label_to_cat_id_map[int(num_id)] rbox = [x1, y1, x2, y2, x3, y3, x4, y4] dt_res = { 'image_id': cur_image_id, 'category_id': category_id, 'bbox': rbox, 'score': score } det_res.append(dt_res) return det_res def strip_mask(mask): row = mask[0, 0, :] col = mask[0, :, 0] im_h = len(col) - np.count_nonzero(col == -1) im_w = len(row) - np.count_nonzero(row == -1) return mask[:, :im_h, :im_w] def get_seg_res(masks, bboxes, mask_nums, image_id, label_to_cat_id_map): import pycocotools.mask as mask_util seg_res = [] k = 0 for i in range(len(mask_nums)): cur_image_id = int(image_id[i][0]) det_nums = mask_nums[i] mask_i = masks[k:k + det_nums] mask_i = strip_mask(mask_i) for j in range(det_nums): mask = mask_i[j].astype(np.uint8) score = float(bboxes[k][1]) label = int(bboxes[k][0]) k = k + 1 if label == -1: continue cat_id = label_to_cat_id_map[label] rle = mask_util.encode( np.array( mask[:, :, None], order="F", dtype="uint8"))[0] if six.PY3: if 'counts' in rle: rle['counts'] = rle['counts'].decode("utf8") sg_res = { 'image_id': cur_image_id, 'category_id': cat_id, 'segmentation': rle, 'score': score } seg_res.append(sg_res) return seg_res def get_solov2_segm_res(results, image_id, num_id_to_cat_id_map): import pycocotools.mask as mask_util segm_res = [] # for each batch segms = results['segm'].astype(np.uint8) clsid_labels = results['cate_label'] clsid_scores = results['cate_score'] lengths = segms.shape[0] im_id = int(image_id[0][0]) if lengths == 0 or segms is None: return None # for each sample for i in range(lengths - 1): clsid = int(clsid_labels[i]) catid = num_id_to_cat_id_map[clsid] score = float(clsid_scores[i]) mask = segms[i] segm = mask_util.encode(np.array(mask[:, :, np.newaxis], order='F'))[0] segm['counts'] = segm['counts'].decode('utf8') coco_res = { 'image_id': im_id, 'category_id': catid, 'segmentation': segm, 'score': score } segm_res.append(coco_res) return segm_res def get_keypoint_res(results, im_id): anns = [] preds = results['keypoint'] for idx in range(im_id.shape[0]): image_id = im_id[idx].item() kpts, scores = preds[idx] for kpt, score in zip(kpts, 
scores): kpt = kpt.flatten() ann = { 'image_id': image_id, 'category_id': 1, # XXX hard code 'keypoints': kpt.tolist(), 'score': float(score) } x = kpt[0::3] y = kpt[1::3] x0, x1, y0, y1 = np.min(x).item(), np.max(x).item(), np.min(y).item( ), np.max(y).item() ann['area'] = (x1 - x0) * (y1 - y0) ann['bbox'] = [x0, y0, x1 - x0, y1 - y0] anns.append(ann) return anns def get_pose3d_res(results, im_id): anns = [] preds = results['pose3d'] for idx in range(im_id.shape[0]): image_id = im_id[idx].item() pose3d = preds[idx] ann = { 'image_id': image_id, 'category_id': 1, # XXX hard code 'pose3d': pose3d.tolist(), 'score': float(1.) } anns.append(ann) return anns ================================================ FILE: ppdet/metrics/keypoint_metrics.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import json from collections import defaultdict, OrderedDict import numpy as np import paddle from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval from ..modeling.keypoint_utils import oks_nms, keypoint_pck_accuracy, keypoint_auc, keypoint_epe from scipy.io import loadmat, savemat from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'KeyPointTopDownCOCOEval', 'KeyPointTopDownCOCOWholeBadyHandEval', 'KeyPointTopDownMPIIEval' ] class KeyPointTopDownCOCOEval(object): """refer to https://github.com/leoxiaobin/deep-high-resolution-net.pytorch Copyright (c) Microsoft, under the MIT License. 
""" def __init__(self, anno_file, num_samples, num_joints, output_eval, iou_type='keypoints', in_vis_thre=0.2, oks_thre=0.9, save_prediction_only=False): super(KeyPointTopDownCOCOEval, self).__init__() self.coco = COCO(anno_file) self.num_samples = num_samples self.num_joints = num_joints self.iou_type = iou_type self.in_vis_thre = in_vis_thre self.oks_thre = oks_thre self.output_eval = output_eval self.res_file = os.path.join(output_eval, "keypoints_results.json") self.save_prediction_only = save_prediction_only self.reset() def reset(self): self.results = { 'all_preds': np.zeros( (self.num_samples, self.num_joints, 3), dtype=np.float32), 'all_boxes': np.zeros((self.num_samples, 6)), 'image_path': [] } self.eval_results = {} self.idx = 0 def update(self, inputs, outputs): kpts, _ = outputs['keypoint'][0] num_images = inputs['image'].shape[0] self.results['all_preds'][self.idx:self.idx + num_images, :, 0: 3] = kpts[:, :, 0:3] self.results['all_boxes'][self.idx:self.idx + num_images, 0:2] = inputs[ 'center'].numpy()[:, 0:2] if isinstance( inputs['center'], paddle.Tensor) else inputs['center'][:, 0:2] self.results['all_boxes'][self.idx:self.idx + num_images, 2:4] = inputs[ 'scale'].numpy()[:, 0:2] if isinstance( inputs['scale'], paddle.Tensor) else inputs['scale'][:, 0:2] self.results['all_boxes'][self.idx:self.idx + num_images, 4] = np.prod( inputs['scale'].numpy() * 200, 1) if isinstance(inputs['scale'], paddle.Tensor) else np.prod( inputs['scale'] * 200, 1) self.results['all_boxes'][ self.idx:self.idx + num_images, 5] = np.squeeze(inputs['score'].numpy()) if isinstance( inputs['score'], paddle.Tensor) else np.squeeze(inputs['score']) if isinstance(inputs['im_id'], paddle.Tensor): self.results['image_path'].extend(inputs['im_id'].numpy()) else: self.results['image_path'].extend(inputs['im_id']) self.idx += num_images def _write_coco_keypoint_results(self, keypoints): data_pack = [{ 'cat_id': 1, 'cls': 'person', 'ann_type': 'keypoints', 'keypoints': keypoints }] results = self._coco_keypoint_results_one_category_kernel(data_pack[0]) if not os.path.exists(self.output_eval): os.makedirs(self.output_eval) with open(self.res_file, 'w') as f: json.dump(results, f, sort_keys=True, indent=4) logger.info(f'The keypoint result is saved to {self.res_file}.') try: json.load(open(self.res_file)) except Exception: content = [] with open(self.res_file, 'r') as f: for line in f: content.append(line) content[-1] = ']' with open(self.res_file, 'w') as f: for c in content: f.write(c) def _coco_keypoint_results_one_category_kernel(self, data_pack): cat_id = data_pack['cat_id'] keypoints = data_pack['keypoints'] cat_results = [] for img_kpts in keypoints: if len(img_kpts) == 0: continue _key_points = np.array( [img_kpts[k]['keypoints'] for k in range(len(img_kpts))]) _key_points = _key_points.reshape(_key_points.shape[0], -1) result = [{ 'image_id': img_kpts[k]['image'], 'category_id': cat_id, 'keypoints': _key_points[k].tolist(), 'score': img_kpts[k]['score'], 'center': list(img_kpts[k]['center']), 'scale': list(img_kpts[k]['scale']) } for k in range(len(img_kpts))] cat_results.extend(result) return cat_results def get_final_results(self, preds, all_boxes, img_path): _kpts = [] for idx, kpt in enumerate(preds): _kpts.append({ 'keypoints': kpt, 'center': all_boxes[idx][0:2], 'scale': all_boxes[idx][2:4], 'area': all_boxes[idx][4], 'score': all_boxes[idx][5], 'image': int(img_path[idx]) }) # image x person x (keypoints) kpts = defaultdict(list) for kpt in _kpts: kpts[kpt['image']].append(kpt) # rescoring and 
oks nms num_joints = preds.shape[1] in_vis_thre = self.in_vis_thre oks_thre = self.oks_thre oks_nmsed_kpts = [] for img in kpts.keys(): img_kpts = kpts[img] for n_p in img_kpts: box_score = n_p['score'] kpt_score = 0 valid_num = 0 for n_jt in range(0, num_joints): t_s = n_p['keypoints'][n_jt][2] if t_s > in_vis_thre: kpt_score = kpt_score + t_s valid_num = valid_num + 1 if valid_num != 0: kpt_score = kpt_score / valid_num # rescoring n_p['score'] = kpt_score * box_score keep = oks_nms([img_kpts[i] for i in range(len(img_kpts))], oks_thre) if len(keep) == 0: oks_nmsed_kpts.append(img_kpts) else: oks_nmsed_kpts.append([img_kpts[_keep] for _keep in keep]) self._write_coco_keypoint_results(oks_nmsed_kpts) def accumulate(self): self.get_final_results(self.results['all_preds'], self.results['all_boxes'], self.results['image_path']) if self.save_prediction_only: logger.info(f'The keypoint result is saved to {self.res_file} ' 'and do not evaluate the mAP.') return coco_dt = self.coco.loadRes(self.res_file) coco_eval = COCOeval(self.coco, coco_dt, 'keypoints') coco_eval.params.useSegm = None coco_eval.evaluate() coco_eval.accumulate() coco_eval.summarize() keypoint_stats = [] for ind in range(len(coco_eval.stats)): keypoint_stats.append((coco_eval.stats[ind])) self.eval_results['keypoint'] = keypoint_stats def log(self): if self.save_prediction_only: return stats_names = [ 'AP', 'Ap .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', 'AR .75', 'AR (M)', 'AR (L)' ] num_values = len(stats_names) print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |') print('|---' * (num_values + 1) + '|') print(' '.join([ '| {:.3f}'.format(value) for value in self.eval_results['keypoint'] ]) + ' |') def get_results(self): return self.eval_results class KeyPointTopDownCOCOWholeBadyHandEval(object): def __init__(self, anno_file, num_samples, num_joints, output_eval, save_prediction_only=False): super(KeyPointTopDownCOCOWholeBadyHandEval, self).__init__() self.coco = COCO(anno_file) self.num_samples = num_samples self.num_joints = num_joints self.output_eval = output_eval self.res_file = os.path.join(output_eval, "keypoints_results.json") self.save_prediction_only = save_prediction_only self.parse_dataset() self.reset() def parse_dataset(self): gt_db = [] num_joints = self.num_joints coco = self.coco img_ids = coco.getImgIds() for img_id in img_ids: ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False) objs = coco.loadAnns(ann_ids) for obj in objs: for type in ['left', 'right']: if (obj[f'{type}hand_valid'] and max(obj[f'{type}hand_kpts']) > 0): joints = np.zeros((num_joints, 3), dtype=np.float32) joints_vis = np.zeros((num_joints, 3), dtype=np.float32) keypoints = np.array(obj[f'{type}hand_kpts']) keypoints = keypoints.reshape(-1, 3) joints[:, :2] = keypoints[:, :2] joints_vis[:, :2] = np.minimum(1, keypoints[:, 2:3]) gt_db.append({ 'bbox': obj[f'{type}hand_box'], 'gt_joints': joints, 'joints_vis': joints_vis, }) self.db = gt_db def reset(self): self.results = { 'preds': np.zeros( (self.num_samples, self.num_joints, 3), dtype=np.float32), } self.eval_results = {} self.idx = 0 def update(self, inputs, outputs): kpts, _ = outputs['keypoint'][0] num_images = inputs['image'].shape[0] self.results['preds'][self.idx:self.idx + num_images, :, 0: 3] = kpts[:, :, 0:3] self.idx += num_images def accumulate(self): self.get_final_results(self.results['preds']) if self.save_prediction_only: logger.info(f'The keypoint result is saved to {self.res_file} ' 'and do not evaluate the mAP.') return self.eval_results = 
self.evaluate(self.res_file, ('PCK', 'AUC', 'EPE')) def get_final_results(self, preds): kpts = [] for idx, kpt in enumerate(preds): kpts.append({'keypoints': kpt.tolist()}) self._write_keypoint_results(kpts) def _write_keypoint_results(self, keypoints): if not os.path.exists(self.output_eval): os.makedirs(self.output_eval) with open(self.res_file, 'w') as f: json.dump(keypoints, f, sort_keys=True, indent=4) logger.info(f'The keypoint result is saved to {self.res_file}.') try: json.load(open(self.res_file)) except Exception: content = [] with open(self.res_file, 'r') as f: for line in f: content.append(line) content[-1] = ']' with open(self.res_file, 'w') as f: for c in content: f.write(c) def log(self): if self.save_prediction_only: return for item, value in self.eval_results.items(): print("{} : {}".format(item, value)) def get_results(self): return self.eval_results def evaluate(self, res_file, metrics, pck_thr=0.2, auc_nor=30): """Keypoint evaluation. Args: res_file (str): Json file stored prediction results. metrics (str | list[str]): Metric to be performed. Options: 'PCK', 'AUC', 'EPE'. pck_thr (float): PCK threshold, default as 0.2. auc_nor (float): AUC normalization factor, default as 30 pixel. Returns: List: Evaluation results for evaluation metric. """ info_str = [] with open(res_file, 'r') as fin: preds = json.load(fin) assert len(preds) == len(self.db) outputs = [] gts = [] masks = [] threshold_bbox = [] for pred, item in zip(preds, self.db): outputs.append(np.array(pred['keypoints'])[:, :-1]) gts.append(np.array(item['gt_joints'])[:, :-1]) masks.append((np.array(item['joints_vis'])[:, 0]) > 0) if 'PCK' in metrics: bbox = np.array(item['bbox']) bbox_thr = np.max(bbox[2:]) threshold_bbox.append(np.array([bbox_thr, bbox_thr])) outputs = np.array(outputs) gts = np.array(gts) masks = np.array(masks) threshold_bbox = np.array(threshold_bbox) if 'PCK' in metrics: _, pck, _ = keypoint_pck_accuracy(outputs, gts, masks, pck_thr, threshold_bbox) info_str.append(('PCK', pck)) if 'AUC' in metrics: info_str.append(('AUC', keypoint_auc(outputs, gts, masks, auc_nor))) if 'EPE' in metrics: info_str.append(('EPE', keypoint_epe(outputs, gts, masks))) name_value = OrderedDict(info_str) return name_value class KeyPointTopDownMPIIEval(object): def __init__(self, anno_file, num_samples, num_joints, output_eval, oks_thre=0.9, save_prediction_only=False): super(KeyPointTopDownMPIIEval, self).__init__() self.ann_file = anno_file self.res_file = os.path.join(output_eval, "keypoints_results.json") self.save_prediction_only = save_prediction_only self.reset() def reset(self): self.results = [] self.eval_results = {} self.idx = 0 def update(self, inputs, outputs): kpts, _ = outputs['keypoint'][0] num_images = inputs['image'].shape[0] results = {} results['preds'] = kpts[:, :, 0:3] results['boxes'] = np.zeros((num_images, 6)) results['boxes'][:, 0:2] = inputs['center'].numpy()[:, 0:2] results['boxes'][:, 2:4] = inputs['scale'].numpy()[:, 0:2] results['boxes'][:, 4] = np.prod(inputs['scale'].numpy() * 200, 1) results['boxes'][:, 5] = np.squeeze(inputs['score'].numpy()) results['image_path'] = inputs['image_file'] self.results.append(results) def accumulate(self): self._mpii_keypoint_results_save() if self.save_prediction_only: logger.info(f'The keypoint result is saved to {self.res_file} ' 'and do not evaluate the mAP.') return self.eval_results = self.evaluate(self.results) def _mpii_keypoint_results_save(self): results = [] for res in self.results: if len(res) == 0: continue result = [{ 'preds': 
res['preds'][k].tolist(), 'boxes': res['boxes'][k].tolist(), 'image_path': res['image_path'][k], } for k in range(len(res))] results.extend(result) with open(self.res_file, 'w') as f: json.dump(results, f, sort_keys=True, indent=4) logger.info(f'The keypoint result is saved to {self.res_file}.') def log(self): if self.save_prediction_only: return for item, value in self.eval_results.items(): print("{} : {}".format(item, value)) def get_results(self): return self.eval_results def evaluate(self, outputs, savepath=None): """Evaluate PCKh for MPII dataset. refer to https://github.com/leoxiaobin/deep-high-resolution-net.pytorch Copyright (c) Microsoft, under the MIT License. Args: outputs(list(preds, boxes)): * preds (np.ndarray[N,K,3]): The first two dimensions are coordinates, score is the third dimension of the array. * boxes (np.ndarray[N,6]): [center[0], center[1], scale[0] , scale[1],area, score] Returns: dict: PCKh for each joint """ kpts = [] for output in outputs: preds = output['preds'] batch_size = preds.shape[0] for i in range(batch_size): kpts.append({'keypoints': preds[i]}) preds = np.stack([kpt['keypoints'] for kpt in kpts]) # convert 0-based index to 1-based index, # and get the first two dimensions. preds = preds[..., :2] + 1.0 if savepath is not None: pred_file = os.path.join(savepath, 'pred.mat') savemat(pred_file, mdict={'preds': preds}) SC_BIAS = 0.6 threshold = 0.5 gt_file = os.path.join( os.path.dirname(self.ann_file), 'mpii_gt_val.mat') gt_dict = loadmat(gt_file) dataset_joints = gt_dict['dataset_joints'] jnt_missing = gt_dict['jnt_missing'] pos_gt_src = gt_dict['pos_gt_src'] headboxes_src = gt_dict['headboxes_src'] pos_pred_src = np.transpose(preds, [1, 2, 0]) head = np.where(dataset_joints == 'head')[1][0] lsho = np.where(dataset_joints == 'lsho')[1][0] lelb = np.where(dataset_joints == 'lelb')[1][0] lwri = np.where(dataset_joints == 'lwri')[1][0] lhip = np.where(dataset_joints == 'lhip')[1][0] lkne = np.where(dataset_joints == 'lkne')[1][0] lank = np.where(dataset_joints == 'lank')[1][0] rsho = np.where(dataset_joints == 'rsho')[1][0] relb = np.where(dataset_joints == 'relb')[1][0] rwri = np.where(dataset_joints == 'rwri')[1][0] rkne = np.where(dataset_joints == 'rkne')[1][0] rank = np.where(dataset_joints == 'rank')[1][0] rhip = np.where(dataset_joints == 'rhip')[1][0] jnt_visible = 1 - jnt_missing uv_error = pos_pred_src - pos_gt_src uv_err = np.linalg.norm(uv_error, axis=1) headsizes = headboxes_src[1, :, :] - headboxes_src[0, :, :] headsizes = np.linalg.norm(headsizes, axis=0) headsizes *= SC_BIAS scale = headsizes * np.ones((len(uv_err), 1), dtype=np.float32) scaled_uv_err = uv_err / scale scaled_uv_err = scaled_uv_err * jnt_visible jnt_count = np.sum(jnt_visible, axis=1) less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible PCKh = 100. * np.sum(less_than_threshold, axis=1) / jnt_count # save rng = np.arange(0, 0.5 + 0.01, 0.01) pckAll = np.zeros((len(rng), 16), dtype=np.float32) for r, threshold in enumerate(rng): less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible pckAll[r, :] = 100. 
* np.sum(less_than_threshold, axis=1) / jnt_count PCKh = np.ma.array(PCKh, mask=False) PCKh.mask[6:8] = True jnt_count = np.ma.array(jnt_count, mask=False) jnt_count.mask[6:8] = True jnt_ratio = jnt_count / np.sum(jnt_count).astype(np.float64) name_value = [ #noqa ('Head', PCKh[head]), ('Shoulder', 0.5 * (PCKh[lsho] + PCKh[rsho])), ('Elbow', 0.5 * (PCKh[lelb] + PCKh[relb])), ('Wrist', 0.5 * (PCKh[lwri] + PCKh[rwri])), ('Hip', 0.5 * (PCKh[lhip] + PCKh[rhip])), ('Knee', 0.5 * (PCKh[lkne] + PCKh[rkne])), ('Ankle', 0.5 * (PCKh[lank] + PCKh[rank])), ('PCKh', np.sum(PCKh * jnt_ratio)), ('PCKh@0.1', np.sum(pckAll[11, :] * jnt_ratio)) ] name_value = OrderedDict(name_value) return name_value def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): """sort kpts and remove the repeated ones.""" kpts = sorted(kpts, key=lambda x: x[key]) num = len(kpts) for i in range(num - 1, 0, -1): if kpts[i][key] == kpts[i - 1][key]: del kpts[i] return kpts ================================================ FILE: ppdet/metrics/lvis_utils.py ================================================ # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys import numpy as np import itertools from ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res, get_pose3d_res from ppdet.metrics.map_utils import draw_pr_curve from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) def lvisapi_eval(jsonfile, style, lvis_gt=None, anno_file=None, max_dets=(100, 300, 1000), classwise=False, sigmas=None, use_area=True): """ Args: jsonfile (str): Evaluation json file, eg: bbox.json, mask.json. style (str): COCOeval style, can be `bbox` , `segm` , `proposal`, `keypoints` and `keypoints_crowd`. coco_gt (str): Whether to load COCOAPI through anno_file, eg: coco_gt = COCO(anno_file) anno_file (str): COCO annotations file. max_dets (tuple): COCO evaluation maxDets. classwise (bool): Whether per-category AP and draw P-R Curve or not. sigmas (nparray): keypoint labelling sigmas. use_area (bool): If gt annotations (eg. CrowdPose, AIC) do not have 'area', please set use_area=False. """ assert lvis_gt != None or anno_file != None from lvis import LVIS, LVISEval, LVISResults if lvis_gt == None: # coco_gt = COCO(anno_file) lvis_gt = LVIS(anno_file) logger.info("Start evaluate...") lvis_dt = LVISResults(lvis_gt, jsonfile) lvis_eval = LVISEval(lvis_gt, lvis_dt, style) lvis_eval.evaluate() lvis_eval.accumulate() lvis_eval.summarize() if classwise: # Compute per-category AP and PR curve try: from terminaltables import AsciiTable except Exception as e: logger.error( 'terminaltables not found, plaese install terminaltables. 
' 'for example: `pip install terminaltables`.')
            raise e
        # NOTE: this block originally referenced the undefined names
        # coco_eval / coco_gt (left over from the COCO version of this
        # helper); rewritten here against the lvis-api objects. lvis-api's
        # precision array has shape (iou, recall, cls, area) with no
        # max_dets axis.
        precisions = lvis_eval.eval['precision']
        cat_ids = lvis_gt.get_cat_ids()
        # precision: (iou, recall, cls, area range)
        assert len(cat_ids) == precisions.shape[2]
        results_per_category = []
        for idx, catId in enumerate(cat_ids):
            # area range index 0: all area ranges
            nm = lvis_gt.load_cats([catId])[0]
            precision = precisions[:, :, idx, 0]
            precision = precision[precision > -1]
            if precision.size:
                ap = np.mean(precision)
            else:
                ap = float('nan')
            results_per_category.append(
                (str(nm["name"]), '{:0.3f}'.format(float(ap))))
            pr_array = precisions[0, :, idx, 0]
            recall_array = np.arange(0.0, 1.01, 0.01)
            draw_pr_curve(
                pr_array,
                recall_array,
                out_dir=style + '_pr_curve',
                file_name='{}_precision_recall_curve.jpg'.format(nm["name"]))

        num_columns = min(6, len(results_per_category) * 2)
        results_flatten = list(itertools.chain(*results_per_category))
        headers = ['category', 'AP'] * (num_columns // 2)
        results_2d = itertools.zip_longest(
            *[results_flatten[i::num_columns] for i in range(num_columns)])
        table_data = [headers]
        table_data += [result for result in results_2d]
        table = AsciiTable(table_data)
        logger.info('Per-category of {} AP: \n{}'.format(style, table.table))
        logger.info("per-category PR curve has output to {} folder.".format(
            style + '_pr_curve'))
    # flush evaluation result
    sys.stdout.flush()
    return lvis_eval.get_results()

================================================
FILE: ppdet/metrics/map_utils.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import sys
import numpy as np
import itertools
import paddle
from ppdet.modeling.rbox_utils import poly2rbox_np

from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)

__all__ = [
    'draw_pr_curve',
    'bbox_area',
    'jaccard_overlap',
    'prune_zero_padding',
    'DetectionMAP',
    'ap_per_class',
    'compute_ap',
]


def draw_pr_curve(precision,
                  recall,
                  iou=0.5,
                  out_dir='pr_curve',
                  file_name='precision_recall_curve.jpg'):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    output_path = os.path.join(out_dir, file_name)
    try:
        import matplotlib.pyplot as plt
    except Exception as e:
        logger.error('Matplotlib not found, please install matplotlib. '
                     'for example: `pip install matplotlib`.')
        raise e
    plt.cla()
    plt.figure('P-R Curve')
    plt.title('Precision/Recall Curve(IoU={})'.format(iou))
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.grid(True)
    plt.plot(recall, precision)
    plt.savefig(output_path)


def bbox_area(bbox, is_bbox_normalized):
    """
    Calculate area of a bounding box
    """
    norm = 1.
- float(is_bbox_normalized) width = bbox[2] - bbox[0] + norm height = bbox[3] - bbox[1] + norm return width * height def jaccard_overlap(pred, gt, is_bbox_normalized=False): """ Calculate jaccard overlap ratio between two bounding box """ if pred[0] >= gt[2] or pred[2] <= gt[0] or \ pred[1] >= gt[3] or pred[3] <= gt[1]: return 0. inter_xmin = max(pred[0], gt[0]) inter_ymin = max(pred[1], gt[1]) inter_xmax = min(pred[2], gt[2]) inter_ymax = min(pred[3], gt[3]) inter_size = bbox_area([inter_xmin, inter_ymin, inter_xmax, inter_ymax], is_bbox_normalized) pred_size = bbox_area(pred, is_bbox_normalized) gt_size = bbox_area(gt, is_bbox_normalized) overlap = float(inter_size) / (pred_size + gt_size - inter_size) return overlap def calc_rbox_iou(pred, gt_poly): """ calc iou between rotated bbox """ # calc iou of bounding box for speedup pred = np.array(pred, np.float32).reshape(-1, 2) gt_poly = np.array(gt_poly, np.float32).reshape(-1, 2) pred_rect = [ np.min(pred[:, 0]), np.min(pred[:, 1]), np.max(pred[:, 0]), np.max(pred[:, 1]) ] gt_rect = [ np.min(gt_poly[:, 0]), np.min(gt_poly[:, 1]), np.max(gt_poly[:, 0]), np.max(gt_poly[:, 1]) ] iou = jaccard_overlap(pred_rect, gt_rect, False) if iou <= 0: return iou # calc rbox iou pred_rbox = poly2rbox_np(pred.reshape(-1, 8)).reshape(-1, 5) gt_rbox = poly2rbox_np(gt_poly.reshape(-1, 8)).reshape(-1, 5) try: from ext_op import rbox_iou except Exception as e: print("import custom_ops error, try install ext_op " \ "following ppdet/ext_op/README.md", e) sys.stdout.flush() sys.exit(-1) pd_gt_rbox = paddle.to_tensor(gt_rbox, dtype='float32') pd_pred_rbox = paddle.to_tensor(pred_rbox, dtype='float32') iou = rbox_iou(pd_gt_rbox, pd_pred_rbox) iou = iou.numpy() return iou[0][0] def prune_zero_padding(gt_box, gt_label, difficult=None): valid_cnt = 0 for i in range(len(gt_box)): if (gt_box[i] == 0).all(): break valid_cnt += 1 return (gt_box[:valid_cnt], gt_label[:valid_cnt], difficult[:valid_cnt] if difficult is not None else None) class DetectionMAP(object): """ Calculate detection mean average precision. Currently support two types: 11point and integral Args: class_num (int): The class number. overlap_thresh (float): The threshold of overlap ratio between prediction bounding box and ground truth bounding box for deciding true/false positive. Default 0.5. map_type (str): Calculation method of mean average precision, currently support '11point' and 'integral'. Default '11point'. is_bbox_normalized (bool): Whether bounding boxes is normalized to range[0, 1]. Default False. evaluate_difficult (bool): Whether to evaluate difficult bounding boxes. Default False. catid2name (dict): Mapping between category id and category name. classwise (bool): Whether per-category AP and draw P-R Curve or not. """ def __init__(self, class_num, overlap_thresh=0.5, map_type='11point', is_bbox_normalized=False, evaluate_difficult=False, catid2name=None, classwise=False): self.class_num = class_num self.overlap_thresh = overlap_thresh assert map_type in ['11point', 'integral'], \ "map_type currently only support '11point' "\ "and 'integral'" self.map_type = map_type self.is_bbox_normalized = is_bbox_normalized self.evaluate_difficult = evaluate_difficult self.classwise = classwise self.classes = [] for cname in catid2name.values(): self.classes.append(cname) self.reset() def update(self, bbox, score, label, gt_box, gt_label, difficult=None): """ Update metric statics from given prediction and ground truth infomations. 
""" if difficult is None: difficult = np.zeros_like(gt_label) # record class gt count for gtl, diff in zip(gt_label, difficult): if self.evaluate_difficult or int(diff) == 0: self.class_gt_counts[int(np.array(gtl))] += 1 # record class score positive visited = [False] * len(gt_label) for b, s, l in zip(bbox, score, label): pred = b.tolist() if isinstance(b, np.ndarray) else b max_idx = -1 max_overlap = -1.0 for i, gl in enumerate(gt_label): if int(gl) == int(l): if len(gt_box[i]) == 8: overlap = calc_rbox_iou(pred, gt_box[i]) else: overlap = jaccard_overlap(pred, gt_box[i], self.is_bbox_normalized) if overlap > max_overlap: max_overlap = overlap max_idx = i if max_overlap > self.overlap_thresh: if self.evaluate_difficult or \ int(np.array(difficult[max_idx])) == 0: if not visited[max_idx]: self.class_score_poss[int(l)].append([s, 1.0]) visited[max_idx] = True else: self.class_score_poss[int(l)].append([s, 0.0]) else: self.class_score_poss[int(l)].append([s, 0.0]) def reset(self): """ Reset metric statics """ self.class_score_poss = [[] for _ in range(self.class_num)] self.class_gt_counts = [0] * self.class_num self.mAP = 0.0 def accumulate(self): """ Accumulate metric results and calculate mAP """ mAP = 0. valid_cnt = 0 eval_results = [] for score_pos, count in zip(self.class_score_poss, self.class_gt_counts): if count == 0: continue if len(score_pos) == 0: valid_cnt += 1 continue accum_tp_list, accum_fp_list = \ self._get_tp_fp_accum(score_pos) precision = [] recall = [] for ac_tp, ac_fp in zip(accum_tp_list, accum_fp_list): precision.append(float(ac_tp) / (ac_tp + ac_fp)) recall.append(float(ac_tp) / count) one_class_ap = 0.0 if self.map_type == '11point': max_precisions = [0.] * 11 start_idx = len(precision) - 1 for j in range(10, -1, -1): for i in range(start_idx, -1, -1): if recall[i] < float(j) / 10.: start_idx = i if j > 0: max_precisions[j - 1] = max_precisions[j] break else: if max_precisions[j] < precision[i]: max_precisions[j] = precision[i] one_class_ap = sum(max_precisions) / 11. mAP += one_class_ap valid_cnt += 1 elif self.map_type == 'integral': import math prev_recall = 0. for i in range(len(precision)): recall_gap = math.fabs(recall[i] - prev_recall) if recall_gap > 1e-6: one_class_ap += precision[i] * recall_gap prev_recall = recall[i] mAP += one_class_ap valid_cnt += 1 else: logger.error("Unspported mAP type {}".format(self.map_type)) sys.exit(1) eval_results.append({ 'class': self.classes[valid_cnt - 1], 'ap': one_class_ap, 'precision': precision, 'recall': recall, }) self.eval_results = eval_results self.mAP = mAP / float(valid_cnt) if valid_cnt > 0 else mAP def get_map(self): """ Get mAP result """ if self.mAP is None: logger.error("mAP is not calculated.") if self.classwise: # Compute per-category AP and PR curve try: from terminaltables import AsciiTable except Exception as e: logger.error( 'terminaltables not found, plaese install terminaltables. 
'for example: `pip install terminaltables`.') raise e results_per_category = [] for eval_result in self.eval_results: results_per_category.append( (str(eval_result['class']), '{:0.3f}'.format(float(eval_result['ap'])))) draw_pr_curve( eval_result['precision'], eval_result['recall'], out_dir='voc_pr_curve', file_name='{}_precision_recall_curve.jpg'.format( eval_result['class'])) num_columns = min(6, len(results_per_category) * 2) results_flatten = list(itertools.chain(*results_per_category)) headers = ['category', 'AP'] * (num_columns // 2) results_2d = itertools.zip_longest(* [ results_flatten[i::num_columns] for i in range(num_columns) ]) table_data = [headers] table_data += [result for result in results_2d] table = AsciiTable(table_data) logger.info('Per-category VOC AP:\n{}'.format(table.table)) logger.info( "Per-category PR curves have been saved to the voc_pr_curve folder.") return self.mAP def _get_tp_fp_accum(self, score_pos_list): """ Calculate accumulating true/false positive results from [score, pos] records """ sorted_list = sorted(score_pos_list, key=lambda s: s[0], reverse=True) accum_tp = 0 accum_fp = 0 accum_tp_list = [] accum_fp_list = [] for (score, pos) in sorted_list: accum_tp += int(pos) accum_tp_list.append(accum_tp) accum_fp += 1 - int(pos) accum_fp_list.append(accum_fp) return accum_tp_list, accum_fp_list def ap_per_class(tp, conf, pred_cls, target_cls): """ Computes the average precision, given the recall and precision curves. Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics. Args: tp (list): True positives. conf (list): Objectness value from 0-1. pred_cls (list): Predicted object classes. target_cls (list): Target object classes. """ tp, conf, pred_cls, target_cls = np.array(tp), np.array(conf), np.array( pred_cls), np.array(target_cls) # Sort by objectness i = np.argsort(-conf) tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] # Find unique classes unique_classes = np.unique(np.concatenate((pred_cls, target_cls), 0)) # Create Precision-Recall curve and compute AP for each class ap, p, r = [], [], [] for c in unique_classes: i = pred_cls == c n_gt = sum(target_cls == c) # Number of ground truth objects n_p = sum(i) # Number of predicted objects if (n_p == 0) and (n_gt == 0): continue elif (n_p == 0) or (n_gt == 0): ap.append(0) r.append(0) p.append(0) else: # Accumulate FPs and TPs fpc = np.cumsum(1 - tp[i]) tpc = np.cumsum(tp[i]) # Recall recall_curve = tpc / (n_gt + 1e-16) r.append(tpc[-1] / (n_gt + 1e-16)) # Precision precision_curve = tpc / (tpc + fpc) p.append(tpc[-1] / (tpc[-1] + fpc[-1])) # AP from recall-precision curve ap.append(compute_ap(recall_curve, precision_curve)) return np.array(ap), unique_classes.astype('int32'), np.array(r), np.array( p) def compute_ap(recall, precision): """ Computes the average precision, given the recall and precision curves. Code originally from https://github.com/rbgirshick/py-faster-rcnn. Args: recall (list): The recall curve. precision (list): The precision curve. Returns: The average precision as computed in py-faster-rcnn.
""" # correct AP calculation # first append sentinel values at the end mrec = np.concatenate(([0.], recall, [1.])) mpre = np.concatenate(([0.], precision, [0.])) # compute the precision envelope for i in range(mpre.size - 1, 0, -1): mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) # to calculate area under PR curve, look for points # where X axis (recall) changes value i = np.where(mrec[1:] != mrec[:-1])[0] # and sum (\Delta recall) * prec ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) return ap ================================================ FILE: ppdet/metrics/mcmot_metrics.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import copy import sys import math from collections import defaultdict import numpy as np import pandas as pd from .metrics import Metric try: import motmetrics as mm from motmetrics.math_util import quiet_divide metrics = mm.metrics.motchallenge_metrics mh = mm.metrics.create() except: print( 'Warning: Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' ) pass from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = ['MCMOTEvaluator', 'MCMOTMetric'] METRICS_LIST = [ 'num_frames', 'num_matches', 'num_switches', 'num_transfer', 'num_ascend', 'num_migrate', 'num_false_positives', 'num_misses', 'num_detections', 'num_objects', 'num_predictions', 'num_unique_objects', 'mostly_tracked', 'partially_tracked', 'mostly_lost', 'num_fragmentations', 'motp', 'mota', 'precision', 'recall', 'idfp', 'idfn', 'idtp', 'idp', 'idr', 'idf1' ] NAME_MAP = { 'num_frames': 'num_frames', 'num_matches': 'num_matches', 'num_switches': 'IDs', 'num_transfer': 'IDt', 'num_ascend': 'IDa', 'num_migrate': 'IDm', 'num_false_positives': 'FP', 'num_misses': 'FN', 'num_detections': 'num_detections', 'num_objects': 'num_objects', 'num_predictions': 'num_predictions', 'num_unique_objects': 'GT', 'mostly_tracked': 'MT', 'partially_tracked': 'partially_tracked', 'mostly_lost': 'ML', 'num_fragmentations': 'FM', 'motp': 'MOTP', 'mota': 'MOTA', 'precision': 'Prcn', 'recall': 'Rcll', 'idfp': 'idfp', 'idfn': 'idfn', 'idtp': 'idtp', 'idp': 'IDP', 'idr': 'IDR', 'idf1': 'IDF1' } def parse_accs_metrics(seq_acc, index_name, verbose=False): """ Parse the evaluation indicators of multiple MOTAccumulator """ mh = mm.metrics.create() summary = MCMOTEvaluator.get_summary(seq_acc, index_name, METRICS_LIST) summary.loc['OVERALL', 'motp'] = (summary['motp'] * summary['num_detections']).sum() / \ summary.loc['OVERALL', 'num_detections'] if verbose: strsummary = mm.io.render_summary( summary, formatters=mh.formatters, namemap=NAME_MAP) print(strsummary) return summary def seqs_overall_metrics(summary_df, verbose=False): """ Calculate overall metrics for multiple sequences """ add_col = [ 
'num_frames', 'num_matches', 'num_switches', 'num_transfer', 'num_ascend', 'num_migrate', 'num_false_positives', 'num_misses', 'num_detections', 'num_objects', 'num_predictions', 'num_unique_objects', 'mostly_tracked', 'partially_tracked', 'mostly_lost', 'num_fragmentations', 'idfp', 'idfn', 'idtp' ] calc_col = ['motp', 'mota', 'precision', 'recall', 'idp', 'idr', 'idf1'] calc_df = summary_df.copy() overall_dic = {} for col in add_col: overall_dic[col] = calc_df[col].sum() for col in calc_col: overall_dic[col] = getattr(MCMOTMetricOverall, col + '_overall')( calc_df, overall_dic) overall_df = pd.DataFrame(overall_dic, index=['overall_calc']) calc_df = pd.concat([calc_df, overall_df]) if verbose: mh = mm.metrics.create() str_calc_df = mm.io.render_summary( calc_df, formatters=mh.formatters, namemap=NAME_MAP) print(str_calc_df) return calc_df class MCMOTMetricOverall(object): def motp_overall(summary_df, overall_dic): motp = quiet_divide((summary_df['motp'] * summary_df['num_detections']).sum(), overall_dic['num_detections']) return motp def mota_overall(summary_df, overall_dic): del summary_df mota = 1. - quiet_divide( (overall_dic['num_misses'] + overall_dic['num_switches'] + overall_dic['num_false_positives']), overall_dic['num_objects']) return mota def precision_overall(summary_df, overall_dic): del summary_df precision = quiet_divide(overall_dic['num_detections'], ( overall_dic['num_false_positives'] + overall_dic['num_detections'])) return precision def recall_overall(summary_df, overall_dic): del summary_df recall = quiet_divide(overall_dic['num_detections'], overall_dic['num_objects']) return recall def idp_overall(summary_df, overall_dic): del summary_df idp = quiet_divide(overall_dic['idtp'], (overall_dic['idtp'] + overall_dic['idfp'])) return idp def idr_overall(summary_df, overall_dic): del summary_df idr = quiet_divide(overall_dic['idtp'], (overall_dic['idtp'] + overall_dic['idfn'])) return idr def idf1_overall(summary_df, overall_dic): del summary_df idf1 = quiet_divide(2. 
* overall_dic['idtp'], ( overall_dic['num_objects'] + overall_dic['num_predictions'])) return idf1 def read_mcmot_results_union(filename, is_gt, is_ignore): results_dict = dict() if os.path.isfile(filename): all_result = np.loadtxt(filename, delimiter=',') if all_result.shape[0] == 0 or all_result.shape[1] < 7: return results_dict if is_ignore: return results_dict if is_gt: # only for test use all_result = all_result[all_result[:, 7] != 0] all_result[:, 7] = all_result[:, 7] - 1 if all_result.shape[0] == 0: return results_dict class_unique = np.unique(all_result[:, 7]) last_max_id = 0 result_cls_list = [] for cls in class_unique: result_cls_split = all_result[all_result[:, 7] == cls] result_cls_split[:, 1] = result_cls_split[:, 1] + last_max_id # make sure track id different between every category last_max_id = max(np.unique(result_cls_split[:, 1])) + 1 result_cls_list.append(result_cls_split) results_con = np.concatenate(result_cls_list) for line in range(len(results_con)): linelist = results_con[line] fid = int(linelist[0]) if fid < 1: continue results_dict.setdefault(fid, list()) if is_gt: score = 1 else: score = float(linelist[6]) tlwh = tuple(map(float, linelist[2:6])) target_id = int(linelist[1]) cls = int(linelist[7]) results_dict[fid].append((tlwh, target_id, cls, score)) return results_dict def read_mcmot_results(filename, is_gt, is_ignore): results_dict = dict() if os.path.isfile(filename): with open(filename, 'r') as f: for line in f.readlines(): linelist = line.strip().split(',') if len(linelist) < 7: continue fid = int(linelist[0]) if fid < 1: continue cid = int(linelist[7]) if is_gt: score = 1 # only for test use cid -= 1 else: score = float(linelist[6]) cls_result_dict = results_dict.setdefault(cid, dict()) cls_result_dict.setdefault(fid, list()) tlwh = tuple(map(float, linelist[2:6])) target_id = int(linelist[1]) cls_result_dict[fid].append((tlwh, target_id, score)) return results_dict def read_results(filename, data_type, is_gt=False, is_ignore=False, multi_class=False, union=False): if data_type in ['mcmot', 'lab']: if multi_class: if union: # The results are evaluated by union all the categories. # Track IDs between different categories cannot be duplicate. read_fun = read_mcmot_results_union else: # The results are evaluated separately by category. read_fun = read_mcmot_results else: raise ValueError('multi_class: {}, MCMOT should have cls_id.'. 
format(multi_class)) else: raise ValueError('Unknown data type: {}'.format(data_type)) return read_fun(filename, is_gt, is_ignore) def unzip_objs(objs): if len(objs) > 0: tlwhs, ids, scores = zip(*objs) else: tlwhs, ids, scores = [], [], [] tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) return tlwhs, ids, scores def unzip_objs_cls(objs): if len(objs) > 0: tlwhs, ids, cls, scores = zip(*objs) else: tlwhs, ids, cls, scores = [], [], [], [] tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) ids = np.array(ids) cls = np.array(cls) scores = np.array(scores) return tlwhs, ids, cls, scores class MCMOTEvaluator(object): def __init__(self, data_root, seq_name, data_type, num_classes): self.data_root = data_root self.seq_name = seq_name self.data_type = data_type self.num_classes = num_classes self.load_annotations() try: import motmetrics as mm mm.lap.default_solver = 'lap' except Exception as e: raise RuntimeError( 'Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' ) self.reset_accumulator() self.class_accs = [] def load_annotations(self): assert self.data_type == 'mcmot' self.gt_filename = os.path.join(self.data_root, '../', 'sequences', '{}.txt'.format(self.seq_name)) if not os.path.exists(self.gt_filename): logger.warning( "gt_filename '{}' of MCMOTEvaluator does not exist, so the MOTA will be -INF.".format(self.gt_filename)) def reset_accumulator(self): self.acc = mm.MOTAccumulator(auto_id=True) def eval_frame_dict(self, trk_objs, gt_objs, rtn_events=False, union=False): if union: trk_tlwhs, trk_ids, trk_cls = unzip_objs_cls(trk_objs)[:3] gt_tlwhs, gt_ids, gt_cls = unzip_objs_cls(gt_objs)[:3] # get distance matrix iou_distance = mm.distances.iou_matrix( gt_tlwhs, trk_tlwhs, max_iou=0.5) # Set the distance between objects of different categories to nan gt_cls_len = len(gt_cls) trk_cls_len = len(trk_cls) # When the number of GT or Trk is 0, iou_distance dimension is (0,0) if gt_cls_len != 0 and trk_cls_len != 0: gt_cls = gt_cls.reshape(gt_cls_len, 1) gt_cls = np.repeat(gt_cls, trk_cls_len, axis=1) trk_cls = trk_cls.reshape(1, trk_cls_len) trk_cls = np.repeat(trk_cls, gt_cls_len, axis=0) iou_distance = np.where(gt_cls == trk_cls, iou_distance, np.nan) else: trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2] gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2] # get distance matrix iou_distance = mm.distances.iou_matrix( gt_tlwhs, trk_tlwhs, max_iou=0.5) self.acc.update(gt_ids, trk_ids, iou_distance) if rtn_events and iou_distance.size > 0 and hasattr(self.acc, 'mot_events'): events = self.acc.mot_events # only supported by https://github.com/longcw/py-motmetrics else: events = None return events def eval_file(self, result_filename): # evaluation of each category gt_frame_dict = read_results( self.gt_filename, self.data_type, is_gt=True, multi_class=True, union=False) result_frame_dict = read_results( result_filename, self.data_type, is_gt=False, multi_class=True, union=False) for cid in range(self.num_classes): self.reset_accumulator() cls_result_frame_dict = result_frame_dict.setdefault(cid, dict()) cls_gt_frame_dict = gt_frame_dict.setdefault(cid, dict()) # only labeled frames will be evaluated frames = sorted(list(set(cls_gt_frame_dict.keys()))) for frame_id in frames: trk_objs = cls_result_frame_dict.get(frame_id, []) gt_objs = cls_gt_frame_dict.get(frame_id, []) self.eval_frame_dict(trk_objs, gt_objs, rtn_events=False) self.class_accs.append(self.acc) return self.class_accs @staticmethod def get_summary(accs, names, metrics=('mota',
'num_switches', 'idp', 'idr', 'idf1', 'precision', 'recall')): names = copy.deepcopy(names) if metrics is None: metrics = mm.metrics.motchallenge_metrics metrics = copy.deepcopy(metrics) mh = mm.metrics.create() summary = mh.compute_many( accs, metrics=metrics, names=names, generate_overall=True) return summary @staticmethod def save_summary(summary, filename): import pandas as pd writer = pd.ExcelWriter(filename) summary.to_excel(writer) writer.save() class MCMOTMetric(Metric): def __init__(self, num_classes, save_summary=False): self.num_classes = num_classes self.save_summary = save_summary self.MCMOTEvaluator = MCMOTEvaluator self.result_root = None self.reset() self.seqs_overall = defaultdict(list) def reset(self): self.accs = [] self.seqs = [] def update(self, data_root, seq, data_type, result_root, result_filename): evaluator = self.MCMOTEvaluator(data_root, seq, data_type, self.num_classes) seq_acc = evaluator.eval_file(result_filename) self.accs.append(seq_acc) self.seqs.append(seq) self.result_root = result_root cls_index_name = [ '{}_{}'.format(seq, i) for i in range(self.num_classes) ] summary = parse_accs_metrics(seq_acc, cls_index_name) summary.rename( index={'OVERALL': '{}_OVERALL'.format(seq)}, inplace=True) for row in range(len(summary)): self.seqs_overall[row].append(summary.iloc[row:row + 1]) def accumulate(self): self.cls_summary_list = [] for row in range(self.num_classes): seqs_cls_df = pd.concat(self.seqs_overall[row]) seqs_cls_summary = seqs_overall_metrics(seqs_cls_df) cls_summary_overall = seqs_cls_summary.iloc[-1:].copy() cls_summary_overall.rename( index={'overall_calc': 'overall_calc_{}'.format(row)}, inplace=True) self.cls_summary_list.append(cls_summary_overall) def log(self): seqs_summary = seqs_overall_metrics( pd.concat(self.seqs_overall[self.num_classes]), verbose=True) class_summary = seqs_overall_metrics( pd.concat(self.cls_summary_list), verbose=True) def get_results(self): return 1 ================================================ FILE: ppdet/metrics/metrics.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
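# ---------------------------------------------------------------------------
# NOTE (illustrative sketch, not part of the upstream file): every Metric
# subclass below follows the same reset -> update -> accumulate ->
# get_results cycle. A minimal, hypothetical driver for the DetectionMAP
# helper this module imports from map_utils would look like:
#
#     import numpy as np
#     from ppdet.metrics.map_utils import DetectionMAP
#
#     dmap = DetectionMAP(class_num=1, overlap_thresh=0.5,
#                         map_type='11point', catid2name={0: 'person'})
#     dmap.update(bbox=np.array([[12., 11., 49., 52.]]),
#                 score=np.array([0.9]), label=np.array([0]),
#                 gt_box=np.array([[10., 10., 50., 50.]]),
#                 gt_label=np.array([[0]]))
#     dmap.accumulate()
#     dmap.get_map()  # -> 1.0, the single prediction matches its GT box
# ---------------------------------------------------------------------------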
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys import json import paddle import numpy as np import typing from collections import defaultdict from pathlib import Path from .map_utils import prune_zero_padding, DetectionMAP from .coco_utils import get_infer_results, cocoapi_eval from .lvis_utils import lvisapi_eval from .widerface_utils import (face_eval_run, image_eval, img_pr_info, dataset_pr_info, voc_ap) from ppdet.data.source.category import get_categories from ppdet.modeling.rbox_utils import poly2rbox_np from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'Metric', 'COCOMetric', 'VOCMetric', 'WiderFaceMetric', 'get_infer_results', 'RBoxMetric', 'SNIPERCOCOMetric', 'LVISMetric' ] COCO_SIGMAS = np.array([ .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89 ]) / 10.0 CROWD_SIGMAS = np.array( [.79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, .79, .79]) / 10.0 class Metric(paddle.metric.Metric): def name(self): return self.__class__.__name__ def reset(self): pass def accumulate(self): pass # paddle.metric.Metric defined :metch:`update`, :meth:`accumulate` # :metch:`reset`, in ppdet, we also need following 2 methods: # abstract method for logging metric results def log(self): pass # abstract method for getting metric results def get_results(self): pass class COCOMetric(Metric): def __init__(self, anno_file, **kwargs): self.anno_file = anno_file self.clsid2catid = kwargs.get('clsid2catid', None) if self.clsid2catid is None: self.clsid2catid, _ = get_categories('COCO', anno_file) self.classwise = kwargs.get('classwise', False) self.output_eval = kwargs.get('output_eval', None) # TODO: bias should be unified self.bias = kwargs.get('bias', 0) self.save_prediction_only = kwargs.get('save_prediction_only', False) self.iou_type = kwargs.get('IouType', 'bbox') if not self.save_prediction_only: assert os.path.isfile(anno_file), \ "anno_file {} not a file".format(anno_file) if self.output_eval is not None: Path(self.output_eval).mkdir(exist_ok=True) self.save_threshold = kwargs.get('save_threshold', 0) self.reset() def reset(self): # only bbox and mask evaluation support currently self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []} self.eval_results = {} def update(self, inputs, outputs): outs = {} # outputs Tensor -> numpy.ndarray for k, v in outputs.items(): outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v # multi-scale inputs: all inputs have same im_id if isinstance(inputs, typing.Sequence): im_id = inputs[0]['im_id'] else: im_id = inputs['im_id'] outs['im_id'] = im_id.numpy() if isinstance(im_id, paddle.Tensor) else im_id if 'im_file' in inputs: outs['im_file'] = inputs['im_file'] infer_results = get_infer_results( outs, self.clsid2catid, bias=self.bias, save_threshold=self.save_threshold) self.results['bbox'] += infer_results[ 'bbox'] if 'bbox' in infer_results else [] self.results['mask'] += infer_results[ 'mask'] if 'mask' in infer_results else [] self.results['segm'] += infer_results[ 'segm'] if 'segm' in infer_results else [] self.results['keypoint'] += infer_results[ 'keypoint'] if 'keypoint' in infer_results else [] def accumulate(self): if len(self.results['bbox']) > 0: output = "bbox.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results['bbox'], f) logger.info('The bbox result is saved to bbox.json.') if 
self.save_prediction_only: logger.info('The bbox result is saved to {} and do not ' 'evaluate the mAP.'.format(output)) else: bbox_stats = cocoapi_eval( output, 'bbox', anno_file=self.anno_file, classwise=self.classwise) self.eval_results['bbox'] = bbox_stats sys.stdout.flush() if len(self.results['mask']) > 0: output = "mask.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results['mask'], f) logger.info('The mask result is saved to mask.json.') if self.save_prediction_only: logger.info('The mask result is saved to {} and do not ' 'evaluate the mAP.'.format(output)) else: seg_stats = cocoapi_eval( output, 'segm', anno_file=self.anno_file, classwise=self.classwise) self.eval_results['mask'] = seg_stats sys.stdout.flush() if len(self.results['segm']) > 0: output = "segm.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results['segm'], f) logger.info('The segm result is saved to segm.json.') if self.save_prediction_only: logger.info('The segm result is saved to {} and do not ' 'evaluate the mAP.'.format(output)) else: seg_stats = cocoapi_eval( output, 'segm', anno_file=self.anno_file, classwise=self.classwise) self.eval_results['mask'] = seg_stats sys.stdout.flush() if len(self.results['keypoint']) > 0: output = "keypoint.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results['keypoint'], f) logger.info('The keypoint result is saved to keypoint.json.') if self.save_prediction_only: logger.info('The keypoint result is saved to {} and do not ' 'evaluate the mAP.'.format(output)) else: style = 'keypoints' use_area = True sigmas = COCO_SIGMAS if self.iou_type == 'keypoints_crowd': style = 'keypoints_crowd' use_area = False sigmas = CROWD_SIGMAS keypoint_stats = cocoapi_eval( output, style, anno_file=self.anno_file, classwise=self.classwise, sigmas=sigmas, use_area=use_area) self.eval_results['keypoint'] = keypoint_stats sys.stdout.flush() def log(self): pass def get_results(self): return self.eval_results class LVISMetric(Metric): def __init__(self, anno_file, **kwargs): self.anno_file = anno_file self.clsid2catid = kwargs.get('clsid2catid', None) if self.clsid2catid is None: self.clsid2catid, _ = get_categories('COCO', anno_file) self.classwise = kwargs.get('classwise', False) self.output_eval = kwargs.get('output_eval', None) # TODO: bias should be unified self.bias = kwargs.get('bias', 0) self.save_prediction_only = kwargs.get('save_prediction_only', False) self.iou_type = kwargs.get('IouType', 'bbox') if not self.save_prediction_only: assert os.path.isfile(anno_file), \ "anno_file {} not a file".format(anno_file) if self.output_eval is not None: Path(self.output_eval).mkdir(exist_ok=True) self.reset() def reset(self): # only bbox and mask evaluation support currently self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []} self.eval_results = {} def update(self, inputs, outputs): outs = {} # outputs Tensor -> numpy.ndarray for k, v in outputs.items(): outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v # multi-scale inputs: all inputs have same im_id if isinstance(inputs, typing.Sequence): im_id = inputs[0]['im_id'] else: im_id = inputs['im_id'] outs['im_id'] = im_id.numpy() if isinstance(im_id, paddle.Tensor) else im_id infer_results = get_infer_results( outs, self.clsid2catid, bias=self.bias) self.results['bbox'] += infer_results[ 'bbox'] if 'bbox' in 
infer_results else [] self.results['mask'] += infer_results[ 'mask'] if 'mask' in infer_results else [] self.results['segm'] += infer_results[ 'segm'] if 'segm' in infer_results else [] self.results['keypoint'] += infer_results[ 'keypoint'] if 'keypoint' in infer_results else [] def accumulate(self): if len(self.results['bbox']) > 0: output = "bbox.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results['bbox'], f) logger.info('The bbox result is saved to bbox.json.') if self.save_prediction_only: logger.info('The bbox result is saved to {} and do not ' 'evaluate the mAP.'.format(output)) else: bbox_stats = lvisapi_eval( output, 'bbox', anno_file=self.anno_file, classwise=self.classwise ) self.eval_results['bbox'] = bbox_stats sys.stdout.flush() if len(self.results['mask']) > 0: output = "mask.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results['mask'], f) logger.info('The mask result is saved to mask.json.') if self.save_prediction_only: logger.info('The mask result is saved to {} and do not ' 'evaluate the mAP.'.format(output)) else: seg_stats = cocoapi_eval( output, 'segm', anno_file=self.anno_file, classwise=self.classwise) self.eval_results['mask'] = seg_stats sys.stdout.flush() if len(self.results['segm']) > 0: output = "segm.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results['segm'], f) logger.info('The segm result is saved to segm.json.') if self.save_prediction_only: logger.info('The segm result is saved to {} and do not ' 'evaluate the mAP.'.format(output)) else: seg_stats = cocoapi_eval( output, 'segm', anno_file=self.anno_file, classwise=self.classwise) self.eval_results['mask'] = seg_stats sys.stdout.flush() if len(self.results['keypoint']) > 0: output = "keypoint.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results['keypoint'], f) logger.info('The keypoint result is saved to keypoint.json.') if self.save_prediction_only: logger.info('The keypoint result is saved to {} and do not ' 'evaluate the mAP.'.format(output)) else: style = 'keypoints' use_area = True sigmas = COCO_SIGMAS if self.iou_type == 'keypoints_crowd': style = 'keypoints_crowd' use_area = False sigmas = CROWD_SIGMAS keypoint_stats = cocoapi_eval( output, style, anno_file=self.anno_file, classwise=self.classwise, sigmas=sigmas, use_area=use_area) self.eval_results['keypoint'] = keypoint_stats sys.stdout.flush() def log(self): # pass logger.info(self.eval_results['bbox']) def get_results(self): return self.eval_results class VOCMetric(Metric): def __init__(self, label_list, class_num=20, overlap_thresh=0.5, map_type='11point', is_bbox_normalized=False, evaluate_difficult=False, classwise=False, output_eval=None, save_prediction_only=False): assert os.path.isfile(label_list), \ "label_list {} not a file".format(label_list) self.clsid2catid, self.catid2name = get_categories('VOC', label_list) self.overlap_thresh = overlap_thresh self.map_type = map_type self.evaluate_difficult = evaluate_difficult self.output_eval = output_eval self.save_prediction_only = save_prediction_only self.detection_map = DetectionMAP( class_num=class_num, overlap_thresh=overlap_thresh, map_type=map_type, is_bbox_normalized=is_bbox_normalized, evaluate_difficult=evaluate_difficult, catid2name=self.catid2name, classwise=classwise) self.reset() def 
reset(self): self.results = {'bbox': [], 'score': [], 'label': []} self.detection_map.reset() def update(self, inputs, outputs): bbox_np = outputs['bbox'].numpy() if isinstance( outputs['bbox'], paddle.Tensor) else outputs['bbox'] bboxes = bbox_np[:, 2:] scores = bbox_np[:, 1] labels = bbox_np[:, 0] bbox_lengths = outputs['bbox_num'].numpy() if isinstance( outputs['bbox_num'], paddle.Tensor) else outputs['bbox_num'] self.results['bbox'].append(bboxes.tolist()) self.results['score'].append(scores.tolist()) self.results['label'].append(labels.tolist()) if bboxes.shape == (1, 1) or bboxes is None: return if self.save_prediction_only: return gt_boxes = inputs['gt_bbox'] gt_labels = inputs['gt_class'] difficults = inputs['difficult'] if not self.evaluate_difficult \ else None if 'scale_factor' in inputs: scale_factor = inputs['scale_factor'].numpy() if isinstance( inputs['scale_factor'], paddle.Tensor) else inputs['scale_factor'] else: scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32') bbox_idx = 0 for i in range(len(gt_boxes)): gt_box = gt_boxes[i].numpy() if isinstance( gt_boxes[i], paddle.Tensor) else gt_boxes[i] h, w = scale_factor[i] gt_box = gt_box / np.array([w, h, w, h]) gt_label = gt_labels[i].numpy() if isinstance( gt_labels[i], paddle.Tensor) else gt_labels[i] if difficults is not None: difficult = difficults[i].numpy() if isinstance( difficults[i], paddle.Tensor) else difficults[i] else: difficult = None bbox_num = bbox_lengths[i] bbox = bboxes[bbox_idx:bbox_idx + bbox_num] score = scores[bbox_idx:bbox_idx + bbox_num] label = labels[bbox_idx:bbox_idx + bbox_num] gt_box, gt_label, difficult = prune_zero_padding(gt_box, gt_label, difficult) self.detection_map.update(bbox, score, label, gt_box, gt_label, difficult) bbox_idx += bbox_num def accumulate(self): output = "bbox.json" if self.output_eval: output = os.path.join(self.output_eval, output) with open(output, 'w') as f: json.dump(self.results, f) logger.info('The bbox result is saved to bbox.json.') if self.save_prediction_only: return logger.info("Accumulating evaluatation results...") self.detection_map.accumulate() def log(self): map_stat = 100. 
* self.detection_map.get_map() logger.info("mAP({:.2f}, {}) = {:.2f}%".format(self.overlap_thresh, self.map_type, map_stat)) def get_results(self): return {'bbox': [self.detection_map.get_map()]} class WiderFaceMetric(Metric): def __init__(self, iou_thresh=0.5): self.iou_thresh = iou_thresh self.reset() def reset(self): self.pred_boxes_list = [] self.gt_boxes_list = [] self.aps = [] self.hard_ignore_list = [] self.medium_ignore_list = [] self.easy_ignore_list = [] def update(self, data, outs): batch_pred_bboxes = outs['bbox'] batch_pred_bboxes_num = outs['bbox_num'] assert len(batch_pred_bboxes_num) == len(data['gt_bbox']) batch_size = len(data['gt_bbox']) box_cnt = 0 for batch_id in range(batch_size): pred_bboxes_num = batch_pred_bboxes_num[batch_id] pred_bboxes = batch_pred_bboxes[box_cnt: box_cnt + pred_bboxes_num].numpy() box_cnt += pred_bboxes_num det_conf = pred_bboxes[:, 1] det_xmin = pred_bboxes[:, 2] det_ymin = pred_bboxes[:, 3] det_xmax = pred_bboxes[:, 4] det_ymax = pred_bboxes[:, 5] det = np.column_stack((det_xmin, det_ymin, det_xmax, det_ymax, det_conf)) self.pred_boxes_list.append(det) # xyxy conf self.gt_boxes_list.append(data['gt_ori_bbox'][batch_id].numpy()) # xywh self.hard_ignore_list.append( data['gt_hard_ignore'][batch_id].numpy()) self.medium_ignore_list.append( data['gt_medium_ignore'][batch_id].numpy()) self.easy_ignore_list.append( data['gt_easy_ignore'][batch_id].numpy()) def accumulate(self): total_num = len(self.gt_boxes_list) settings = ['easy', 'medium', 'hard'] setting_ingores = [self.easy_ignore_list, self.medium_ignore_list, self.hard_ignore_list] thresh_num = 1000 aps = [] for setting_id in range(3): count_face = 0 pr_curve = np.zeros((thresh_num, 2)).astype(np.float32) gt_ignore_list = setting_ingores[setting_id] for i in range(total_num): pred_boxes = self.pred_boxes_list[i] # xyxy conf gt_boxes = self.gt_boxes_list[i] # xywh ignore = gt_ignore_list[i] count_face += np.sum(ignore) if len(gt_boxes) == 0 or len(pred_boxes) == 0: continue pred_recall, proposal_list = image_eval(pred_boxes, gt_boxes, ignore, self.iou_thresh) _img_pr_info = img_pr_info(thresh_num, pred_boxes, proposal_list, pred_recall) pr_curve += _img_pr_info pr_curve = dataset_pr_info(thresh_num, pr_curve, count_face) propose = pr_curve[:, 0] recall = pr_curve[:, 1] ap = voc_ap(recall, propose) aps.append(ap) self.aps = aps def log(self): logger.info("==================== Results ====================") logger.info("Easy Val AP: {}".format(self.aps[0])) logger.info("Medium Val AP: {}".format(self.aps[1])) logger.info("Hard Val AP: {}".format(self.aps[2])) logger.info("=================================================") def get_results(self): return { 'easy_ap': self.aps[0], 'medium_ap': self.aps[1], 'hard_ap': self.aps[2]} class RBoxMetric(Metric): def __init__(self, anno_file, **kwargs): self.anno_file = anno_file self.clsid2catid, self.catid2name = get_categories('RBOX', anno_file) self.catid2clsid = {v: k for k, v in self.clsid2catid.items()} self.classwise = kwargs.get('classwise', False) self.output_eval = kwargs.get('output_eval', None) self.save_prediction_only = kwargs.get('save_prediction_only', False) self.overlap_thresh = kwargs.get('overlap_thresh', 0.5) self.map_type = kwargs.get('map_type', '11point') self.evaluate_difficult = kwargs.get('evaluate_difficult', False) self.imid2path = kwargs.get('imid2path', None) class_num = len(self.catid2name) self.detection_map = DetectionMAP( class_num=class_num, overlap_thresh=self.overlap_thresh, map_type=self.map_type, 
is_bbox_normalized=False, evaluate_difficult=self.evaluate_difficult, catid2name=self.catid2name, classwise=self.classwise) self.reset() def reset(self): self.results = [] self.detection_map.reset() def update(self, inputs, outputs): outs = {} # outputs Tensor -> numpy.ndarray for k, v in outputs.items(): outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v im_id = inputs['im_id'] im_id = im_id.numpy() if isinstance(im_id, paddle.Tensor) else im_id outs['im_id'] = im_id infer_results = get_infer_results(outs, self.clsid2catid) infer_results = infer_results['bbox'] if 'bbox' in infer_results else [] self.results += infer_results if self.save_prediction_only: return gt_boxes = inputs['gt_poly'] gt_labels = inputs['gt_class'] if 'scale_factor' in inputs: scale_factor = inputs['scale_factor'].numpy() if isinstance( inputs['scale_factor'], paddle.Tensor) else inputs['scale_factor'] else: scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32') for i in range(len(gt_boxes)): gt_box = gt_boxes[i].numpy() if isinstance( gt_boxes[i], paddle.Tensor) else gt_boxes[i] h, w = scale_factor[i] gt_box = gt_box / np.array([w, h, w, h, w, h, w, h]) gt_label = gt_labels[i].numpy() if isinstance( gt_labels[i], paddle.Tensor) else gt_labels[i] gt_box, gt_label, _ = prune_zero_padding(gt_box, gt_label) bbox = [ res['bbox'] for res in infer_results if int(res['image_id']) == int(im_id[i]) ] score = [ res['score'] for res in infer_results if int(res['image_id']) == int(im_id[i]) ] label = [ self.catid2clsid[int(res['category_id'])] for res in infer_results if int(res['image_id']) == int(im_id[i]) ] self.detection_map.update(bbox, score, label, gt_box, gt_label) def save_results(self, results, output_dir, imid2path): if imid2path: data_dicts = defaultdict(list) for result in results: image_id = result['image_id'] data_dicts[image_id].append(result) for image_id, image_path in imid2path.items(): basename = os.path.splitext(os.path.split(image_path)[-1])[0] output = os.path.join(output_dir, "{}.txt".format(basename)) dets = data_dicts.get(image_id, []) with open(output, 'w') as f: for det in dets: catid, bbox, score = det['category_id'], det[ 'bbox'], det['score'] bbox_pred = '{} {} '.format(self.catid2name[catid], score) + ' '.join( [str(e) for e in bbox]) f.write(bbox_pred + '\n') logger.info('The bbox result is saved to {}.'.format(output_dir)) else: output = os.path.join(output_dir, "bbox.json") with open(output, 'w') as f: json.dump(results, f) logger.info('The bbox result is saved to {}.'.format(output)) def accumulate(self): if self.output_eval: self.save_results(self.results, self.output_eval, self.imid2path) if not self.save_prediction_only: logger.info("Accumulating evaluatation results...") self.detection_map.accumulate() def log(self): map_stat = 100. 
* self.detection_map.get_map() logger.info("mAP({:.2f}, {}) = {:.2f}%".format(self.overlap_thresh, self.map_type, map_stat)) def get_results(self): return {'bbox': [self.detection_map.get_map()]} class SNIPERCOCOMetric(COCOMetric): def __init__(self, anno_file, **kwargs): super(SNIPERCOCOMetric, self).__init__(anno_file, **kwargs) self.dataset = kwargs["dataset"] self.chip_results = [] def reset(self): # only bbox and mask evaluation support currently self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []} self.eval_results = {} self.chip_results = [] def update(self, inputs, outputs): outs = {} # outputs Tensor -> numpy.ndarray for k, v in outputs.items(): outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v im_id = inputs['im_id'] outs['im_id'] = im_id.numpy() if isinstance(im_id, paddle.Tensor) else im_id self.chip_results.append(outs) def accumulate(self): results = self.dataset.anno_cropper.aggregate_chips_detections( self.chip_results) for outs in results: infer_results = get_infer_results( outs, self.clsid2catid, bias=self.bias) self.results['bbox'] += infer_results[ 'bbox'] if 'bbox' in infer_results else [] super(SNIPERCOCOMetric, self).accumulate() ================================================ FILE: ppdet/metrics/mot_metrics.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
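# ---------------------------------------------------------------------------
# NOTE (illustrative sketch, not part of the upstream file): JDEDetMetric
# below scores detections with ap_per_class from map_utils. A toy call with
# made-up numbers -- one correct and one spurious detection of a single
# class-0 ground-truth object:
#
#     from ppdet.metrics.map_utils import ap_per_class
#
#     ap, classes, r, p = ap_per_class(tp=[1, 0], conf=[0.9, 0.8],
#                                      pred_cls=[0, 0], target_cls=[0])
#     # ap[0] == 1.0: the highest-scored detection is the true positive,
#     # so the precision envelope at recall 1.0 is still 1.0.
# ---------------------------------------------------------------------------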
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import copy import sys import math from collections import defaultdict import numpy as np from ppdet.modeling.bbox_utils import bbox_iou_np_expand from .map_utils import ap_per_class from .metrics import Metric from .munkres import Munkres try: import motmetrics as mm mm.lap.default_solver = 'lap' except: print( 'Warning: Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' ) pass from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = ['MOTEvaluator', 'MOTMetric', 'JDEDetMetric', 'KITTIMOTMetric'] def read_mot_results(filename, is_gt=False, is_ignore=False): valid_label = [1] ignore_labels = [2, 7, 8, 12] # only in motchallenge datasets like 'MOT16' if is_gt: logger.info( "In the MOT16/17 dataset the valid_label of ground truth is '{}', " "in other datasets it should be '0' for single-class MOT.".format( valid_label[0])) results_dict = dict() if os.path.isfile(filename): with open(filename, 'r') as f: for line in f.readlines(): linelist = line.split(',') if len(linelist) < 7: continue fid = int(linelist[0]) if fid < 1: continue results_dict.setdefault(fid, list()) if is_gt: label = int(float(linelist[7])) mark = int(float(linelist[6])) if mark == 0 or label not in valid_label: continue score = 1 elif is_ignore: if 'MOT16-' in filename or 'MOT17-' in filename or 'MOT15-' in filename or 'MOT20-' in filename: label = int(float(linelist[7])) vis_ratio = float(linelist[8]) if label not in ignore_labels and vis_ratio >= 0: continue else: continue score = 1 else: score = float(linelist[6]) tlwh = tuple(map(float, linelist[2:6])) target_id = int(linelist[1]) results_dict[fid].append((tlwh, target_id, score)) return results_dict """ MOT dataset label list, see in https://motchallenge.net labels={'ped', ... % 1 'person_on_vhcl', ... % 2 'car', ... % 3 'bicycle', ... % 4 'mbike', ... % 5 'non_mot_vhcl', ... % 6 'static_person', ... % 7 'distractor', ... % 8 'occluder', ... % 9 'occluder_on_grnd', ... % 10 'occluder_full', ... % 11 'reflection', ... % 12 'crowd' ... % 13 }; """ def unzip_objs(objs): if len(objs) > 0: tlwhs, ids, scores = zip(*objs) else: tlwhs, ids, scores = [], [], [] tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) return tlwhs, ids, scores class MOTEvaluator(object): def __init__(self, data_root, seq_name, data_type): self.data_root = data_root self.seq_name = seq_name self.data_type = data_type self.load_annotations() try: import motmetrics as mm mm.lap.default_solver = 'lap' except Exception as e: raise RuntimeError( 'Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' ) self.reset_accumulator() def load_annotations(self): assert self.data_type == 'mot' gt_filename = os.path.join(self.data_root, self.seq_name, 'gt', 'gt.txt') if not os.path.exists(gt_filename): logger.warning( "gt_filename '{}' of MOTEvaluator does not exist, so the MOTA will be -INF.".format(gt_filename
) self.gt_frame_dict = read_mot_results(gt_filename, is_gt=True) self.gt_ignore_frame_dict = read_mot_results( gt_filename, is_ignore=True) def reset_accumulator(self): self.acc = mm.MOTAccumulator(auto_id=True) def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False): # results trk_tlwhs = np.copy(trk_tlwhs) trk_ids = np.copy(trk_ids) # gts gt_objs = self.gt_frame_dict.get(frame_id, []) gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2] # ignore boxes ignore_objs = self.gt_ignore_frame_dict.get(frame_id, []) ignore_tlwhs = unzip_objs(ignore_objs)[0] # remove ignored results keep = np.ones(len(trk_tlwhs), dtype=bool) iou_distance = mm.distances.iou_matrix( ignore_tlwhs, trk_tlwhs, max_iou=0.5) if len(iou_distance) > 0: match_is, match_js = mm.lap.linear_sum_assignment(iou_distance) match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js]) match_ious = iou_distance[match_is, match_js] match_js = np.asarray(match_js, dtype=int) match_js = match_js[np.logical_not(np.isnan(match_ious))] keep[match_js] = False trk_tlwhs = trk_tlwhs[keep] trk_ids = trk_ids[keep] # get distance matrix iou_distance = mm.distances.iou_matrix(gt_tlwhs, trk_tlwhs, max_iou=0.5) # acc self.acc.update(gt_ids, trk_ids, iou_distance) if rtn_events and iou_distance.size > 0 and hasattr(self.acc, 'last_mot_events'): events = self.acc.last_mot_events # only supported by https://github.com/longcw/py-motmetrics else: events = None return events def eval_file(self, filename): self.reset_accumulator() result_frame_dict = read_mot_results(filename, is_gt=False) frames = sorted(list(set(result_frame_dict.keys()))) for frame_id in frames: trk_objs = result_frame_dict.get(frame_id, []) trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2] self.eval_frame(frame_id, trk_tlwhs, trk_ids, rtn_events=False) return self.acc @staticmethod def get_summary(accs, names, metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1', 'precision', 'recall')): names = copy.deepcopy(names) if metrics is None: metrics = mm.metrics.motchallenge_metrics metrics = copy.deepcopy(metrics) mh = mm.metrics.create() summary = mh.compute_many( accs, metrics=metrics, names=names, generate_overall=True) return summary @staticmethod def save_summary(summary, filename): import pandas as pd writer = pd.ExcelWriter(filename) summary.to_excel(writer) writer.save() class MOTMetric(Metric): def __init__(self, save_summary=False): self.save_summary = save_summary self.MOTEvaluator = MOTEvaluator self.result_root = None self.reset() def reset(self): self.accs = [] self.seqs = [] def update(self, data_root, seq, data_type, result_root, result_filename): evaluator = self.MOTEvaluator(data_root, seq, data_type) self.accs.append(evaluator.eval_file(result_filename)) self.seqs.append(seq) self.result_root = result_root def accumulate(self): metrics = mm.metrics.motchallenge_metrics mh = mm.metrics.create() summary = self.MOTEvaluator.get_summary(self.accs, self.seqs, metrics) self.strsummary = mm.io.render_summary( summary, formatters=mh.formatters, namemap=mm.io.motchallenge_metric_names) if self.save_summary: self.MOTEvaluator.save_summary( summary, os.path.join(self.result_root, 'summary.xlsx')) def log(self): print(self.strsummary) def get_results(self): return self.strsummary class JDEDetMetric(Metric): # Note this detection AP metric is different from COCOMetric or VOCMetric, # and the bboxes coordinates are not scaled to the original image def __init__(self, overlap_thresh=0.5): self.overlap_thresh = overlap_thresh self.reset() def reset(self): 
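        # NOTE (added comment): running sums across update() calls. AP_accum holds the summed per-class AP values and AP_accum_count the number of contributions, so accumulate() can take their ratio; JDE detection is single-class here, hence arrays of size 1.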
self.AP_accum = np.zeros(1) self.AP_accum_count = np.zeros(1) def update(self, inputs, outputs): bboxes = outputs['bbox'][:, 2:].numpy() scores = outputs['bbox'][:, 1].numpy() labels = outputs['bbox'][:, 0].numpy() bbox_lengths = outputs['bbox_num'].numpy() if bboxes.shape[0] == 1 and bboxes.sum() == 0.0: return gt_boxes = inputs['gt_bbox'].numpy()[0] gt_labels = inputs['gt_class'].numpy()[0] if gt_labels.shape[0] == 0: return correct = [] detected = [] for i in range(bboxes.shape[0]): obj_pred = 0 pred_bbox = bboxes[i].reshape(1, 4) # Compute iou with target boxes iou = bbox_iou_np_expand(pred_bbox, gt_boxes, x1y1x2y2=True)[0] # Extract index of largest overlap best_i = np.argmax(iou) # If overlap exceeds threshold and classification is correct mark as correct if iou[best_i] > self.overlap_thresh and obj_pred == gt_labels[ best_i] and best_i not in detected: correct.append(1) detected.append(best_i) else: correct.append(0) # Compute Average Precision (AP) per class target_cls = list(gt_labels.T[0]) AP, AP_class, R, P = ap_per_class( tp=correct, conf=scores, pred_cls=np.zeros_like(scores), target_cls=target_cls) self.AP_accum_count += np.bincount(AP_class, minlength=1) self.AP_accum += np.bincount(AP_class, minlength=1, weights=AP) def accumulate(self): logger.info("Accumulating evaluatation results...") self.map_stat = self.AP_accum[0] / (self.AP_accum_count[0] + 1E-16) def log(self): map_stat = 100. * self.map_stat logger.info("mAP({:.2f}) = {:.2f}%".format(self.overlap_thresh, map_stat)) def get_results(self): return self.map_stat """ Following code is borrow from https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/evaluate_tracking.py """ class tData: """ Utility class to load data. """ def __init__(self,frame=-1,obj_type="unset",truncation=-1,occlusion=-1,\ obs_angle=-10,x1=-1,y1=-1,x2=-1,y2=-1,w=-1,h=-1,l=-1,\ X=-1000,Y=-1000,Z=-1000,yaw=-10,score=-1000,track_id=-1): """ Constructor, initializes the object given the parameters. 
""" self.frame = frame self.track_id = track_id self.obj_type = obj_type self.truncation = truncation self.occlusion = occlusion self.obs_angle = obs_angle self.x1 = x1 self.y1 = y1 self.x2 = x2 self.y2 = y2 self.w = w self.h = h self.l = l self.X = X self.Y = Y self.Z = Z self.yaw = yaw self.score = score self.ignored = False self.valid = False self.tracker = -1 def __str__(self): attrs = vars(self) return '\n'.join("%s: %s" % item for item in attrs.items()) class KITTIEvaluation(object): """ KITTI tracking statistics (CLEAR MOT, id-switches, fragments, ML/PT/MT, precision/recall) MOTA - Multi-object tracking accuracy in [0,100] MOTP - Multi-object tracking precision in [0,100] (3D) / [td,100] (2D) MOTAL - Multi-object tracking accuracy in [0,100] with log10(id-switches) id-switches - number of id switches fragments - number of fragmentations MT, PT, ML - number of mostly tracked, partially tracked and mostly lost trajectories recall - recall = percentage of detected targets precision - precision = percentage of correctly detected targets FAR - number of false alarms per frame falsepositives - number of false positives (FP) missed - number of missed targets (FN) """ def __init__(self, result_path, gt_path, min_overlap=0.5, max_truncation = 0,\ min_height = 25, max_occlusion = 2, cls="car",\ n_frames=[], seqs=[], n_sequences=0): # get number of sequences and # get number of frames per sequence from test mapping # (created while extracting the benchmark) self.gt_path = os.path.join(gt_path, "../labels") self.n_frames = n_frames self.sequence_name = seqs self.n_sequences = n_sequences self.cls = cls # class to evaluate, i.e. pedestrian or car self.result_path = result_path # statistics and numbers for evaluation self.n_gt = 0 # number of ground truth detections minus ignored false negatives and true positives self.n_igt = 0 # number of ignored ground truth detections self.n_gts = [ ] # number of ground truth detections minus ignored false negatives and true positives PER SEQUENCE self.n_igts = [ ] # number of ground ignored truth detections PER SEQUENCE self.n_gt_trajectories = 0 self.n_gt_seq = [] self.n_tr = 0 # number of tracker detections minus ignored tracker detections self.n_trs = [ ] # number of tracker detections minus ignored tracker detections PER SEQUENCE self.n_itr = 0 # number of ignored tracker detections self.n_itrs = [] # number of ignored tracker detections PER SEQUENCE self.n_igttr = 0 # number of ignored ground truth detections where the corresponding associated tracker detection is also ignored self.n_tr_trajectories = 0 self.n_tr_seq = [] self.MOTA = 0 self.MOTP = 0 self.MOTAL = 0 self.MODA = 0 self.MODP = 0 self.MODP_t = [] self.recall = 0 self.precision = 0 self.F1 = 0 self.FAR = 0 self.total_cost = 0 self.itp = 0 # number of ignored true positives self.itps = [] # number of ignored true positives PER SEQUENCE self.tp = 0 # number of true positives including ignored true positives! 
self.tps = [ ] # number of true positives including ignored true positives PER SEQUENCE self.fn = 0 # number of false negatives WITHOUT ignored false negatives self.fns = [ ] # number of false negatives WITHOUT ignored false negatives PER SEQUENCE self.ifn = 0 # number of ignored false negatives self.ifns = [] # number of ignored false negatives PER SEQUENCE self.fp = 0 # number of false positives # a bit tricky, the number of ignored false negatives and ignored true positives # is subtracted, but if both tracker detection and ground truth detection # are ignored this number is added again to avoid double counting self.fps = [] # above PER SEQUENCE self.mme = 0 self.fragments = 0 self.id_switches = 0 self.MT = 0 self.PT = 0 self.ML = 0 self.min_overlap = min_overlap # minimum bounding box overlap for 3rd party metrics self.max_truncation = max_truncation # maximum truncation of an object for evaluation self.max_occlusion = max_occlusion # maximum occlusion of an object for evaluation self.min_height = min_height # minimum height of an object for evaluation self.n_sample_points = 500 # this should be enough to hold all groundtruth trajectories # is expanded if necessary and reduced in any case self.gt_trajectories = [[] for x in range(self.n_sequences)] self.ign_trajectories = [[] for x in range(self.n_sequences)] def loadGroundtruth(self): try: self._loadData(self.gt_path, cls=self.cls, loading_groundtruth=True) except IOError: return False return True def loadTracker(self): try: if not self._loadData( self.result_path, cls=self.cls, loading_groundtruth=False): return False except IOError: return False return True def _loadData(self, root_dir, cls, min_score=-1000, loading_groundtruth=False): """ Generic loader for ground truth and tracking data. Use loadGroundtruth() or loadTracker() to load this data. Loads detections in KITTI format from textfiles. """ # construct objectDetections object to hold detection data t_data = tData() data = [] eval_2d = True eval_3d = True seq_data = [] n_trajectories = 0 n_trajectories_seq = [] for seq, s_name in enumerate(self.sequence_name): i = 0 filename = os.path.join(root_dir, "%s.txt" % s_name) f = open(filename, "r") f_data = [ [] for x in range(self.n_frames[seq]) ] # current set has only 1059 entries, sufficient length is checked anyway ids = [] n_in_seq = 0 id_frame_cache = [] for line in f: # KITTI tracking benchmark data format: # (frame,tracklet_id,objectType,truncation,occlusion,alpha,x1,y1,x2,y2,h,w,l,X,Y,Z,ry) line = line.strip() fields = line.split(" ") # classes that should be loaded (ignored neighboring classes) if "car" in cls.lower(): classes = ["car", "van"] elif "pedestrian" in cls.lower(): classes = ["pedestrian", "person_sitting"] else: classes = [cls.lower()] classes += ["dontcare"] if not any([s for s in classes if s in fields[2].lower()]): continue # get fields from table t_data.frame = int(float(fields[0])) # frame t_data.track_id = int(float(fields[1])) # id t_data.obj_type = fields[ 2].lower() # object type [car, pedestrian, cyclist, ...] 
t_data.truncation = int( float(fields[3])) # truncation [-1,0,1,2] t_data.occlusion = int( float(fields[4])) # occlusion [-1,0,1,2] t_data.obs_angle = float(fields[5]) # observation angle [rad] t_data.x1 = float(fields[6]) # left [px] t_data.y1 = float(fields[7]) # top [px] t_data.x2 = float(fields[8]) # right [px] t_data.y2 = float(fields[9]) # bottom [px] t_data.h = float(fields[10]) # height [m] t_data.w = float(fields[11]) # width [m] t_data.l = float(fields[12]) # length [m] t_data.X = float(fields[13]) # X [m] t_data.Y = float(fields[14]) # Y [m] t_data.Z = float(fields[15]) # Z [m] t_data.yaw = float(fields[16]) # yaw angle [rad] if not loading_groundtruth: if len(fields) == 17: t_data.score = -1 elif len(fields) == 18: t_data.score = float(fields[17]) # detection score else: logger.info("file is not in KITTI format") return # do not consider objects marked as invalid if t_data.track_id is -1 and t_data.obj_type != "dontcare": continue idx = t_data.frame # check if length for frame data is sufficient if idx >= len(f_data): print("extend f_data", idx, len(f_data)) f_data += [[] for x in range(max(500, idx - len(f_data)))] try: id_frame = (t_data.frame, t_data.track_id) if id_frame in id_frame_cache and not loading_groundtruth: logger.info( "track ids are not unique for sequence %d: frame %d" % (seq, t_data.frame)) logger.info( "track id %d occurred at least twice for this frame" % t_data.track_id) logger.info("Exiting...") #continue # this allows to evaluate non-unique result files return False id_frame_cache.append(id_frame) f_data[t_data.frame].append(copy.copy(t_data)) except: print(len(f_data), idx) raise if t_data.track_id not in ids and t_data.obj_type != "dontcare": ids.append(t_data.track_id) n_trajectories += 1 n_in_seq += 1 # check if uploaded data provides information for 2D and 3D evaluation if not loading_groundtruth and eval_2d is True and ( t_data.x1 == -1 or t_data.x2 == -1 or t_data.y1 == -1 or t_data.y2 == -1): eval_2d = False if not loading_groundtruth and eval_3d is True and ( t_data.X == -1000 or t_data.Y == -1000 or t_data.Z == -1000): eval_3d = False # only add existing frames n_trajectories_seq.append(n_in_seq) seq_data.append(f_data) f.close() if not loading_groundtruth: self.tracker = seq_data self.n_tr_trajectories = n_trajectories self.eval_2d = eval_2d self.eval_3d = eval_3d self.n_tr_seq = n_trajectories_seq if self.n_tr_trajectories == 0: return False else: # split ground truth and DontCare areas self.dcareas = [] self.groundtruth = [] for seq_idx in range(len(seq_data)): seq_gt = seq_data[seq_idx] s_g, s_dc = [], [] for f in range(len(seq_gt)): all_gt = seq_gt[f] g, dc = [], [] for gg in all_gt: if gg.obj_type == "dontcare": dc.append(gg) else: g.append(gg) s_g.append(g) s_dc.append(dc) self.dcareas.append(s_dc) self.groundtruth.append(s_g) self.n_gt_seq = n_trajectories_seq self.n_gt_trajectories = n_trajectories return True def boxoverlap(self, a, b, criterion="union"): """ boxoverlap computes intersection over union for bbox a and b in KITTI format. If the criterion is 'union', overlap = (a inter b) / a union b). If the criterion is 'a', overlap = (a inter b) / a, where b should be a dontcare area. """ x1 = max(a.x1, b.x1) y1 = max(a.y1, b.y1) x2 = min(a.x2, b.x2) y2 = min(a.y2, b.y2) w = x2 - x1 h = y2 - y1 if w <= 0. or h <= 0.: return 0. 
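        # NOTE (added comment): boxes overlap past this point. For example, two 10x10 boxes offset by one pixel in x and y give inter = 9 * 9 = 81, union = 100 + 100 - 81 = 119, IoU ~= 0.68.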
    def boxoverlap(self, a, b, criterion="union"):
        """
            boxoverlap computes intersection over union for bbox a and b in KITTI format.
            If the criterion is 'union', overlap = (a inter b) / (a union b).
            If the criterion is 'a', overlap = (a inter b) / a, where b should be a dontcare area.
        """
        x1 = max(a.x1, b.x1)
        y1 = max(a.y1, b.y1)
        x2 = min(a.x2, b.x2)
        y2 = min(a.y2, b.y2)
        w = x2 - x1
        h = y2 - y1
        if w <= 0. or h <= 0.:
            return 0.
        inter = w * h
        aarea = (a.x2 - a.x1) * (a.y2 - a.y1)
        barea = (b.x2 - b.x1) * (b.y2 - b.y1)
        # intersection over union overlap
        if criterion.lower() == "union":
            o = inter / float(aarea + barea - inter)
        elif criterion.lower() == "a":
            o = float(inter) / float(aarea)
        else:
            raise TypeError("Unknown type for criterion")
        return o

    def compute3rdPartyMetrics(self):
        """
            Computes the metrics defined in
                - Stiefelhagen 2008: Evaluating Multiple Object Tracking Performance:
                  The CLEAR MOT Metrics (MOTA, MOTAL, MOTP)
                - Nevatia 2008: Global Data Association for Multi-Object Tracking
                  Using Network Flows (MT/PT/ML)
        """
        # construct Munkres object for Hungarian Method association
        hm = Munkres()
        max_cost = 1e9

        # go through all frames and associate ground truth and tracker results
        # groundtruth and tracker contain lists for every single frame containing lists of KITTI format detections
        fr, ids = 0, 0
        for seq_idx in range(len(self.groundtruth)):
            seq_gt = self.groundtruth[seq_idx]
            seq_dc = self.dcareas[seq_idx]  # don't care areas
            seq_tracker = self.tracker[seq_idx]
            seq_trajectories = defaultdict(list)
            seq_ignored = defaultdict(list)

            # statistics over the current sequence, check the corresponding
            # variable comments in __init__ to get their meaning
            seqtp = 0
            seqitp = 0
            seqfn = 0
            seqifn = 0
            seqfp = 0
            seqigt = 0
            seqitr = 0

            last_ids = [[], []]
            n_gts = 0
            n_trs = 0
            for f in range(len(seq_gt)):
                g = seq_gt[f]
                dc = seq_dc[f]
                t = seq_tracker[f]
                # counting total number of ground truth and tracker objects
                self.n_gt += len(g)
                self.n_tr += len(t)
                n_gts += len(g)
                n_trs += len(t)

                # use hungarian method to associate, using boxoverlap 0..1 as cost
                # build cost matrix
                cost_matrix = []
                this_ids = [[], []]
                for gg in g:
                    # save current ids
                    this_ids[0].append(gg.track_id)
                    this_ids[1].append(-1)
                    gg.tracker = -1
                    gg.id_switch = 0
                    gg.fragmentation = 0
                    cost_row = []
                    for tt in t:
                        # overlap == 1 is cost == 0
                        c = 1 - self.boxoverlap(gg, tt)
                        # gating for boxoverlap
                        if c <= self.min_overlap:
                            cost_row.append(c)
                        else:
                            cost_row.append(max_cost)  # = 1e9
                    cost_matrix.append(cost_row)
                    # all ground truth trajectories are initially not associated
                    # extend groundtruth trajectories lists (merge lists)
                    seq_trajectories[gg.track_id].append(-1)
                    seq_ignored[gg.track_id].append(False)
                if len(g) == 0:
                    cost_matrix = [[]]
                # associate
                association_matrix = hm.compute(cost_matrix)

                # tmp variables for sanity checks and MODP computation
                tmptp = 0
                tmpfp = 0
                tmpfn = 0
                tmpc = 0  # this will sum up the overlaps for all true positives
                tmpcs = [0] * len(g)  # this will save the overlaps for all true positives
                # the reason is that some true positives might be ignored
                # later such that the corresponding overlaps can
                # be subtracted from tmpc for MODP computation

                # mapping for tracker ids and ground truth ids
                for row, col in association_matrix:
                    # apply gating on boxoverlap
                    c = cost_matrix[row][col]
                    if c < max_cost:
                        g[row].tracker = t[col].track_id
                        this_ids[1][row] = t[col].track_id
                        t[col].valid = True
                        g[row].distance = c
                        self.total_cost += 1 - c
                        tmpc += 1 - c
                        tmpcs[row] = 1 - c
                        seq_trajectories[g[row].track_id][-1] = t[col].track_id
                        # true positives are only valid associations
                        self.tp += 1
                        tmptp += 1
                    else:
                        g[row].tracker = -1
                        self.fn += 1
                        tmpfn += 1

                # associate tracker and DontCare areas
                # ignore tracker in neighboring classes
                nignoredtracker = 0  # number of ignored tracker detections
                ignoredtrackers = dict()  # will associate the track_id with -1
                # if it is not ignored and 1 if it is ignored;
                # this is used to avoid double counting ignored
                # cases, see the next loop
                for tt in t:
                    ignoredtrackers[tt.track_id] = -1
                    # ignore detection if it belongs to a neighboring class or is
                    # smaller or equal to the minimum height
                    tt_height = abs(tt.y1 - tt.y2)
                    if ((self.cls == "car" and tt.obj_type == "van") or
                        (self.cls == "pedestrian" and
                         tt.obj_type == "person_sitting") or
                            tt_height <= self.min_height) and not tt.valid:
                        nignoredtracker += 1
                        tt.ignored = True
                        ignoredtrackers[tt.track_id] = 1
                        continue
                    for d in dc:
                        overlap = self.boxoverlap(tt, d, "a")
                        if overlap > 0.5 and not tt.valid:
                            tt.ignored = True
                            nignoredtracker += 1
                            ignoredtrackers[tt.track_id] = 1
                            break

                # check for ignored FN/TP (truncation or neighboring object class)
                ignoredfn = 0  # the number of ignored false negatives
                nignoredtp = 0  # the number of ignored true positives
                nignoredpairs = 0  # the number of ignored pairs, i.e. a true positive
                # which is ignored but where the associated tracker
                # detection has already been ignored
                gi = 0
                for gg in g:
                    if gg.tracker < 0:
                        if gg.occlusion > self.max_occlusion or gg.truncation > self.max_truncation \
                                or (self.cls == "car" and gg.obj_type == "van") \
                                or (self.cls == "pedestrian" and gg.obj_type == "person_sitting"):
                            seq_ignored[gg.track_id][-1] = True
                            gg.ignored = True
                            ignoredfn += 1
                    elif gg.tracker >= 0:
                        if gg.occlusion > self.max_occlusion or gg.truncation > self.max_truncation \
                                or (self.cls == "car" and gg.obj_type == "van") \
                                or (self.cls == "pedestrian" and gg.obj_type == "person_sitting"):
                            seq_ignored[gg.track_id][-1] = True
                            gg.ignored = True
                            nignoredtp += 1
                            # if the associated tracker detection is already ignored,
                            # we want to avoid double counting ignored detections
                            if ignoredtrackers[gg.tracker] > 0:
                                nignoredpairs += 1
                            # for computing MODP, the overlaps from ignored detections
                            # are subtracted
                            tmpc -= tmpcs[gi]
                    gi += 1

                # the below might be confusing, check the comments in __init__
                # to see what the individual statistics represent

                # correct TP by number of ignored TP due to truncation
                # ignored TP are shown as tracked in visualization
                tmptp -= nignoredtp
                # count the number of ignored true positives
                self.itp += nignoredtp
                # adjust the number of ground truth objects considered
                self.n_gt -= (ignoredfn + nignoredtp)
                # count the number of ignored ground truth objects
                self.n_igt += ignoredfn + nignoredtp
                # count the number of ignored tracker objects
                self.n_itr += nignoredtracker
                # count the number of ignored pairs, i.e. associated tracker and
                # ground truth objects that are both ignored
                self.n_igttr += nignoredpairs

                # false negatives = associated gt bboxes exceeding association threshold + non-associated gt bboxes
                tmpfn += len(g) - len(association_matrix) - ignoredfn
                self.fn += len(g) - len(association_matrix) - ignoredfn
                self.ifn += ignoredfn
                # false positives = tracker bboxes - associated tracker bboxes
                # mismatches (mme_t)
                tmpfp += len(t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs
                self.fp += len(t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs

                # update sequence data
                seqtp += tmptp
                seqitp += nignoredtp
                seqfp += tmpfp
                seqfn += tmpfn
                seqifn += ignoredfn
                seqigt += ignoredfn + nignoredtp
                seqitr += nignoredtracker

                # sanity checks
                # - the number of true positives minus ignored true positives
                #   should be greater or equal to 0
                # - the number of false negatives should be greater or equal to 0
                # - the number of false positives needs to be greater or equal to 0
                #   otherwise ignored detections might be counted double
                # - the number of counted true positives (plus ignored ones)
                #   and the number of counted false negatives (plus ignored ones)
                #   should match the total number of ground truth objects
                # - the number of counted true positives (plus ignored ones)
                #   and the number of counted false positives
                #   plus the number of ignored tracker detections should
                #   match the total number of tracker detections; note that
                #   nignoredpairs is subtracted here to avoid double counting
                #   of ignored detections in nignoredtp and nignoredtracker
                if tmptp < 0:
                    print(tmptp, nignoredtp)
                    raise NameError("Something went wrong! TP is negative")
                if tmpfn < 0:
                    print(tmpfn, len(g), len(association_matrix), ignoredfn,
                          nignoredpairs)
                    raise NameError("Something went wrong! FN is negative")
                if tmpfp < 0:
                    print(tmpfp, len(t), tmptp, nignoredtracker, nignoredtp,
                          nignoredpairs)
                    raise NameError("Something went wrong! FP is negative")
                if tmptp + tmpfn != len(g) - ignoredfn - nignoredtp:
                    print("seqidx", seq_idx)
                    print("frame ", f)
                    print("TP    ", tmptp)
                    print("FN    ", tmpfn)
                    print("FP    ", tmpfp)
                    print("nGT   ", len(g))
                    print("nAss  ", len(association_matrix))
                    print("ign GT", ignoredfn)
                    print("ign TP", nignoredtp)
                    raise NameError(
                        "Something went wrong! nGroundtruth is not TP+FN")
                if tmptp + tmpfp + nignoredtp + nignoredtracker - nignoredpairs != len(t):
                    print(seq_idx, f, len(t), tmptp, tmpfp)
                    print(len(association_matrix), association_matrix)
                    raise NameError(
                        "Something went wrong! nTracker is not TP+FP")

                # check for id switches or fragmentations
                for i, tt in enumerate(this_ids[0]):
                    if tt in last_ids[0]:
                        idx = last_ids[0].index(tt)
                        tid = this_ids[1][i]
                        lid = last_ids[1][idx]
                        if tid != lid and lid != -1 and tid != -1:
                            if g[i].truncation < self.max_truncation:
                                g[i].id_switch = 1
                                ids += 1
                        if tid != lid and lid != -1:
                            if g[i].truncation < self.max_truncation:
                                g[i].fragmentation = 1
                                fr += 1

                # save current index
                last_ids = this_ids
                # compute MOTP_t
                MODP_t = 1
                if tmptp != 0:
                    MODP_t = tmpc / float(tmptp)
                self.MODP_t.append(MODP_t)

            # remove empty lists for current gt trajectories
            self.gt_trajectories[seq_idx] = seq_trajectories
            self.ign_trajectories[seq_idx] = seq_ignored

            # gather statistics for "per sequence" statistics.
self.n_gts.append(n_gts) self.n_trs.append(n_trs) self.tps.append(seqtp) self.itps.append(seqitp) self.fps.append(seqfp) self.fns.append(seqfn) self.ifns.append(seqifn) self.n_igts.append(seqigt) self.n_itrs.append(seqitr) # compute MT/PT/ML, fragments, idswitches for all groundtruth trajectories n_ignored_tr_total = 0 for seq_idx, ( seq_trajectories, seq_ignored ) in enumerate(zip(self.gt_trajectories, self.ign_trajectories)): if len(seq_trajectories) == 0: continue tmpMT, tmpML, tmpPT, tmpId_switches, tmpFragments = [0] * 5 n_ignored_tr = 0 for g, ign_g in zip(seq_trajectories.values(), seq_ignored.values()): # all frames of this gt trajectory are ignored if all(ign_g): n_ignored_tr += 1 n_ignored_tr_total += 1 continue # all frames of this gt trajectory are not assigned to any detections if all([this == -1 for this in g]): tmpML += 1 self.ML += 1 continue # compute tracked frames in trajectory last_id = g[0] # first detection (necessary to be in gt_trajectories) is always tracked tracked = 1 if g[0] >= 0 else 0 lgt = 0 if ign_g[0] else 1 for f in range(1, len(g)): if ign_g[f]: last_id = -1 continue lgt += 1 if last_id != g[f] and last_id != -1 and g[f] != -1 and g[ f - 1] != -1: tmpId_switches += 1 self.id_switches += 1 if f < len(g) - 1 and g[f - 1] != g[ f] and last_id != -1 and g[f] != -1 and g[f + 1] != -1: tmpFragments += 1 self.fragments += 1 if g[f] != -1: tracked += 1 last_id = g[f] # handle last frame; tracked state is handled in for loop (g[f]!=-1) if len(g) > 1 and g[f - 1] != g[f] and last_id != -1 and g[ f] != -1 and not ign_g[f]: tmpFragments += 1 self.fragments += 1 # compute MT/PT/ML tracking_ratio = tracked / float(len(g) - sum(ign_g)) if tracking_ratio > 0.8: tmpMT += 1 self.MT += 1 elif tracking_ratio < 0.2: tmpML += 1 self.ML += 1 else: # 0.2 <= tracking_ratio <= 0.8 tmpPT += 1 self.PT += 1 if (self.n_gt_trajectories - n_ignored_tr_total) == 0: self.MT = 0. self.PT = 0. self.ML = 0. else: self.MT /= float(self.n_gt_trajectories - n_ignored_tr_total) self.PT /= float(self.n_gt_trajectories - n_ignored_tr_total) self.ML /= float(self.n_gt_trajectories - n_ignored_tr_total) # precision/recall etc. if (self.fp + self.tp) == 0 or (self.tp + self.fn) == 0: self.recall = 0. self.precision = 0. else: self.recall = self.tp / float(self.tp + self.fn) self.precision = self.tp / float(self.fp + self.tp) if (self.recall + self.precision) == 0: self.F1 = 0. else: self.F1 = 2. 
* (self.precision * self.recall) / ( self.precision + self.recall) if sum(self.n_frames) == 0: self.FAR = "n/a" else: self.FAR = self.fp / float(sum(self.n_frames)) # compute CLEARMOT if self.n_gt == 0: self.MOTA = -float("inf") self.MODA = -float("inf") else: self.MOTA = 1 - (self.fn + self.fp + self.id_switches ) / float(self.n_gt) self.MODA = 1 - (self.fn + self.fp) / float(self.n_gt) if self.tp == 0: self.MOTP = float("inf") else: self.MOTP = self.total_cost / float(self.tp) if self.n_gt != 0: if self.id_switches == 0: self.MOTAL = 1 - (self.fn + self.fp + self.id_switches ) / float(self.n_gt) else: self.MOTAL = 1 - (self.fn + self.fp + math.log10(self.id_switches) ) / float(self.n_gt) else: self.MOTAL = -float("inf") if sum(self.n_frames) == 0: self.MODP = "n/a" else: self.MODP = sum(self.MODP_t) / float(sum(self.n_frames)) return True def createSummary(self): summary = "" summary += "tracking evaluation summary".center(80, "=") + "\n" summary += self.printEntry("Multiple Object Tracking Accuracy (MOTA)", self.MOTA) + "\n" summary += self.printEntry("Multiple Object Tracking Precision (MOTP)", self.MOTP) + "\n" summary += self.printEntry("Multiple Object Tracking Accuracy (MOTAL)", self.MOTAL) + "\n" summary += self.printEntry("Multiple Object Detection Accuracy (MODA)", self.MODA) + "\n" summary += self.printEntry("Multiple Object Detection Precision (MODP)", self.MODP) + "\n" summary += "\n" summary += self.printEntry("Recall", self.recall) + "\n" summary += self.printEntry("Precision", self.precision) + "\n" summary += self.printEntry("F1", self.F1) + "\n" summary += self.printEntry("False Alarm Rate", self.FAR) + "\n" summary += "\n" summary += self.printEntry("Mostly Tracked", self.MT) + "\n" summary += self.printEntry("Partly Tracked", self.PT) + "\n" summary += self.printEntry("Mostly Lost", self.ML) + "\n" summary += "\n" summary += self.printEntry("True Positives", self.tp) + "\n" #summary += self.printEntry("True Positives per Sequence", self.tps) + "\n" summary += self.printEntry("Ignored True Positives", self.itp) + "\n" #summary += self.printEntry("Ignored True Positives per Sequence", self.itps) + "\n" summary += self.printEntry("False Positives", self.fp) + "\n" #summary += self.printEntry("False Positives per Sequence", self.fps) + "\n" summary += self.printEntry("False Negatives", self.fn) + "\n" #summary += self.printEntry("False Negatives per Sequence", self.fns) + "\n" summary += self.printEntry("ID-switches", self.id_switches) + "\n" self.fp = self.fp / self.n_gt self.fn = self.fn / self.n_gt self.id_switches = self.id_switches / self.n_gt summary += self.printEntry("False Positives Ratio", self.fp) + "\n" #summary += self.printEntry("False Positives per Sequence", self.fps) + "\n" summary += self.printEntry("False Negatives Ratio", self.fn) + "\n" #summary += self.printEntry("False Negatives per Sequence", self.fns) + "\n" summary += self.printEntry("Ignored False Negatives Ratio", self.ifn) + "\n" #summary += self.printEntry("Ignored False Negatives per Sequence", self.ifns) + "\n" summary += self.printEntry("Missed Targets", self.fn) + "\n" summary += self.printEntry("ID-switches", self.id_switches) + "\n" summary += self.printEntry("Fragmentations", self.fragments) + "\n" summary += "\n" summary += self.printEntry("Ground Truth Objects (Total)", self.n_gt + self.n_igt) + "\n" #summary += self.printEntry("Ground Truth Objects (Total) per Sequence", self.n_gts) + "\n" summary += self.printEntry("Ignored Ground Truth Objects", self.n_igt) + "\n" #summary += 
self.printEntry("Ignored Ground Truth Objects per Sequence", self.n_igts) + "\n"
        summary += self.printEntry("Ground Truth Trajectories",
                                   self.n_gt_trajectories) + "\n"
        summary += "\n"
        summary += self.printEntry("Tracker Objects (Total)", self.n_tr) + "\n"
        #summary += self.printEntry("Tracker Objects (Total) per Sequence", self.n_trs) + "\n"
        summary += self.printEntry("Ignored Tracker Objects", self.n_itr) + "\n"
        #summary += self.printEntry("Ignored Tracker Objects per Sequence", self.n_itrs) + "\n"
        summary += self.printEntry("Tracker Trajectories",
                                   self.n_tr_trajectories) + "\n"
        #summary += "\n"
        #summary += self.printEntry("Ignored Tracker Objects with Associated Ignored Ground Truth Objects", self.n_igttr) + "\n"
        summary += "=" * 80
        return summary

    def printEntry(self, key, val, width=(70, 10)):
        """
            Pretty print an entry in a table fashion.
        """
        s_out = key.ljust(width[0])
        if type(val) == int:
            s = "%%%dd" % width[1]
            s_out += s % val
        elif type(val) == float:
            s = "%%%df" % (width[1])
            s_out += s % val
        else:
            s_out += ("%s" % val).rjust(width[1])
        return s_out

    def saveToStats(self, save_summary):
        """
            Save the statistics in a whitespace-separated file.
        """
        summary = self.createSummary()
        if save_summary:
            filename = os.path.join(self.result_path,
                                    "summary_%s.txt" % self.cls)
            dump = open(filename, "w+")
            dump.write(summary)
            dump.close()
        return summary


class KITTIMOTMetric(Metric):
    def __init__(self, save_summary=True):
        self.save_summary = save_summary
        self.MOTEvaluator = KITTIEvaluation
        self.result_root = None
        self.reset()

    def reset(self):
        self.seqs = []
        self.n_sequences = 0
        self.n_frames = []
        self.strsummary = ''

    def update(self, data_root, seq, data_type, result_root, result_filename):
        assert data_type == 'kitti', "data_type should be 'kitti'"
        self.result_root = result_root
        self.gt_path = data_root
        gt_path = '{}/../labels/{}.txt'.format(data_root, seq)
        gt = open(gt_path, "r")
        max_frame = 0
        for line in gt:
            line = line.strip()
            line_list = line.split(" ")
            if int(line_list[0]) > max_frame:
                max_frame = int(line_list[0])
        rs = open(result_filename, "r")
        for line in rs:
            line = line.strip()
            line_list = line.split(" ")
            if int(line_list[0]) > max_frame:
                max_frame = int(line_list[0])
        gt.close()
        rs.close()
        self.n_frames.append(max_frame + 1)
        self.seqs.append(seq)
        self.n_sequences += 1

    def accumulate(self):
        logger.info("Processing Result for KITTI Tracking Benchmark")
        e = self.MOTEvaluator(result_path=self.result_root, gt_path=self.gt_path, \
            n_frames=self.n_frames, seqs=self.seqs, n_sequences=self.n_sequences)
        try:
            if not e.loadTracker():
                return
            logger.info("Loading Results - Success")
            logger.info("Evaluate Object Class: %s" % e.cls.upper())
        except:
            logger.info("Caught exception while loading result data.")
        if not e.loadGroundtruth():
            raise ValueError("Ground truth not found.")
        logger.info("Loading Groundtruth - Success")
        # sanity checks
        if len(e.groundtruth) != len(e.tracker):
            logger.info(
                "The uploaded data does not provide results for every sequence.")
            return False
        logger.info("Loaded %d Sequences." % len(e.groundtruth))
        logger.info("Start Evaluation...")
        if e.compute3rdPartyMetrics():
            self.strsummary = e.saveToStats(self.save_summary)
        else:
            logger.info(
                "There seem to be no true positives or false positives at all in the submitted data."
            )

    def log(self):
        print(self.strsummary)

    def get_results(self):
        return self.strsummary
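# NOTE: illustrative sketch only. It restates the CLEAR MOT formulas that
# compute3rdPartyMetrics accumulates above, over plain counts; the argument
# names are made up and are not attributes of the evaluator.
def _demo_clearmot_from_counts(n_gt, fn, fp, id_switches, total_cost, tp):
    """MOTA penalizes misses, false positives and ID switches relative to the
    number of ground truth objects; MOTP averages the association cost
    (1 - IoU in this evaluator) over true positives."""
    mota = 1 - (fn + fp + id_switches) / float(n_gt) if n_gt else -float("inf")
    motp = total_cost / float(tp) if tp else float("inf")
    return mota, motp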
================================================
FILE: ppdet/metrics/munkres.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is borrowed from
https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/munkres.py
"""

import copy  # used by __copy_matrix
import sys

__all__ = ['Munkres', 'make_cost_matrix']


class Munkres:
    """
    Calculate the Munkres solution to the classical assignment problem.
    See the module documentation for usage.
    """

    def __init__(self):
        """Create a new instance"""
        self.C = None
        self.row_covered = []
        self.col_covered = []
        self.n = 0
        self.Z0_r = 0
        self.Z0_c = 0
        self.marked = None
        self.path = None

    def make_cost_matrix(profit_matrix, inversion_function):
        """
        **DEPRECATED**

        Please use the module function ``make_cost_matrix()``.
        """
        import munkres
        return munkres.make_cost_matrix(profit_matrix, inversion_function)

    make_cost_matrix = staticmethod(make_cost_matrix)

    def pad_matrix(self, matrix, pad_value=0):
        """
        Pad a possibly non-square matrix to make it square.

        :Parameters:
            matrix : list of lists
                matrix to pad
            pad_value : int
                value to use to pad the matrix

        :rtype: list of lists
        :return: a new, possibly padded, matrix
        """
        max_columns = 0
        total_rows = len(matrix)
        for row in matrix:
            max_columns = max(max_columns, len(row))
        total_rows = max(max_columns, total_rows)
        new_matrix = []
        for row in matrix:
            row_len = len(row)
            new_row = row[:]
            if total_rows > row_len:
                # Row too short. Pad it.
                new_row += [0] * (total_rows - row_len)
            new_matrix += [new_row]
        while len(new_matrix) < total_rows:
            new_matrix += [[0] * total_rows]
        return new_matrix

    def compute(self, cost_matrix):
        """
        Compute the indexes for the lowest-cost pairings between rows and
        columns in the database. Returns a list of (row, column) tuples
        that can be used to traverse the matrix.

        :Parameters:
            cost_matrix : list of lists
                The cost matrix. If this cost matrix is not square, it
                will be padded with zeros, via a call to ``pad_matrix()``.
                (This method does *not* modify the caller's matrix. It
                operates on a copy of the matrix.)

                **WARNING**: This code handles square and rectangular
                matrices. It does *not* handle irregular matrices.
:rtype: list :return: A list of ``(row, column)`` tuples that describe the lowest cost path through the matrix """ self.C = self.pad_matrix(cost_matrix) self.n = len(self.C) self.original_length = len(cost_matrix) self.original_width = len(cost_matrix[0]) self.row_covered = [False for i in range(self.n)] self.col_covered = [False for i in range(self.n)] self.Z0_r = 0 self.Z0_c = 0 self.path = self.__make_matrix(self.n * 2, 0) self.marked = self.__make_matrix(self.n, 0) done = False step = 1 steps = { 1: self.__step1, 2: self.__step2, 3: self.__step3, 4: self.__step4, 5: self.__step5, 6: self.__step6 } while not done: try: func = steps[step] step = func() except KeyError: done = True # Look for the starred columns results = [] for i in range(self.original_length): for j in range(self.original_width): if self.marked[i][j] == 1: results += [(i, j)] return results def __copy_matrix(self, matrix): """Return an exact copy of the supplied matrix""" return copy.deepcopy(matrix) def __make_matrix(self, n, val): """Create an *n*x*n* matrix, populating it with the specific value.""" matrix = [] for i in range(n): matrix += [[val for j in range(n)]] return matrix def __step1(self): """ For each row of the matrix, find the smallest element and subtract it from every element in its row. Go to Step 2. """ C = self.C n = self.n for i in range(n): minval = min(self.C[i]) # Find the minimum value for this row and subtract that minimum # from every element in the row. for j in range(n): self.C[i][j] -= minval return 2 def __step2(self): """ Find a zero (Z) in the resulting matrix. If there is no starred zero in its row or column, star Z. Repeat for each element in the matrix. Go to Step 3. """ n = self.n for i in range(n): for j in range(n): if (self.C[i][j] == 0) and \ (not self.col_covered[j]) and \ (not self.row_covered[i]): self.marked[i][j] = 1 self.col_covered[j] = True self.row_covered[i] = True self.__clear_covers() return 3 def __step3(self): """ Cover each column containing a starred zero. If K columns are covered, the starred zeros describe a complete set of unique assignments. In this case, Go to DONE, otherwise, Go to Step 4. """ n = self.n count = 0 for i in range(n): for j in range(n): if self.marked[i][j] == 1: self.col_covered[j] = True count += 1 if count >= n: step = 7 # done else: step = 4 return step def __step4(self): """ Find a noncovered zero and prime it. If there is no starred zero in the row containing this primed zero, Go to Step 5. Otherwise, cover this row and uncover the column containing the starred zero. Continue in this manner until there are no uncovered zeros left. Save the smallest uncovered value and Go to Step 6. """ step = 0 done = False row = -1 col = -1 star_col = -1 while not done: (row, col) = self.__find_a_zero() if row < 0: done = True step = 6 else: self.marked[row][col] = 2 star_col = self.__find_star_in_row(row) if star_col >= 0: col = star_col self.row_covered[row] = True self.col_covered[col] = False else: done = True self.Z0_r = row self.Z0_c = col step = 5 return step def __step5(self): """ Construct a series of alternating primed and starred zeros as follows. Let Z0 represent the uncovered primed zero found in Step 4. Let Z1 denote the starred zero in the column of Z0 (if any). Let Z2 denote the primed zero in the row of Z1 (there will always be one). Continue until the series terminates at a primed zero that has no starred zero in its column. 
        Unstar each starred zero of the series, star each primed zero
        of the series, erase all primes and uncover every line in the
        matrix. Return to Step 3.
        """
        count = 0
        path = self.path
        path[count][0] = self.Z0_r
        path[count][1] = self.Z0_c
        done = False
        while not done:
            row = self.__find_star_in_col(path[count][1])
            if row >= 0:
                count += 1
                path[count][0] = row
                path[count][1] = path[count - 1][1]
            else:
                done = True
            if not done:
                col = self.__find_prime_in_row(path[count][0])
                count += 1
                path[count][0] = path[count - 1][0]
                path[count][1] = col
        self.__convert_path(path, count)
        self.__clear_covers()
        self.__erase_primes()
        return 3

    def __step6(self):
        """
        Add the value found in Step 4 to every element of each covered
        row, and subtract it from every element of each uncovered column.
        Return to Step 4 without altering any stars, primes, or covered
        lines.
        """
        minval = self.__find_smallest()
        for i in range(self.n):
            for j in range(self.n):
                if self.row_covered[i]:
                    self.C[i][j] += minval
                if not self.col_covered[j]:
                    self.C[i][j] -= minval
        return 4

    def __find_smallest(self):
        """Find the smallest uncovered value in the matrix."""
        minval = 2e9  # sys.maxint
        for i in range(self.n):
            for j in range(self.n):
                if (not self.row_covered[i]) and (not self.col_covered[j]):
                    if minval > self.C[i][j]:
                        minval = self.C[i][j]
        return minval

    def __find_a_zero(self):
        """Find the first uncovered element with value 0"""
        row = -1
        col = -1
        i = 0
        n = self.n
        done = False
        while not done:
            j = 0
            while True:
                if (self.C[i][j] == 0) and \
                        (not self.row_covered[i]) and \
                        (not self.col_covered[j]):
                    row = i
                    col = j
                    done = True
                j += 1
                if j >= n:
                    break
            i += 1
            if i >= n:
                done = True
        return (row, col)

    def __find_star_in_row(self, row):
        """
        Find the first starred element in the specified row. Returns
        the column index, or -1 if no starred element was found.
        """
        col = -1
        for j in range(self.n):
            if self.marked[row][j] == 1:
                col = j
                break
        return col

    def __find_star_in_col(self, col):
        """
        Find the first starred element in the specified column. Returns
        the row index, or -1 if no starred element was found.
        """
        row = -1
        for i in range(self.n):
            if self.marked[i][col] == 1:
                row = i
                break
        return row

    def __find_prime_in_row(self, row):
        """
        Find the first primed element in the specified row. Returns
        the column index, or -1 if no primed element was found.
        """
        col = -1
        for j in range(self.n):
            if self.marked[row][j] == 2:
                col = j
                break
        return col

    def __convert_path(self, path, count):
        for i in range(count + 1):
            if self.marked[path[i][0]][path[i][1]] == 1:
                self.marked[path[i][0]][path[i][1]] = 0
            else:
                self.marked[path[i][0]][path[i][1]] = 1

    def __clear_covers(self):
        """Clear all covered matrix cells"""
        for i in range(self.n):
            self.row_covered[i] = False
            self.col_covered[i] = False

    def __erase_primes(self):
        """Erase all prime markings"""
        for i in range(self.n):
            for j in range(self.n):
                if self.marked[i][j] == 2:
                    self.marked[i][j] = 0
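# NOTE: illustrative usage sketch only; the 3x3 cost matrix is made up.
def _demo_munkres_usage():
    matrix = [[5, 9, 1],
              [10, 3, 2],
              [8, 7, 4]]
    m = Munkres()
    indexes = m.compute(matrix)  # list of (row, column) assignment pairs
    total_cost = sum(matrix[r][c] for r, c in indexes)  # minimal total cost
    return indexes, total_cost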
def make_cost_matrix(profit_matrix, inversion_function):
    """
    Create a cost matrix from a profit matrix by calling
    'inversion_function' to invert each value. The inversion
    function must take one numeric argument (of any type) and return
    another numeric argument which is presumed to be the cost inverse
    of the original profit.

    This is a static method. Call it like this:

    .. python::

        cost_matrix = Munkres.make_cost_matrix(matrix, inversion_func)

    For example:

    .. python::

        cost_matrix = Munkres.make_cost_matrix(matrix, lambda x: sys.maxsize - x)

    :Parameters:
        profit_matrix : list of lists
            The matrix to convert from a profit to a cost matrix
        inversion_function : function
            The function to use to invert each entry in the profit matrix

    :rtype: list of lists
    :return: The converted matrix
    """
    cost_matrix = []
    for row in profit_matrix:
        cost_matrix.append([inversion_function(value) for value in row])
    return cost_matrix


================================================
FILE: ppdet/metrics/pose3d_metrics.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
from paddle.distributed import ParallelEnv
import os
import json
from collections import defaultdict, OrderedDict
import numpy as np
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)

__all__ = ['Pose3DEval']


class AverageMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def mean_per_joint_position_error(pred, gt, has_3d_joints):
    """
    Compute mPJPE
    """
    gt = gt[has_3d_joints == 1]
    gt = gt[:, :, :3]
    pred = pred[has_3d_joints == 1]
    with paddle.no_grad():
        gt_pelvis = (gt[:, 2, :] + gt[:, 3, :]) / 2
        gt = gt - gt_pelvis[:, None, :]
        pred_pelvis = (pred[:, 2, :] + pred[:, 3, :]) / 2
        pred = pred - pred_pelvis[:, None, :]
        error = paddle.sqrt(((pred - gt)**2).sum(axis=-1)).mean(axis=-1).numpy()
        return error
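# NOTE: illustrative numpy restatement of the pelvis-centering step above,
# for intuition only; the real metric runs on paddle tensors and masks
# invalid samples via has_3d_joints.
def _demo_pelvis_centered_error(pred, gt):
    """pred, gt: (num_joints, 3) arrays; joints 2 and 3 are taken as hips."""
    gt = gt - (gt[2] + gt[3]) / 2.0
    pred = pred - (pred[2] + pred[3]) / 2.0
    return np.sqrt(((pred - gt)**2).sum(axis=-1)).mean()  # mPJPE in input units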
def compute_similarity_transform(S1, S2):
    """Computes a similarity transform (sR, t) that takes
    a set of 3D points S1 (3 x N) closest to a set of 3D points S2,
    where R is a 3x3 rotation matrix, t a 3x1 translation, s a scale,
    i.e. it solves the orthogonal Procrustes problem.
    """
    transposed = False
    if S1.shape[0] != 3 and S1.shape[0] != 2:
        S1 = S1.T
        S2 = S2.T
        transposed = True
    assert (S2.shape[1] == S1.shape[1])

    # 1. Remove mean.
    mu1 = S1.mean(axis=1, keepdims=True)
    mu2 = S2.mean(axis=1, keepdims=True)
    X1 = S1 - mu1
    X2 = S2 - mu2

    # 2. Compute variance of X1 used for scale.
    var1 = np.sum(X1**2)

    # 3. The outer product of X1 and X2.
    K = X1.dot(X2.T)

    # 4. Solution that maximizes trace(R'K) is R=U*V', where U, V are
    # singular vectors of K.
    U, s, Vh = np.linalg.svd(K)
    V = Vh.T
    # Construct Z that fixes the orientation of R to get det(R)=1.
    Z = np.eye(U.shape[0])
    Z[-1, -1] *= np.sign(np.linalg.det(U.dot(V.T)))
    # Construct R.
    R = V.dot(Z.dot(U.T))

    # 5. Recover scale.
    scale = np.trace(R.dot(K)) / var1

    # 6. Recover translation.
    t = mu2 - scale * (R.dot(mu1))

    # 7. Error:
    S1_hat = scale * R.dot(S1) + t

    if transposed:
        S1_hat = S1_hat.T

    return S1_hat


def compute_similarity_transform_batch(S1, S2):
    """Batched version of compute_similarity_transform."""
    S1_hat = np.zeros_like(S1)
    for i in range(S1.shape[0]):
        S1_hat[i] = compute_similarity_transform(S1[i], S2[i])
    return S1_hat


def reconstruction_error(S1, S2, reduction='mean'):
    """Do Procrustes alignment and compute reconstruction error."""
    S1_hat = compute_similarity_transform_batch(S1, S2)
    re = np.sqrt(((S1_hat - S2)**2).sum(axis=-1)).mean(axis=-1)
    if reduction == 'mean':
        re = re.mean()
    elif reduction == 'sum':
        re = re.sum()
    return re


def all_gather(data):
    if paddle.distributed.get_world_size() == 1:
        return data
    vlist = []
    paddle.distributed.all_gather(vlist, data)
    data = paddle.concat(vlist, 0)
    return data


class Pose3DEval(object):
    def __init__(self, output_eval, save_prediction_only=False):
        super(Pose3DEval, self).__init__()
        self.output_eval = output_eval
        self.res_file = os.path.join(output_eval, "pose3d_results.json")
        self.save_prediction_only = save_prediction_only
        self.reset()

    def reset(self):
        self.PAmPJPE = AverageMeter()
        self.mPJPE = AverageMeter()
        self.eval_results = {}

    def get_human36m_joints(self, input):
        J24_TO_J14 = paddle.to_tensor(
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18])
        J24_TO_J17 = paddle.to_tensor(
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19])
        return paddle.index_select(input, J24_TO_J14, axis=1)

    def update(self, inputs, outputs):
        gt_3d_joints = all_gather(inputs['joints_3d'].cuda(ParallelEnv()
                                                           .local_rank))
        has_3d_joints = all_gather(inputs['has_3d_joints'].cuda(ParallelEnv()
                                                                .local_rank))
        pred_3d_joints = all_gather(outputs['pose3d'])
        if gt_3d_joints.shape[1] == 24:
            gt_3d_joints = self.get_human36m_joints(gt_3d_joints)
        if pred_3d_joints.shape[1] == 24:
            pred_3d_joints = self.get_human36m_joints(pred_3d_joints)
        mPJPE_val = mean_per_joint_position_error(pred_3d_joints, gt_3d_joints,
                                                  has_3d_joints).mean()
        PAmPJPE_val = reconstruction_error(
            pred_3d_joints.numpy(),
            gt_3d_joints[:, :, :3].numpy(),
            reduction=None).mean()
        count = int(np.sum(has_3d_joints.numpy()))
        self.PAmPJPE.update(PAmPJPE_val * 1000., count)
        self.mPJPE.update(mPJPE_val * 1000., count)

    def accumulate(self):
        if self.save_prediction_only:
            logger.info(f'The pose3d result is saved to {self.res_file}; '
                        'the model is not evaluated.')
            return
        self.eval_results['pose3d'] = [-self.mPJPE.avg, -self.PAmPJPE.avg]

    def log(self):
        if self.save_prediction_only:
            return
        stats_names = ['mPJPE', 'PAmPJPE']
        num_values = len(stats_names)
        print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |')
        print('|---' * (num_values + 1) + '|')
        print(' '.join([
            '| {:.3f}'.format(abs(value))
            for value in self.eval_results['pose3d']
        ]) + ' |')

    def get_results(self):
        return self.eval_results


================================================
FILE: ppdet/metrics/widerface_utils.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import cv2
import numpy as np
from collections import OrderedDict

import paddle

from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)

__all__ = ['face_eval_run', 'lmk2out']


def face_eval_run(model,
                  image_dir,
                  gt_file,
                  pred_dir='output/pred',
                  eval_mode='widerface',
                  multi_scale=False):
    # load ground truth files
    with open(gt_file, 'r') as f:
        gt_lines = f.readlines()
    imid2path = []
    pos_gt = 0
    while pos_gt < len(gt_lines):
        name_gt = gt_lines[pos_gt].strip('\n\t').split()[0]
        imid2path.append(name_gt)
        pos_gt += 1
        n_gt = int(gt_lines[pos_gt].strip('\n\t').split()[0])
        pos_gt += 1 + n_gt
    logger.info('The ground truth file contains {} images'.format(
        len(imid2path)))

    dets_dist = OrderedDict()
    for iter_id, im_path in enumerate(imid2path):
        image_path = os.path.join(image_dir, im_path)
        if eval_mode == 'fddb':
            image_path += '.jpg'
        assert os.path.exists(image_path)
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if multi_scale:
            shrink, max_shrink = get_shrink(image.shape[0], image.shape[1])
            det0 = detect_face(model, image, shrink)
            det1 = flip_test(model, image, shrink)
            [det2, det3] = multi_scale_test(model, image, max_shrink)
            det4 = multi_scale_test_pyramid(model, image, max_shrink)
            det = np.row_stack((det0, det1, det2, det3, det4))
            dets = bbox_vote(det)
        else:
            dets = detect_face(model, image, 1)
        if eval_mode == 'widerface':
            save_widerface_bboxes(image_path, dets, pred_dir)
        else:
            dets_dist[im_path] = dets
        if iter_id % 100 == 0:
            logger.info('Test iter {}'.format(iter_id))
    if eval_mode == 'fddb':
        save_fddb_bboxes(dets_dist, pred_dir)
    logger.info("Finish evaluation.")
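# NOTE: illustrative sketch only; it condenses the multi_scale branch of
# face_eval_run above into a standalone helper built from the functions
# defined below. `model` and `image` (an RGB HWC array) are assumed inputs.
def _demo_multi_scale_tta(model, image):
    shrink, max_shrink = get_shrink(image.shape[0], image.shape[1])
    det0 = detect_face(model, image, shrink)    # base scale
    det1 = flip_test(model, image, shrink)      # horizontally mirrored pass
    det2, det3 = multi_scale_test(model, image, max_shrink)
    det4 = multi_scale_test_pyramid(model, image, max_shrink)
    # merge all scales by score-weighted box voting
    return bbox_vote(np.row_stack((det0, det1, det2, det3, det4)))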
def detect_face(model, image, shrink):
    image_shape = [image.shape[0], image.shape[1]]
    if shrink != 1:
        h, w = int(image_shape[0] * shrink), int(image_shape[1] * shrink)
        image = cv2.resize(image, (w, h))
        image_shape = [h, w]

    img = face_img_process(image)
    image_shape = np.asarray([image_shape])
    scale_factor = np.asarray([[shrink, shrink]])
    data = {
        "image": paddle.to_tensor(
            img, dtype='float32'),
        "im_shape": paddle.to_tensor(
            image_shape, dtype='float32'),
        "scale_factor": paddle.to_tensor(
            scale_factor, dtype='float32')
    }
    model.eval()
    detection = model(data)
    detection = detection['bbox'].numpy()
    # layout: xmin, ymin, xmax, ymax, score
    if np.prod(detection.shape) == 1:
        logger.info("No face detected")
        return np.array([[0, 0, 0, 0, 0]])
    det_conf = detection[:, 1]
    det_xmin = detection[:, 2]
    det_ymin = detection[:, 3]
    det_xmax = detection[:, 4]
    det_ymax = detection[:, 5]
    det = np.column_stack((det_xmin, det_ymin, det_xmax, det_ymax, det_conf))
    return det


def flip_test(model, image, shrink):
    img = cv2.flip(image, 1)
    det_f = detect_face(model, img, shrink)
    det_t = np.zeros(det_f.shape)
    img_width = image.shape[1]
    det_t[:, 0] = img_width - det_f[:, 2]
    det_t[:, 1] = det_f[:, 1]
    det_t[:, 2] = img_width - det_f[:, 0]
    det_t[:, 3] = det_f[:, 3]
    det_t[:, 4] = det_f[:, 4]
    return det_t


def multi_scale_test(model, image, max_shrink):
    # Shrunken detecting is only used to detect big faces
    st = 0.5 if max_shrink >= 0.75 else 0.5 * max_shrink
    det_s = detect_face(model, image, st)
    index = np.where(
        np.maximum(det_s[:, 2] - det_s[:, 0] + 1,
                   det_s[:, 3] - det_s[:, 1] + 1) > 30)[0]
    det_s = det_s[index, :]
    # Enlarge once
    bt = min(2, max_shrink) if max_shrink > 1 else (st + max_shrink) / 2
    det_b = detect_face(model, image, bt)

    # Enlarge the small image several times for small faces
    if max_shrink > 2:
        bt *= 2
        while bt < max_shrink:
            det_b = np.row_stack((det_b, detect_face(model, image, bt)))
            bt *= 2
        det_b = np.row_stack((det_b, detect_face(model, image, max_shrink)))

    # Enlarged images are only used to detect small faces.
    if bt > 1:
        index = np.where(
            np.minimum(det_b[:, 2] - det_b[:, 0] + 1,
                       det_b[:, 3] - det_b[:, 1] + 1) < 100)[0]
        det_b = det_b[index, :]
    # Shrunken images are only used to detect big faces.
    else:
        index = np.where(
            np.maximum(det_b[:, 2] - det_b[:, 0] + 1,
                       det_b[:, 3] - det_b[:, 1] + 1) > 30)[0]
        det_b = det_b[index, :]
    return det_s, det_b


def multi_scale_test_pyramid(model, image, max_shrink):
    # Use image pyramids to detect faces
    det_b = detect_face(model, image, 0.25)
    index = np.where(
        np.maximum(det_b[:, 2] - det_b[:, 0] + 1,
                   det_b[:, 3] - det_b[:, 1] + 1) > 30)[0]
    det_b = det_b[index, :]

    st = [0.75, 1.25, 1.5, 1.75]
    for i in range(len(st)):
        if st[i] <= max_shrink:
            det_temp = detect_face(model, image, st[i])
            # Enlarged images are only used to detect small faces.
            if st[i] > 1:
                index = np.where(
                    np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1,
                               det_temp[:, 3] - det_temp[:, 1] + 1) < 100)[0]
                det_temp = det_temp[index, :]
            # Shrunken images are only used to detect big faces.
            else:
                index = np.where(
                    np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1,
                               det_temp[:, 3] - det_temp[:, 1] + 1) > 30)[0]
                det_temp = det_temp[index, :]
            det_b = np.row_stack((det_b, det_temp))
    return det_b


def to_chw(image):
    """
    Transpose image from HWC to CHW.
    Args:
        image (np.array): an image with HWC layout.
    """
    # HWC to CHW
    if len(image.shape) == 3:
        image = np.swapaxes(image, 1, 2)
        image = np.swapaxes(image, 1, 0)
    return image


def face_img_process(image,
                     mean=[104., 117., 123.],
                     std=[127.502231, 127.502231, 127.502231]):
    img = np.array(image)
    img = to_chw(img)
    img = img.astype('float32')
    img -= np.array(mean)[:, np.newaxis, np.newaxis].astype('float32')
    img /= np.array(std)[:, np.newaxis, np.newaxis].astype('float32')
    img = [img]
    img = np.array(img)
    return img
def get_shrink(height, width):
    """
    Args:
        height (int): image height.
        width (int): image width.
    """
    # avoid out of memory
    max_shrink_v1 = (0x7fffffff / 577.0 / (height * width))**0.5
    max_shrink_v2 = ((678 * 1024 * 2.0 * 2.0) / (height * width))**0.5

    def get_round(x, loc):
        str_x = str(x)
        if '.' in str_x:
            str_before, str_after = str_x.split('.')
            len_after = len(str_after)
            if len_after >= 3:
                str_final = str_before + '.' + str_after[0:loc]
                return float(str_final)
            else:
                return x

    max_shrink = get_round(min(max_shrink_v1, max_shrink_v2), 2) - 0.3
    if max_shrink >= 1.5 and max_shrink < 2:
        max_shrink = max_shrink - 0.1
    elif max_shrink >= 2 and max_shrink < 3:
        max_shrink = max_shrink - 0.2
    elif max_shrink >= 3 and max_shrink < 4:
        max_shrink = max_shrink - 0.3
    elif max_shrink >= 4 and max_shrink < 5:
        max_shrink = max_shrink - 0.4
    elif max_shrink >= 5:
        max_shrink = max_shrink - 0.5
    elif max_shrink <= 0.1:
        max_shrink = 0.1

    shrink = max_shrink if max_shrink < 1 else 1
    return shrink, max_shrink


def bbox_vote(det):
    order = det[:, 4].ravel().argsort()[::-1]
    det = det[order, :]
    if det.shape[0] == 0:
        dets = np.array([[10, 10, 20, 20, 0.002]])
        det = np.empty(shape=[0, 5])
    while det.shape[0] > 0:
        # IOU
        area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1)
        xx1 = np.maximum(det[0, 0], det[:, 0])
        yy1 = np.maximum(det[0, 1], det[:, 1])
        xx2 = np.minimum(det[0, 2], det[:, 2])
        yy2 = np.minimum(det[0, 3], det[:, 3])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        o = inter / (area[0] + area[:] - inter)

        # nms
        merge_index = np.where(o >= 0.3)[0]
        det_accu = det[merge_index, :]
        det = np.delete(det, merge_index, 0)
        if merge_index.shape[0] <= 1:
            if det.shape[0] == 0:
                try:
                    dets = np.row_stack((dets, det_accu))
                except:
                    dets = det_accu
            continue
        det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4))
        max_score = np.max(det_accu[:, 4])
        det_accu_sum = np.zeros((1, 5))
        det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4],
                                      axis=0) / np.sum(det_accu[:, -1:])
        det_accu_sum[:, 4] = max_score
        try:
            dets = np.row_stack((dets, det_accu_sum))
        except:
            dets = det_accu_sum
    dets = dets[0:750, :]
    keep_index = np.where(dets[:, 4] >= 0.01)[0]
    dets = dets[keep_index, :]
    return dets


def save_widerface_bboxes(image_path, bboxes_scores, output_dir):
    image_name = image_path.split('/')[-1]
    image_class = image_path.split('/')[-2]
    odir = os.path.join(output_dir, image_class)
    if not os.path.exists(odir):
        os.makedirs(odir)

    ofname = os.path.join(odir, '%s.txt' % (image_name[:-4]))
    f = open(ofname, 'w')
    f.write('{:s}\n'.format(image_class + '/' + image_name))
    f.write('{:d}\n'.format(bboxes_scores.shape[0]))
    for box_score in bboxes_scores:
        xmin, ymin, xmax, ymax, score = box_score
        f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'.format(xmin, ymin, (
            xmax - xmin + 1), (ymax - ymin + 1), score))
    f.close()
    logger.info("The predicted result is saved as {}".format(ofname))


def save_fddb_bboxes(bboxes_scores,
                     output_dir,
                     output_fname='pred_fddb_res.txt'):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    predict_file = os.path.join(output_dir, output_fname)
    f = open(predict_file, 'w')
    for image_path, dets in bboxes_scores.items():
        f.write('{:s}\n'.format(image_path))
        f.write('{:d}\n'.format(dets.shape[0]))
        for box_score in dets:
            xmin, ymin, xmax, ymax, score = box_score
            width, height = xmax - xmin, ymax - ymin
            f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'
                    .format(xmin, ymin, width, height, score))
    f.close()
    logger.info("The predicted result is saved as {}".format(predict_file))
    return predict_file
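# NOTE: illustrative sketch only; bbox_vote above merges overlapping
# detections by score-weighted averaging. The boxes here are made up.
def _demo_bbox_vote():
    det = np.array([
        [10., 10., 50., 50., 0.9],      # two detections of the same face
        [12., 11., 52., 49., 0.6],
        [200., 200., 240., 260., 0.8],  # a separate face
    ])
    return bbox_vote(det)  # clusters are averaged; the top score is kept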
""" xywh_res = [] for t in results: bboxes = t['bbox'][0] lengths = t['bbox'][1][0] im_ids = np.array(t['im_id'][0]).flatten() if bboxes.shape == (1, 1) or bboxes is None: continue face_index = t['face_index'][0] prior_box = t['prior_boxes'][0] predict_lmk = t['landmark'][0] prior = np.reshape(prior_box, (-1, 4)) predictlmk = np.reshape(predict_lmk, (-1, 10)) k = 0 for a in range(len(lengths)): num = lengths[a] im_id = int(im_ids[a]) for i in range(num): score = bboxes[k][1] theindex = face_index[i][0] me_prior = prior[theindex, :] lmk_pred = predictlmk[theindex, :] prior_w = me_prior[2] - me_prior[0] prior_h = me_prior[3] - me_prior[1] prior_w_center = (me_prior[2] + me_prior[0]) / 2 prior_h_center = (me_prior[3] + me_prior[1]) / 2 lmk_decode = np.zeros((10)) for j in [0, 2, 4, 6, 8]: lmk_decode[j] = lmk_pred[j] * 0.1 * prior_w + prior_w_center for j in [1, 3, 5, 7, 9]: lmk_decode[j] = lmk_pred[j] * 0.1 * prior_h + prior_h_center im_shape = t['im_shape'][0][a].tolist() image_h, image_w = int(im_shape[0]), int(im_shape[1]) if is_bbox_normalized: lmk_decode = lmk_decode * np.array([ image_w, image_h, image_w, image_h, image_w, image_h, image_w, image_h, image_w, image_h ]) lmk_res = { 'image_id': im_id, 'landmark': lmk_decode, 'score': score, } xywh_res.append(lmk_res) k += 1 return xywh_res def image_eval(pred, gt, ignore, iou_thresh): """ single image evaluation pred: Nx5 xyxys gt: Nx4 xywh ignore: """ _pred = pred.copy() _gt = gt.copy() pred_recall = np.zeros(_pred.shape[0]) recall_list = np.zeros(_gt.shape[0]) proposal_list = np.ones(_pred.shape[0]) _gt[:, 2] = _gt[:, 2] + _gt[:, 0] _gt[:, 3] = _gt[:, 3] + _gt[:, 1] overlaps = bbox_overlaps(_pred[:, :4], _gt) for h in range(_pred.shape[0]): gt_overlap = overlaps[h] max_overlap, max_idx = gt_overlap.max(), gt_overlap.argmax() if max_overlap >= iou_thresh: if ignore[max_idx] == 0: recall_list[max_idx] = -1 proposal_list[h] = -1 elif recall_list[max_idx] == 0: recall_list[max_idx] = 1 r_keep_index = np.where(recall_list == 1)[0] pred_recall[h] = len(r_keep_index) return pred_recall, proposal_list def bbox_overlaps(boxes1, boxes2): """ Parameters ---------- boxes1: (N, 4) ndarray of float boxes2: (K, 4) ndarray of float Returns ------- overlaps: (N, K) ndarray of overlap between boxes1 and boxes2 """ # Calculate the area of each box box_areas1 = (boxes1[:, 2] - boxes1[:, 0] + 1) * ( boxes1[:, 3] - boxes1[:, 1] + 1) box_areas2 = (boxes2[:, 2] - boxes2[:, 0] + 1) * ( boxes2[:, 3] - boxes2[:, 1] + 1) # Calculate the intersection areas iw = np.minimum(boxes1[:, None, 2], boxes2[None, :, 2]) - np.maximum( boxes1[:, None, 0], boxes2[None, :, 0]) + 1 ih = np.minimum(boxes1[:, None, 3], boxes2[None, :, 3]) - np.maximum( boxes1[:, None, 1], boxes2[None, :, 1]) + 1 # Ensure that the intersection width and height are non-negative iw = np.maximum(iw, 0) ih = np.maximum(ih, 0) # Calculate the intersection area intersection = iw * ih # Calculate the union area union = box_areas1[:, None] + box_areas2[None, :] - intersection union = box_areas1[:, None] + box_areas2[None, :] - intersection union = np.maximum(union, 1e-8) # Calculate the overlaps (intersection over union) overlaps = intersection / union return overlaps def img_pr_info(thresh_num, pred_info, proposal_list, pred_recall): pr_info = np.zeros((thresh_num, 2)).astype('float') for t in range(thresh_num): thresh = 1 - (t+1)/thresh_num r_index = np.where(pred_info[:, 4] >= thresh)[0] if len(r_index) == 0: pr_info[t, 0] = 0 pr_info[t, 1] = 0 else: r_index = r_index[-1] p_index = 
def img_pr_info(thresh_num, pred_info, proposal_list, pred_recall):
    pr_info = np.zeros((thresh_num, 2)).astype('float')
    for t in range(thresh_num):
        thresh = 1 - (t + 1) / thresh_num
        r_index = np.where(pred_info[:, 4] >= thresh)[0]
        if len(r_index) == 0:
            pr_info[t, 0] = 0
            pr_info[t, 1] = 0
        else:
            r_index = r_index[-1]
            p_index = np.where(proposal_list[:r_index + 1] == 1)[0]
            pr_info[t, 0] = len(p_index)
            pr_info[t, 1] = pred_recall[r_index]
    return pr_info


def dataset_pr_info(thresh_num, pr_curve, count_face):
    _pr_curve = np.zeros((thresh_num, 2))
    for i in range(thresh_num):
        _pr_curve[i, 0] = pr_curve[i, 1] / pr_curve[i, 0]
        _pr_curve[i, 1] = pr_curve[i, 1] / count_face
    return _pr_curve


def voc_ap(rec, prec):
    # correct AP calculation
    # first append sentinel values at the end
    mrec = np.concatenate(([0.], rec, [1.]))
    mpre = np.concatenate(([0.], prec, [0.]))

    # compute the precision envelope
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

    # to calculate area under PR curve, look for points
    # where X axis (recall) changes value
    i = np.where(mrec[1:] != mrec[:-1])[0]

    # and sum (\Delta recall) * prec
    ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
    return ap
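# NOTE: illustrative sketch only; voc_ap above integrates the precision
# envelope over recall. The PR points here are made up.
def _demo_voc_ap():
    rec = np.array([0.2, 0.4, 0.6, 0.8])
    prec = np.array([1.0, 0.9, 0.7, 0.5])
    return voc_ap(rec, prec)  # area under the interpolated PR curve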
================================================
FILE: ppdet/model_zoo/.gitignore
================================================
MODEL_ZOO

================================================
FILE: ppdet/model_zoo/__init__.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import model_zoo
from .model_zoo import *

__all__ = model_zoo.__all__

================================================
FILE: ppdet/model_zoo/model_zoo.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os.path as osp
import pkg_resources

try:
    from collections.abc import Sequence
except:
    from collections import Sequence

from ppdet.core.workspace import load_config, create
from ppdet.utils.checkpoint import load_weight
from ppdet.utils.download import get_config_path
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)

__all__ = [
    'list_model', 'get_config_file', 'get_weights_url', 'get_model',
    'MODEL_ZOO_FILENAME'
]

MODEL_ZOO_FILENAME = 'MODEL_ZOO'


def list_model(filters=[]):
    model_zoo_file = pkg_resources.resource_filename('ppdet.model_zoo',
                                                     MODEL_ZOO_FILENAME)
    with open(model_zoo_file) as f:
        model_names = f.read().splitlines()

    # filter model_name
    def filt(name):
        for f in filters:
            if name.find(f) < 0:
                return False
        return True

    if isinstance(filters, str) or not isinstance(filters, Sequence):
        filters = [filters]
    model_names = [name for name in model_names if filt(name)]
    if len(model_names) == 0 and len(filters) > 0:
        raise ValueError("no model found, please check the filters setting, "
                         "filters can be set as following kinds:\n"
                         "\tDataset:      coco, voc ...\n"
                         "\tArchitecture: yolo, rcnn, ssd ...\n"
                         "\tBackbone:     resnet, vgg, darknet ...\n")

    model_str = "Available Models:\n"
    for model_name in model_names:
        model_str += "\t{}\n".format(model_name)
    logger.info(model_str)


# models and configs are saved on bcebos under the dygraph directory
def get_config_file(model_name):
    return get_config_path("ppdet://configs/{}.yml".format(model_name))


def get_weights_url(model_name):
    return "ppdet://models/{}.pdparams".format(osp.split(model_name)[-1])


def get_model(model_name, pretrained=True):
    cfg_file = get_config_file(model_name)
    cfg = load_config(cfg_file)
    model = create(cfg.architecture)

    if pretrained:
        load_weight(model, get_weights_url(model_name))
    return model

================================================
FILE: ppdet/model_zoo/tests/__init__.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

================================================
FILE: ppdet/model_zoo/tests/test_get_model.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import paddle import ppdet import unittest # NOTE: weights downloading costs time, we choose # a small model for unittesting MODEL_NAME = 'ppyolo/ppyolo_tiny_650e_coco' class TestGetConfigFile(unittest.TestCase): def test_main(self): try: cfg_file = ppdet.model_zoo.get_config_file(MODEL_NAME) assert os.path.isfile(cfg_file) except: self.assertTrue(False) class TestGetModel(unittest.TestCase): def test_main(self): try: model = ppdet.model_zoo.get_model(MODEL_NAME) assert isinstance(model, paddle.nn.Layer) except: self.assertTrue(False) if __name__ == '__main__': unittest.main() ================================================ FILE: ppdet/model_zoo/tests/test_list_model.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import unittest import ppdet class TestListModel(unittest.TestCase): def setUp(self): self._filter = [] def test_main(self): try: ppdet.model_zoo.list_model(self._filter) self.assertTrue(True) except: self.assertTrue(False) class TestListModelYOLO(TestListModel): def setUp(self): self._filter = ['yolo'] class TestListModelRCNN(TestListModel): def setUp(self): self._filter = ['rcnn'] class TestListModelSSD(TestListModel): def setUp(self): self._filter = ['ssd'] class TestListModelMultiFilter(TestListModel): def setUp(self): self._filter = ['yolo', 'darknet'] class TestListModelError(unittest.TestCase): def setUp(self): self._filter = ['xxx'] def test_main(self): try: ppdet.model_zoo.list_model(self._filter) self.assertTrue(False) except ValueError: self.assertTrue(True) if __name__ == '__main__': unittest.main() ================================================ FILE: ppdet/modeling/__init__.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import warnings warnings.filterwarnings( action='ignore', category=DeprecationWarning, module='ops') from . import ops from . import backbones from . import necks from . import proposal_generator from . import heads from . import losses from . import architectures from . import post_process from . import layers from . import reid from . import mot from . import transformers from . import assigners from . 
import rbox_utils from . import ssod from .ops import * from .backbones import * from .necks import * from .proposal_generator import * from .heads import * from .losses import * from .architectures import * from .post_process import * from .layers import * from .reid import * from .mot import * from .transformers import * from .assigners import * from .rbox_utils import * from .ssod import * ================================================ FILE: ppdet/modeling/architectures/__init__.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import meta_arch from . import faster_rcnn from . import mask_rcnn from . import yolo from . import ppyoloe from . import cascade_rcnn from . import ssd from . import fcos from . import solov2 from . import ttfnet from . import s2anet from . import keypoint_hrhrnet from . import keypoint_hrnet from . import keypoint_vitpose from . import jde from . import deepsort from . import fairmot from . import centernet from . import gfl from . import picodet from . import detr from . import sparse_rcnn from . import tood from . import retinanet from . import bytetrack from . import yolox from . import yolof from . import pose3d_metro from . import centertrack from . import queryinst from . import detr_ssod from . import multi_stream_detector from . import clrnet from .meta_arch import * from .faster_rcnn import * from .mask_rcnn import * from .yolo import * from .ppyoloe import * from .cascade_rcnn import * from .ssd import * from .fcos import * from .solov2 import * from .ttfnet import * from .s2anet import * from .keypoint_hrhrnet import * from .keypoint_hrnet import * from .keypoint_vitpose import * from .jde import * from .deepsort import * from .fairmot import * from .centernet import * from .blazeface import * from .gfl import * from .picodet import * from .detr import * from .sparse_rcnn import * from .tood import * from .retinanet import * from .bytetrack import * from .yolox import * from .yolof import * from .pose3d_metro import * from .centertrack import * from .queryinst import * from .keypoint_petr import * from .detr_ssod import * from .multi_stream_detector import * from .clrnet import * from . import rtdetrv3 from .rtdetrv3 import * ================================================ FILE: ppdet/modeling/architectures/blazeface.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch import paddle import paddle.nn.functional as F __all__ = ['BlazeFace'] @register class BlazeFace(BaseArch): """ BlazeFace: Sub-millisecond Neural Face Detection on Mobile GPUs, see https://arxiv.org/abs/1907.05047 Args: backbone (nn.Layer): backbone instance neck (nn.Layer): neck instance blaze_head (nn.Layer): `blazeHead` instance post_process (object): `BBoxPostProcess` instance """ __category__ = 'architecture' __inject__ = ['post_process'] def __init__(self, backbone, blaze_head, neck, post_process): super(BlazeFace, self).__init__() self.backbone = backbone self.neck = neck self.blaze_head = blaze_head self.post_process = post_process @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) # fpn kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) # head kwargs = {'input_shape': neck.out_shape} blaze_head = create(cfg['blaze_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, 'blaze_head': blaze_head, } def _forward(self): # Backbone body_feats = self.backbone(self.inputs) # neck neck_feats = self.neck(body_feats) # blaze Head if self.training: return self.blaze_head(neck_feats, self.inputs['image'], self.inputs['gt_bbox'], self.inputs['gt_class']) else: preds, anchors = self.blaze_head(neck_feats, self.inputs['image']) bbox, bbox_num, nms_keep_idx = self.post_process( preds, anchors, self.inputs['im_shape'], self.inputs['scale_factor']) if self.use_extra_data: extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx """extra_data:{ 'scores': predict scores, 'nms_keep_idx': bbox index before nms, } """ preds_logits = preds[1] # [[1xNumBBoxNumClass]] extra_data['scores'] = F.softmax(paddle.concat( preds_logits, axis=1)).transpose([0, 2, 1]) extra_data['logits'] = paddle.concat( preds_logits, axis=1).transpose([0, 2, 1]) extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms return bbox, bbox_num, extra_data else: return bbox, bbox_num def get_loss(self, ): return {"loss": self._forward()} def get_pred(self): if self.use_extra_data: bbox_pred, bbox_num, extra_data = self._forward() output = { "bbox": bbox_pred, "bbox_num": bbox_num, "extra_data": extra_data } else: bbox_pred, bbox_num = self._forward() output = { "bbox": bbox_pred, "bbox_num": bbox_num, } return output ================================================ FILE: ppdet/modeling/architectures/bytetrack.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
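BlazeFace.from_config above shows the wiring convention used throughout this package: each component publishes an out_shape, and the next component is constructed with input_shape taken from its predecessor. A standalone sketch of that contract with dummy stand-ins (none of these classes are ppdet components):

# Dummy classes for illustration only; real components come from create().
class DummyBackbone:
    out_shape = [16, 32, 64]          # per-level channel counts


class DummyNeck:
    def __init__(self, input_shape):
        self.out_shape = [c * 2 for c in input_shape]


class DummyHead:
    def __init__(self, input_shape):
        self.input_shape = input_shape


backbone = DummyBackbone()
neck = DummyNeck(input_shape=backbone.out_shape)   # neck consumes backbone shapes
head = DummyHead(input_shape=neck.out_shape)       # head consumes neck shapes
assert head.input_shape == [32, 64, 128]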
from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['ByteTrack'] @register class ByteTrack(BaseArch): """ ByteTrack network, see https://arxiv.org/abs/2110.06864 Args: detector (object): detector model instance reid (object): reid model instance, default None tracker (object): tracker instance """ __category__ = 'architecture' def __init__(self, detector='YOLOX', reid=None, tracker='JDETracker'): super(ByteTrack, self).__init__() self.detector = detector self.reid = reid self.tracker = tracker @classmethod def from_config(cls, cfg, *args, **kwargs): detector = create(cfg['detector']) if cfg['reid'] != 'None': reid = create(cfg['reid']) else: reid = None tracker = create(cfg['tracker']) return { "detector": detector, "reid": reid, "tracker": tracker, } def _forward(self): det_outs = self.detector(self.inputs) if self.training: return det_outs else: if self.reid is not None: assert 'crops' in self.inputs crops = self.inputs['crops'] pred_embs = self.reid(crops) else: pred_embs = None det_outs['embeddings'] = pred_embs return det_outs def get_loss(self): return self._forward() def get_pred(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/cascade_rcnn.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
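ByteTrack above treats ReID as optional: from_config builds it only when cfg['reid'] is not the literal string 'None', and _forward simply attaches embeddings (or None) to the detector's output dict. A toy sketch of that contract with made-up values:

# Detector output dict as ByteTrack._forward sees it at inference time.
det_outs = {'bbox': [[0, 0.92, 10., 12., 56., 80.]], 'bbox_num': [1]}

reid = None  # the cfg['reid'] == 'None' case: no ReID model configured
if reid is not None:
    pred_embs = reid(det_outs)      # would embed the detection crops
else:
    pred_embs = None
det_outs['embeddings'] = pred_embs  # the tracker decides how to use them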
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['CascadeRCNN'] @register class CascadeRCNN(BaseArch): """ Cascade R-CNN network, see https://arxiv.org/abs/1712.00726 Args: backbone (object): backbone instance rpn_head (object): `RPNHead` instance bbox_head (object): `BBoxHead` instance bbox_post_process (object): `BBoxPostProcess` instance neck (object): 'FPN' instance mask_head (object): `MaskHead` instance mask_post_process (object): `MaskPostProcess` instance """ __category__ = 'architecture' __inject__ = [ 'bbox_post_process', 'mask_post_process', ] def __init__(self, backbone, rpn_head, bbox_head, bbox_post_process, neck=None, mask_head=None, mask_post_process=None): super(CascadeRCNN, self).__init__() self.backbone = backbone self.rpn_head = rpn_head self.bbox_head = bbox_head self.bbox_post_process = bbox_post_process self.neck = neck self.mask_head = mask_head self.mask_post_process = mask_post_process self.with_mask = mask_head is not None @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = cfg['neck'] and create(cfg['neck'], **kwargs) out_shape = neck and neck.out_shape or backbone.out_shape kwargs = {'input_shape': out_shape} rpn_head = create(cfg['rpn_head'], **kwargs) bbox_head = create(cfg['bbox_head'], **kwargs) out_shape = neck and out_shape or bbox_head.get_head().out_shape kwargs = {'input_shape': out_shape} mask_head = cfg['mask_head'] and create(cfg['mask_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "rpn_head": rpn_head, "bbox_head": bbox_head, "mask_head": mask_head, } def _forward(self): body_feats = self.backbone(self.inputs) if self.neck is not None: body_feats = self.neck(body_feats) if self.training: rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs) bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num, self.inputs) rois, rois_num = self.bbox_head.get_assigned_rois() bbox_targets = self.bbox_head.get_assigned_targets() if self.with_mask: mask_loss = self.mask_head(body_feats, rois, rois_num, self.inputs, bbox_targets, bbox_feat) return rpn_loss, bbox_loss, mask_loss else: return rpn_loss, bbox_loss, {} else: rois, rois_num, _ = self.rpn_head(body_feats, self.inputs) preds, _ = self.bbox_head(body_feats, rois, rois_num, self.inputs) refined_rois = self.bbox_head.get_refined_rois() im_shape = self.inputs['im_shape'] scale_factor = self.inputs['scale_factor'] bbox, bbox_num, nms_keep_idx = self.bbox_post_process( preds, (refined_rois, rois_num), im_shape, scale_factor) # rescale the prediction back to origin image bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred( bbox, bbox_num, im_shape, scale_factor) if not self.with_mask: return bbox_pred, bbox_num, None mask_out = self.mask_head(body_feats, bbox, bbox_num, self.inputs) origin_shape = self.bbox_post_process.get_origin_shape() mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num, origin_shape) return bbox_pred, bbox_num, mask_pred def get_loss(self, ): rpn_loss, bbox_loss, mask_loss = self._forward() loss = {} loss.update(rpn_loss) loss.update(bbox_loss) if self.with_mask: loss.update(mask_loss) total_loss = paddle.add_n(list(loss.values())) loss.update({'loss': total_loss}) return loss def get_pred(self): bbox_pred, bbox_num, mask_pred = self._forward() output = { 'bbox': bbox_pred, 
'bbox_num': bbox_num, } if self.with_mask: output.update({'mask': mask_pred}) return output ================================================ FILE: ppdet/modeling/architectures/centernet.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['CenterNet'] @register class CenterNet(BaseArch): """ CenterNet network, see http://arxiv.org/abs/1904.07850 Args: backbone (object): backbone instance neck (object): FPN instance, default use 'CenterNetDLAFPN' head (object): 'CenterNetHead' instance post_process (object): 'CenterNetPostProcess' instance for_mot (bool): whether return other features used in tracking model """ __category__ = 'architecture' __inject__ = ['post_process'] __shared__ = ['for_mot'] def __init__(self, backbone, neck='CenterNetDLAFPN', head='CenterNetHead', post_process='CenterNetPostProcess', for_mot=False): super(CenterNet, self).__init__() self.backbone = backbone self.neck = neck self.head = head self.post_process = post_process self.for_mot = for_mot @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = cfg['neck'] and create(cfg['neck'], **kwargs) out_shape = neck and neck.out_shape or backbone.out_shape kwargs = {'input_shape': out_shape} head = create(cfg['head'], **kwargs) return {'backbone': backbone, 'neck': neck, "head": head} def _forward(self): neck_feat = self.backbone(self.inputs) if self.neck is not None: neck_feat = self.neck(neck_feat) head_out = self.head(neck_feat, self.inputs) if self.for_mot: head_out.update({'neck_feat': neck_feat}) elif self.training: head_out['loss'] = head_out.pop('det_loss') return head_out def get_pred(self): head_out = self._forward() bbox, bbox_num, bbox_inds, topk_clses, topk_ys, topk_xs = self.post_process( head_out['heatmap'], head_out['size'], head_out['offset'], im_shape=self.inputs['im_shape'], scale_factor=self.inputs['scale_factor']) if self.for_mot: output = { "bbox": bbox, "bbox_num": bbox_num, "bbox_inds": bbox_inds, "topk_clses": topk_clses, "topk_ys": topk_ys, "topk_xs": topk_xs, "neck_feat": head_out['neck_feat'] } else: output = {"bbox": bbox, "bbox_num": bbox_num} return output def get_loss(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/centertrack.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import copy import math import numpy as np import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch from ..keypoint_utils import affine_transform from ppdet.data.transform.op_helper import gaussian_radius, gaussian2D, draw_umich_gaussian __all__ = ['CenterTrack'] @register class CenterTrack(BaseArch): """ CenterTrack network, see http://arxiv.org/abs/2004.01177 Args: detector (object): 'CenterNet' instance plugin_head (object): 'CenterTrackHead' instance tracker (object): 'CenterTracker' instance """ __category__ = 'architecture' __shared__ = ['mot_metric'] def __init__(self, detector='CenterNet', plugin_head='CenterTrackHead', tracker='CenterTracker', mot_metric=False): super(CenterTrack, self).__init__() self.detector = detector self.plugin_head = plugin_head self.tracker = tracker self.mot_metric = mot_metric self.pre_image = None self.deploy = False @classmethod def from_config(cls, cfg, *args, **kwargs): detector = create(cfg['detector']) detector_out_shape = detector.neck and detector.neck.out_shape or detector.backbone.out_shape kwargs = {'input_shape': detector_out_shape} plugin_head = create(cfg['plugin_head'], **kwargs) tracker = create(cfg['tracker']) return { 'detector': detector, 'plugin_head': plugin_head, 'tracker': tracker, } def _forward(self): if self.training: det_outs = self.detector(self.inputs) neck_feat = det_outs['neck_feat'] losses = {} for k, v in det_outs.items(): if 'loss' not in k: continue losses.update({k: v}) plugin_outs = self.plugin_head(neck_feat, self.inputs) for k, v in plugin_outs.items(): if 'loss' not in k: continue losses.update({k: v}) losses['loss'] = det_outs['det_loss'] + plugin_outs['plugin_loss'] return losses else: if not self.mot_metric: # detection, support bs>=1 det_outs = self.detector(self.inputs) return { 'bbox': det_outs['bbox'], 'bbox_num': det_outs['bbox_num'] } else: # MOT, only support bs=1 if not self.deploy: if self.pre_image is None: self.pre_image = self.inputs['image'] # initializing tracker for the first frame self.tracker.init_track([]) self.inputs['pre_image'] = self.pre_image self.pre_image = self.inputs[ 'image'] # Note: update for next image # render input heatmap from tracker status pre_hm = self.get_additional_inputs( self.tracker.tracks, self.inputs, with_hm=True) self.inputs['pre_hm'] = paddle.to_tensor(pre_hm) # model inference det_outs = self.detector(self.inputs) neck_feat = det_outs['neck_feat'] result = self.plugin_head( neck_feat, self.inputs, det_outs['bbox'], det_outs['bbox_inds'], det_outs['topk_clses'], det_outs['topk_ys'], det_outs['topk_xs']) if not self.deploy: # convert the cropped and 4x downsampled output coordinate system # back to the input image coordinate system result = self.plugin_head.centertrack_post_process( result, self.inputs, self.tracker.out_thresh) return result def get_pred(self): return self._forward() def get_loss(self): return self._forward() def reset_tracking(self): self.tracker.reset() self.pre_image = None def 
get_additional_inputs(self, dets, meta, with_hm=True): # Render input heatmap from previous trackings. trans_input = meta['trans_input'][0].numpy() inp_width, inp_height = int(meta['inp_width'][0]), int(meta[ 'inp_height'][0]) input_hm = np.zeros((1, inp_height, inp_width), dtype=np.float32) for det in dets: if det['score'] < self.tracker.pre_thresh: continue bbox = affine_transform_bbox(det['bbox'], trans_input, inp_width, inp_height) h, w = bbox[3] - bbox[1], bbox[2] - bbox[0] if (h > 0 and w > 0): radius = gaussian_radius( (math.ceil(h), math.ceil(w)), min_overlap=0.7) radius = max(0, int(radius)) ct = np.array( [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32) ct_int = ct.astype(np.int32) if with_hm: input_hm[0] = draw_umich_gaussian(input_hm[0], ct_int, radius) if with_hm: input_hm = input_hm[np.newaxis] return input_hm def affine_transform_bbox(bbox, trans, width, height): bbox = np.array(copy.deepcopy(bbox), dtype=np.float32) bbox[:2] = affine_transform(bbox[:2], trans) bbox[2:] = affine_transform(bbox[2:], trans) bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, width - 1) bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, height - 1) return bbox ================================================ FILE: ppdet/modeling/architectures/clrnet.py ================================================ from .meta_arch import BaseArch from ppdet.core.workspace import register, create from paddle import in_dynamic_mode __all__ = ['CLRNet'] @register class CLRNet(BaseArch): __category__ = 'architecture' def __init__(self, backbone="CLRResNet", neck="CLRFPN", clr_head="CLRHead", post_process=None): super(CLRNet, self).__init__() self.backbone = backbone self.neck = neck self.heads = clr_head self.post_process = post_process @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) # fpn kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) # head kwargs = {'input_shape': neck.out_shape} clr_head = create(cfg['clr_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, 'clr_head': clr_head, } def _forward(self): # Backbone body_feats = self.backbone(self.inputs['image']) # neck neck_feats = self.neck(body_feats) # CRL Head if self.training: output = self.heads(neck_feats, self.inputs) else: output = self.heads(neck_feats) output = {'lanes': output} # TODO: hard code fix as_lanes=False problem in clrnet_head.py "get_lanes" function for static mode if in_dynamic_mode(): output = self.heads.get_lanes(output['lanes']) output = { "lanes": output, "img_path": self.inputs['full_img_path'], "img_name": self.inputs['img_name'] } return output def get_loss(self): return self._forward() def get_pred(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/deepsort.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
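get_additional_inputs above renders one gaussian per confident previous-frame track into a single-channel "previous heatmap". A numpy-only sketch of that rendering; the radius handling here is a simplified stand-in for ppdet's gaussian_radius/draw_umich_gaussian pair:

import numpy as np

def render_pre_hm(centers, radii, height, width):
    # One channel, one gaussian splat per track center (integer coords).
    hm = np.zeros((1, height, width), dtype=np.float32)
    for (cx, cy), r in zip(centers, radii):
        y, x = np.ogrid[-r:r + 1, -r:r + 1]
        g = np.exp(-(x * x + y * y) / (2 * (r / 3.0) ** 2 + 1e-6))
        x0, x1 = max(0, cx - r), min(width, cx + r + 1)
        y0, y1 = max(0, cy - r), min(height, cy + r + 1)
        patch = g[(y0 - cy + r):(y1 - cy + r), (x0 - cx + r):(x1 - cx + r)]
        hm[0, y0:y1, x0:x1] = np.maximum(hm[0, y0:y1, x0:x1], patch)
    return hm[np.newaxis]  # add the batch dim, as the method above does

pre_hm = render_pre_hm([(64, 48)], [6], 96, 128)
assert pre_hm.shape == (1, 1, 96, 128)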
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch from ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box __all__ = ['DeepSORT'] @register class DeepSORT(BaseArch): """ DeepSORT network, see https://arxiv.org/abs/1703.07402 Args: detector (object): detector model instance reid (object): reid model instance tracker (object): tracker instance """ __category__ = 'architecture' def __init__(self, detector='YOLOv3', reid='PCBPyramid', tracker='DeepSORTTracker'): super(DeepSORT, self).__init__() self.detector = detector self.reid = reid self.tracker = tracker @classmethod def from_config(cls, cfg, *args, **kwargs): if cfg['detector'] != 'None': detector = create(cfg['detector']) else: detector = None reid = create(cfg['reid']) tracker = create(cfg['tracker']) return { "detector": detector, "reid": reid, "tracker": tracker, } def _forward(self): crops = self.inputs['crops'] outs = {} outs['embeddings'] = self.reid(crops) return outs def get_pred(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/detr.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
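DeepSORT._forward above is deliberately thin: the architecture only embeds the detection crops, and association happens inside the tracker using distances between new embeddings and per-track galleries. A numpy sketch of the usual cosine cost matrix (illustrative, not ppdet's tracker code):

import numpy as np

def cosine_cost(track_embs, det_embs):
    # Rows index tracks, columns index detections; lower cost = better match.
    a = track_embs / np.linalg.norm(track_embs, axis=1, keepdims=True)
    b = det_embs / np.linalg.norm(det_embs, axis=1, keepdims=True)
    return 1.0 - a @ b.T

tracks = np.random.rand(3, 128)
dets = np.random.rand(5, 128)
assert cosine_cost(tracks, dets).shape == (3, 5)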
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from .meta_arch import BaseArch
from ppdet.core.workspace import register, create

__all__ = ['DETR']
# Deformable DETR, DINO use the same architecture as DETR


@register
class DETR(BaseArch):
    __category__ = 'architecture'
    __inject__ = ['post_process', 'post_process_semi']
    __shared__ = ['with_mask', 'exclude_post_process']

    def __init__(self,
                 backbone,
                 transformer='DETRTransformer',
                 detr_head='DETRHead',
                 neck=None,
                 post_process='DETRPostProcess',
                 post_process_semi=None,
                 with_mask=False,
                 exclude_post_process=False):
        super(DETR, self).__init__()
        self.backbone = backbone
        self.transformer = transformer
        self.detr_head = detr_head
        self.neck = neck
        self.post_process = post_process
        self.with_mask = with_mask
        self.exclude_post_process = exclude_post_process
        self.post_process_semi = post_process_semi

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # backbone
        backbone = create(cfg['backbone'])
        # neck
        kwargs = {'input_shape': backbone.out_shape}
        neck = create(cfg['neck'], **kwargs) if cfg['neck'] else None
        # transformer
        if neck is not None:
            kwargs = {'input_shape': neck.out_shape}
        transformer = create(cfg['transformer'], **kwargs)
        # head
        kwargs = {
            'hidden_dim': transformer.hidden_dim,
            'nhead': transformer.nhead,
            'input_shape': backbone.out_shape
        }
        detr_head = create(cfg['detr_head'], **kwargs)

        return {
            'backbone': backbone,
            'transformer': transformer,
            "detr_head": detr_head,
            "neck": neck
        }

    def _forward(self):
        # Backbone
        body_feats = self.backbone(self.inputs)

        # Neck
        if self.neck is not None:
            body_feats = self.neck(body_feats)

        # Transformer
        pad_mask = self.inputs.get('pad_mask', None)
        out_transformer = self.transformer(body_feats, pad_mask, self.inputs)

        # DETR Head
        if self.training:
            detr_losses = self.detr_head(out_transformer, body_feats,
                                         self.inputs)
            detr_losses.update({
                'loss': paddle.add_n(
                    [v for k, v in detr_losses.items() if 'log' not in k])
            })
            return detr_losses
        else:
            preds = self.detr_head(out_transformer, body_feats)
            if self.exclude_post_process:
                bbox, bbox_num, mask = preds
            else:
                bbox, bbox_num, mask = self.post_process(
                    preds, self.inputs['im_shape'],
                    self.inputs['scale_factor'],
                    self.inputs['image'].shape[2:])

            output = {'bbox': bbox, 'bbox_num': bbox_num}
            if self.with_mask:
                output['mask'] = mask
            return output

    def get_loss(self):
        return self._forward()

    def get_pred(self):
        return self._forward()


================================================
FILE: ppdet/modeling/architectures/detr_ssod.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
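DETR.get_loss above relies on a small idiom that recurs in DETR_SSOD below: every entry of the head's loss dict is summed into a single 'loss' scalar, except diagnostic entries whose keys contain 'log'. With plain floats standing in for paddle tensors:

detr_losses = {'loss_class': 0.7, 'loss_bbox': 1.2, 'loss_giou': 0.5,
               'log_match_ratio': 0.93}   # diagnostic, excluded from the sum

detr_losses['loss'] = sum(
    v for k, v in detr_losses.items() if 'log' not in k)
assert abs(detr_losses['loss'] - 2.4) < 1e-9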
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import paddle
import paddle.nn.functional as F

from ppdet.core.workspace import register, create, merge_config
from ppdet.utils.logger import setup_logger
from ppdet.modeling.ssod.utils import filter_invalid
from .multi_stream_detector import MultiSteamDetector

logger = setup_logger(__name__)

__all__ = ['DETR_SSOD']
__shared__ = ['num_classes']


@register
class DETR_SSOD(MultiSteamDetector):
    def __init__(self,
                 teacher,
                 student,
                 train_cfg=None,
                 test_cfg=None,
                 RTDETRTransformer=None,
                 num_classes=80):
        super(DETR_SSOD, self).__init__(
            dict(
                teacher=teacher, student=student),
            train_cfg=train_cfg,
            test_cfg=test_cfg, )
        self.ema_start_iters = train_cfg['ema_start_iters']
        self.momentum = 0.9996
        self.cls_thr = None
        self.cls_thr_ig = None
        self.num_classes = num_classes
        if train_cfg is not None:
            self.freeze("teacher")
            self.unsup_weight = self.train_cfg['unsup_weight']
            self.sup_weight = self.train_cfg['sup_weight']
        self._teacher = None
        self._student = None
        self._transformer = None

    @classmethod
    def from_config(cls, cfg):
        teacher = create(cfg['teacher'])
        merge_config(cfg)
        student = create(cfg['student'])
        train_cfg = cfg['train_cfg']
        test_cfg = cfg['test_cfg']
        RTDETRTransformer = cfg['RTDETRTransformer']
        return {
            'teacher': teacher,
            'student': student,
            'train_cfg': train_cfg,
            'test_cfg': test_cfg,
            'RTDETRTransformer': RTDETRTransformer
        }

    def forward_train(self, inputs, **kwargs):
        if isinstance(inputs, dict):
            iter_id = inputs['iter_id']
        elif isinstance(inputs, list):
            iter_id = inputs[-1]
        if iter_id == self.ema_start_iters:
            self.update_ema_model(momentum=0)
        elif iter_id > self.ema_start_iters:
            self.update_ema_model(momentum=self.momentum)
        if iter_id > self.ema_start_iters:
            data_sup_w, data_sup_s, data_unsup_w, data_unsup_s, _ = inputs
            if data_sup_w['image'].shape != data_sup_s['image'].shape:
                data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w,
                                                                 data_sup_s)
            if 'gt_bbox' in data_unsup_s.keys():
                del data_unsup_s['gt_bbox']
            if 'gt_class' in data_unsup_s.keys():
                del data_unsup_s['gt_class']
            if 'gt_class' in data_unsup_w.keys():
                del data_unsup_w['gt_class']
            if 'gt_bbox' in data_unsup_w.keys():
                del data_unsup_w['gt_bbox']
            for k, v in data_sup_s.items():
                if k in ['epoch_id']:
                    continue
                elif k in ['gt_class', 'gt_bbox', 'is_crowd']:
                    data_sup_s[k].extend(data_sup_w[k])
                else:
                    data_sup_s[k] = paddle.concat([v, data_sup_w[k]])
            loss = {}
            body_feats = self.student.backbone(data_sup_s)
            if self.student.neck is not None:
                body_feats = self.student.neck(body_feats)
            out_transformer = self.student.transformer(body_feats, None,
                                                       data_sup_s)
            sup_loss = self.student.detr_head(out_transformer, body_feats,
                                              data_sup_s)
            sup_loss.update({
                'loss': paddle.add_n(
                    [v for k, v in sup_loss.items() if 'log' not in k])
            })
            sup_loss = {"sup_" + k: v for k, v in sup_loss.items()}
            loss.update(**sup_loss)
            unsup_loss = self.foward_unsup_train(data_unsup_w, data_unsup_s)
            unsup_loss.update({
                'loss': paddle.add_n(
                    [v for k, v in unsup_loss.items() if 'log' not in k])
            })
            unsup_loss = {"unsup_" + k: v for k, v in unsup_loss.items()}
            loss.update(**unsup_loss)
            loss.update({'loss': loss['sup_loss'] + loss['unsup_loss']})
        else:
            if iter_id == self.ema_start_iters:
                logger.info("start semi_supervised_training")
            data_sup_w, data_sup_s, data_unsup_w, data_unsup_s, _ = inputs
if data_sup_w['image'].shape != data_sup_s['image'].shape: data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w, data_sup_s) for k, v in data_sup_s.items(): if k in ['epoch_id']: continue elif k in ['gt_class', 'gt_bbox', 'is_crowd']: data_sup_s[k].extend(data_sup_w[k]) else: data_sup_s[k] = paddle.concat([v, data_sup_w[k]]) loss = {} sup_loss = self.student(data_sup_s) unsup_loss = { "unsup_" + k: v * paddle.to_tensor(0) for k, v in sup_loss.items() } sup_loss = {"sup_" + k: v for k, v in sup_loss.items()} loss.update(**sup_loss) unsup_loss.update({ 'loss': paddle.add_n( [v * 0 for k, v in sup_loss.items() if 'log' not in k]) }) unsup_loss = {"unsup_" + k: v * 0 for k, v in unsup_loss.items()} loss.update(**unsup_loss) loss.update({'loss': loss['sup_loss']}) return loss def foward_unsup_train(self, data_unsup_w, data_unsup_s): with paddle.no_grad(): body_feats = self.teacher.backbone(data_unsup_w) if self.teacher.neck is not None: body_feats = self.teacher.neck(body_feats, is_teacher=True) out_transformer = self.teacher.transformer( body_feats, None, data_unsup_w, is_teacher=True) preds = self.teacher.detr_head(out_transformer, body_feats) bbox, bbox_num = self.teacher.post_process_semi(preds) self.place = body_feats[0].place proposal_bbox_list = bbox[:, -4:] proposal_bbox_list = proposal_bbox_list.split( tuple(np.array(bbox_num)), 0) proposal_label_list = paddle.cast(bbox[:, :1], np.float32) proposal_label_list = proposal_label_list.split( tuple(np.array(bbox_num)), 0) proposal_score_list = paddle.cast(bbox[:, 1:self.num_classes + 1], np.float32) proposal_score_list = proposal_score_list.split( tuple(np.array(bbox_num)), 0) proposal_bbox_list = [ paddle.to_tensor( p, place=self.place) for p in proposal_bbox_list ] proposal_label_list = [ paddle.to_tensor( p, place=self.place) for p in proposal_label_list ] # filter invalid box roughly if isinstance(self.train_cfg['pseudo_label_initial_score_thr'], float): thr = self.train_cfg['pseudo_label_initial_score_thr'] else: # TODO: use dynamic threshold raise NotImplementedError( "Dynamic Threshold is not implemented yet.") proposal_bbox_list, proposal_label_list, proposal_score_list = list( zip(* [ filter_invalid( proposal[:, :4], proposal_label, proposal_score, thr=thr, min_size=self.train_cfg['min_pseduo_box_size'], ) for proposal, proposal_label, proposal_score in zip(proposal_bbox_list, proposal_label_list, proposal_score_list) ])) teacher_bboxes = list(proposal_bbox_list) teacher_labels = proposal_label_list teacher_info = [teacher_bboxes, teacher_labels] student_unsup = data_unsup_s return self.compute_pseudo_label_loss(student_unsup, teacher_info, proposal_score_list) def compute_pseudo_label_loss(self, student_unsup, teacher_info, proposal_score_list): pseudo_bboxes = list(teacher_info[0]) pseudo_labels = list(teacher_info[1]) losses = dict() for i in range(len(pseudo_bboxes)): if pseudo_labels[i].shape[0] == 0: pseudo_bboxes[i] = paddle.zeros([0, 4]).numpy() pseudo_labels[i] = paddle.zeros([0, 1]).numpy() else: pseudo_bboxes[i] = pseudo_bboxes[i][:, :4].numpy() pseudo_labels[i] = pseudo_labels[i].numpy() for i in range(len(pseudo_bboxes)): pseudo_labels[i] = paddle.to_tensor( pseudo_labels[i], dtype=paddle.int32, place=self.place) pseudo_bboxes[i] = paddle.to_tensor( pseudo_bboxes[i], dtype=paddle.float32, place=self.place) student_unsup.update({ 'gt_bbox': pseudo_bboxes, 'gt_class': pseudo_labels }) pseudo_sum = 0 for i in range(len(pseudo_bboxes)): pseudo_sum += pseudo_bboxes[i].sum() if pseudo_sum == 0: #input fake data when 
there are no pseudo labels pseudo_bboxes[0] = paddle.ones([1, 4]) - 0.5 pseudo_labels[0] = paddle.ones([1, 1]).astype('int32') student_unsup.update({ 'gt_bbox': pseudo_bboxes, 'gt_class': pseudo_labels }) body_feats = self.student.backbone(student_unsup) if self.student.neck is not None: body_feats = self.student.neck(body_feats) out_transformer = self.student.transformer(body_feats, None, student_unsup) losses = self.student.detr_head(out_transformer, body_feats, student_unsup) for n, v in losses.items(): losses[n] = v * 0 else: gt_bbox = [] gt_class = [] images = [] proposal_score = [] for i in range(len(pseudo_bboxes)): if pseudo_labels[i].shape[0] == 0: continue else: proposal_score.append(proposal_score_list[i].max(-1) .unsqueeze(-1)) gt_class.append(pseudo_labels[i]) gt_bbox.append(pseudo_bboxes[i]) images.append(student_unsup['image'][i]) images = paddle.stack(images) student_unsup.update({ 'image': images, 'gt_bbox': gt_bbox, 'gt_class': gt_class }) body_feats = self.student.backbone(student_unsup) if self.student.neck is not None: body_feats = self.student.neck(body_feats) out_transformer = self.student.transformer(body_feats, None, student_unsup) student_unsup.update({'gt_score': proposal_score}) losses = self.student.detr_head(out_transformer, body_feats, student_unsup) return losses def box_cxcywh_to_xyxy(x): x_c, y_c, w, h = x.unbind(-1) b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] return paddle.stack(b, axis=-1) def box_xyxy_to_cxcywh(x): x0, y0, x1, y1 = x.unbind(-1) b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] return paddle.stack(b, axis=-1) def get_size_with_aspect_ratio(image_size, size, max_size=None): w, h = image_size if max_size is not None: min_original_size = float(min((w, h))) max_original_size = float(max((w, h))) if max_original_size / min_original_size * size > max_size: size = int(round(max_size * min_original_size / max_original_size)) if (w <= h and w == size) or (h <= w and h == size): return (w, h) if w < h: ow = size oh = int(size * h / w) else: oh = size ow = int(size * w / h) return (ow, oh) def align_weak_strong_shape(data_weak, data_strong): shape_x = data_strong['image'].shape[2] shape_y = data_strong['image'].shape[3] target_size = [shape_x, shape_y] data_weak['image'] = F.interpolate( data_weak['image'], size=target_size, mode='bilinear', align_corners=False) return data_weak, data_strong ================================================ FILE: ppdet/modeling/architectures/fairmot.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
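The two box converters above are inverses of each other; a quick round-trip check with plain tuples in place of paddle tensors:

def cxcywh_to_xyxy(b):
    cx, cy, w, h = b
    return (cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2)

def xyxy_to_cxcywh(b):
    x0, y0, x1, y1 = b
    return ((x0 + x1) / 2, (y0 + y1) / 2, x1 - x0, y1 - y0)

box = (50.0, 40.0, 20.0, 10.0)           # cx, cy, w, h
assert xyxy_to_cxcywh(cxcywh_to_xyxy(box)) == box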
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['FairMOT'] @register class FairMOT(BaseArch): """ FairMOT network, see http://arxiv.org/abs/2004.01888 Args: detector (object): 'CenterNet' instance reid (object): 'FairMOTEmbeddingHead' instance tracker (object): 'JDETracker' instance loss (object): 'FairMOTLoss' instance """ __category__ = 'architecture' __inject__ = ['loss'] def __init__(self, detector='CenterNet', reid='FairMOTEmbeddingHead', tracker='JDETracker', loss='FairMOTLoss'): super(FairMOT, self).__init__() self.detector = detector self.reid = reid self.tracker = tracker self.loss = loss @classmethod def from_config(cls, cfg, *args, **kwargs): detector = create(cfg['detector']) detector_out_shape = detector.neck and detector.neck.out_shape or detector.backbone.out_shape kwargs = {'input_shape': detector_out_shape} reid = create(cfg['reid'], **kwargs) loss = create(cfg['loss']) tracker = create(cfg['tracker']) return { 'detector': detector, 'reid': reid, 'loss': loss, 'tracker': tracker } def _forward(self): loss = dict() # det_outs keys: # train: neck_feat, det_loss, heatmap_loss, size_loss, offset_loss (optional: iou_loss) # eval/infer: neck_feat, bbox, bbox_inds det_outs = self.detector(self.inputs) neck_feat = det_outs['neck_feat'] if self.training: reid_loss = self.reid(neck_feat, self.inputs) det_loss = det_outs['det_loss'] loss = self.loss(det_loss, reid_loss) for k, v in det_outs.items(): if 'loss' not in k: continue loss.update({k: v}) loss.update({'reid_loss': reid_loss}) return loss else: pred_dets, pred_embs = self.reid( neck_feat, self.inputs, det_outs['bbox'], det_outs['bbox_inds'], det_outs['topk_clses']) return pred_dets, pred_embs def get_pred(self): output = self._forward() return output def get_loss(self): loss = self._forward() return loss ================================================ FILE: ppdet/modeling/architectures/faster_rcnn.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
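FairMOT._forward above hands det_loss and reid_loss to a FairMOTLoss module. In the FairMOT paper this combination uses learnable uncertainty weighting; a hedged sketch of that form with plain floats (the initial weight values and the exact expression in ppdet's FairMOTLoss may differ):

import math

def uncertainty_weighted_loss(det_loss, reid_loss, w_det=-1.85, w_reid=-1.05):
    # w_det / w_reid would be trainable scalar parameters in the real module.
    return 0.5 * (math.exp(-w_det) * det_loss +
                  math.exp(-w_reid) * reid_loss + w_det + w_reid)

print(round(uncertainty_weighted_loss(1.2, 0.8), 4))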
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch import numpy as np __all__ = ['FasterRCNN'] @register class FasterRCNN(BaseArch): """ Faster R-CNN network, see https://arxiv.org/abs/1506.01497 Args: backbone (object): backbone instance rpn_head (object): `RPNHead` instance bbox_head (object): `BBoxHead` instance bbox_post_process (object): `BBoxPostProcess` instance neck (object): 'FPN' instance """ __category__ = 'architecture' __inject__ = ['bbox_post_process'] def __init__(self, backbone, rpn_head, bbox_head, bbox_post_process, neck=None): super(FasterRCNN, self).__init__() self.backbone = backbone self.neck = neck self.rpn_head = rpn_head self.bbox_head = bbox_head self.bbox_post_process = bbox_post_process def init_cot_head(self, relationship): self.bbox_head.init_cot_head(relationship) @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = cfg['neck'] and create(cfg['neck'], **kwargs) out_shape = neck and neck.out_shape or backbone.out_shape kwargs = {'input_shape': out_shape} rpn_head = create(cfg['rpn_head'], **kwargs) bbox_head = create(cfg['bbox_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "rpn_head": rpn_head, "bbox_head": bbox_head, } def _forward(self): body_feats = self.backbone(self.inputs) if self.neck is not None: body_feats = self.neck(body_feats) if self.training: rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs) bbox_loss, _ = self.bbox_head(body_feats, rois, rois_num, self.inputs) return rpn_loss, bbox_loss else: rois, rois_num, _ = self.rpn_head(body_feats, self.inputs) preds, _ = self.bbox_head(body_feats, rois, rois_num, None) im_shape = self.inputs['im_shape'] scale_factor = self.inputs['scale_factor'] bbox, bbox_num, nms_keep_idx = self.bbox_post_process( preds, (rois, rois_num), im_shape, scale_factor) # rescale the prediction back to origin image bboxes, bbox_pred, bbox_num = self.bbox_post_process.get_pred( bbox, bbox_num, im_shape, scale_factor) if self.use_extra_data: extra_data = { } # record the bbox output before nms, such like scores and nms_keep_idx """extra_data:{ 'scores': predict scores, 'nms_keep_idx': bbox index before nms, } """ extra_data['scores'] = preds[1] # predict scores (probability) # Todo: get logits output extra_data[ 'nms_keep_idx'] = nms_keep_idx # bbox index before nms return bbox_pred, bbox_num, extra_data else: return bbox_pred, bbox_num def get_loss(self, ): rpn_loss, bbox_loss = self._forward() loss = {} loss.update(rpn_loss) loss.update(bbox_loss) total_loss = paddle.add_n(list(loss.values())) loss.update({'loss': total_loss}) return loss def get_pred(self): if self.use_extra_data: bbox_pred, bbox_num, extra_data = self._forward() output = { 'bbox': bbox_pred, 'bbox_num': bbox_num, 'extra_data': extra_data } else: bbox_pred, bbox_num = self._forward() output = {'bbox': bbox_pred, 'bbox_num': bbox_num} return output def target_bbox_forward(self, data): body_feats = self.backbone(data) if self.neck is not None: body_feats = self.neck(body_feats) rois = [roi for roi in data['gt_bbox']] rois_num = paddle.concat([paddle.shape(roi)[0:1] for roi in rois]) preds, _ = self.bbox_head(body_feats, rois, rois_num, None, cot=True) return preds def relationship_learning(self, loader, num_classes_novel): print('computing relationship') train_labels_list = [] label_list = 
[] for step_id, data in enumerate(loader): _, bbox_prob = self.target_bbox_forward(data) batch_size = data['im_id'].shape[0] for i in range(batch_size): num_bbox = data['gt_class'][i].shape[0] train_labels = data['gt_class'][i] train_labels_list.append(train_labels.numpy().squeeze(1)) base_labels = bbox_prob.detach().numpy()[:, :-1] label_list.append(base_labels) labels = np.concatenate(train_labels_list, 0) probabilities = np.concatenate(label_list, 0) N_t = np.max(labels) + 1 conditional = [] for i in range(N_t): this_class = probabilities[labels == i] average = np.mean(this_class, axis=0, keepdims=True) conditional.append(average) return np.concatenate(conditional) ================================================ FILE: ppdet/modeling/architectures/fcos.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['FCOS', 'ARSL_FCOS'] @register class FCOS(BaseArch): """ FCOS network, see https://arxiv.org/abs/1904.01355 Args: backbone (object): backbone instance neck (object): 'FPN' instance fcos_head (object): 'FCOSHead' instance ssod_loss (object): 'SSODFCOSLoss' instance, only used for semi-det(ssod) by DenseTeacher """ __category__ = 'architecture' __inject__ = ['ssod_loss'] def __init__(self, backbone='ResNet', neck='FPN', fcos_head='FCOSHead', ssod_loss='SSODFCOSLoss'): super(FCOS, self).__init__() self.backbone = backbone self.neck = neck self.fcos_head = fcos_head # for ssod, semi-det self.is_teacher = False self.ssod_loss = ssod_loss @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} fcos_head = create(cfg['fcos_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "fcos_head": fcos_head, } def _forward(self): body_feats = self.backbone(self.inputs) fpn_feats = self.neck(body_feats) self.is_teacher = self.inputs.get('is_teacher', False) if self.training or self.is_teacher: losses = self.fcos_head(fpn_feats, self.inputs) return losses else: fcos_head_outs = self.fcos_head(fpn_feats) bbox_pred, bbox_num = self.fcos_head.post_process( fcos_head_outs, self.inputs['scale_factor']) return {'bbox': bbox_pred, 'bbox_num': bbox_num} def get_loss(self): return self._forward() def get_pred(self): return self._forward() def get_loss_keys(self): return ['loss_cls', 'loss_box', 'loss_quality'] def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg): ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs, train_cfg) return ssod_losses @register class ARSL_FCOS(BaseArch): """ FCOS ARSL network, see https://arxiv.org/abs/ Args: backbone (object): backbone instance neck (object): 'FPN' instance 
fcos_head (object): 'FCOSHead_ARSL' instance fcos_cr_loss (object): 'FCOSLossCR' instance, only used for semi-det(ssod) by ARSL """ __category__ = 'architecture' __inject__ = ['fcos_cr_loss'] def __init__(self, backbone, neck, fcos_head='FCOSHead_ARSL', fcos_cr_loss='FCOSLossCR'): super(ARSL_FCOS, self).__init__() self.backbone = backbone self.neck = neck self.fcos_head = fcos_head self.fcos_cr_loss = fcos_cr_loss @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} fcos_head = create(cfg['fcos_head'], **kwargs) # consistency regularization loss fcos_cr_loss = create(cfg['fcos_cr_loss']) return { 'backbone': backbone, 'neck': neck, 'fcos_head': fcos_head, 'fcos_cr_loss': fcos_cr_loss, } def forward(self, inputs, branch="supervised", teacher_prediction=None): assert branch in ['supervised', 'semi_supervised'], \ print('In ARSL, type must be supervised or semi_supervised.') if self.data_format == 'NHWC': image = inputs['image'] inputs['image'] = paddle.transpose(image, [0, 2, 3, 1]) self.inputs = inputs if self.training: if branch == "supervised": out = self.get_loss() else: out = self.get_pseudo_loss(teacher_prediction) else: # norm test if branch == "supervised": out = self.get_pred() # predict pseudo labels else: out = self.get_pseudo_pred() return out # model forward def model_forward(self): body_feats = self.backbone(self.inputs) fpn_feats = self.neck(body_feats) fcos_head_outs = self.fcos_head(fpn_feats) return fcos_head_outs # supervised loss for labeled data def get_loss(self): loss = {} tag_labels, tag_bboxes, tag_centerness = [], [], [] for i in range(len(self.fcos_head.fpn_stride)): # labels, reg_target, centerness k_lbl = 'labels{}'.format(i) if k_lbl in self.inputs: tag_labels.append(self.inputs[k_lbl]) k_box = 'reg_target{}'.format(i) if k_box in self.inputs: tag_bboxes.append(self.inputs[k_box]) k_ctn = 'centerness{}'.format(i) if k_ctn in self.inputs: tag_centerness.append(self.inputs[k_ctn]) fcos_head_outs = self.model_forward() loss_fcos = self.fcos_head.get_loss(fcos_head_outs, tag_labels, tag_bboxes, tag_centerness) loss.update(loss_fcos) return loss # unsupervised loss for unlabeled data def get_pseudo_loss(self, teacher_prediction): loss = {} fcos_head_outs = self.model_forward() unsup_loss = self.fcos_cr_loss(fcos_head_outs, teacher_prediction) for k in unsup_loss.keys(): loss[k + '_pseudo'] = unsup_loss[k] return loss # get detection results for test, decode and rescale the results to original size def get_pred(self): fcos_head_outs = self.model_forward() scale_factor = self.inputs['scale_factor'] bbox_pred, bbox_num = self.fcos_head.post_process(fcos_head_outs, scale_factor) output = {'bbox': bbox_pred, 'bbox_num': bbox_num} return output # generate pseudo labels to guide student def get_pseudo_pred(self): fcos_head_outs = self.model_forward() pred_cls, pred_loc, pred_iou = fcos_head_outs[1:] # 0 is locations for lvl, _ in enumerate(pred_loc): pred_loc[lvl] = pred_loc[lvl] / self.fcos_head.fpn_stride[lvl] return [pred_cls, pred_loc, pred_iou, self.fcos_head.fpn_stride] ================================================ FILE: ppdet/modeling/architectures/gfl.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['GFL'] @register class GFL(BaseArch): """ Generalized Focal Loss network, see https://arxiv.org/abs/2006.04388 Args: backbone (object): backbone instance neck (object): 'FPN' instance head (object): 'GFLHead' instance """ __category__ = 'architecture' def __init__(self, backbone, neck, head='GFLHead'): super(GFL, self).__init__() self.backbone = backbone self.neck = neck self.head = head @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} head = create(cfg['head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "head": head, } def _forward(self): body_feats = self.backbone(self.inputs) fpn_feats = self.neck(body_feats) head_outs = self.head(fpn_feats) if not self.training: im_shape = self.inputs['im_shape'] scale_factor = self.inputs['scale_factor'] bboxes, bbox_num = self.head.post_process(head_outs, im_shape, scale_factor) return bboxes, bbox_num else: return head_outs def get_loss(self, ): loss = {} head_outs = self._forward() loss_gfl = self.head.get_loss(head_outs, self.inputs) loss.update(loss_gfl) total_loss = paddle.add_n(list(loss.values())) loss.update({'loss': total_loss}) return loss def get_pred(self): bbox_pred, bbox_num = self._forward() output = {'bbox': bbox_pred, 'bbox_num': bbox_num} return output ================================================ FILE: ppdet/modeling/architectures/jde.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['JDE'] @register class JDE(BaseArch): __category__ = 'architecture' __shared__ = ['metric'] """ JDE network, see https://arxiv.org/abs/1909.12605v1 Args: detector (object): detector model instance reid (object): reid model instance tracker (object): tracker instance metric (str): 'MOTDet' for training and detection evaluation, 'ReID' for ReID embedding evaluation, or 'MOT' for multi object tracking evaluation. 
""" def __init__(self, detector='YOLOv3', reid='JDEEmbeddingHead', tracker='JDETracker', metric='MOT'): super(JDE, self).__init__() self.detector = detector self.reid = reid self.tracker = tracker self.metric = metric @classmethod def from_config(cls, cfg, *args, **kwargs): detector = create(cfg['detector']) kwargs = {'input_shape': detector.neck.out_shape} reid = create(cfg['reid'], **kwargs) tracker = create(cfg['tracker']) return { "detector": detector, "reid": reid, "tracker": tracker, } def _forward(self): det_outs = self.detector(self.inputs) if self.training: emb_feats = det_outs['emb_feats'] loss_confs = det_outs['det_losses']['loss_confs'] loss_boxes = det_outs['det_losses']['loss_boxes'] jde_losses = self.reid( emb_feats, self.inputs, loss_confs=loss_confs, loss_boxes=loss_boxes) return jde_losses else: if self.metric == 'MOTDet': det_results = { 'bbox': det_outs['bbox'], 'bbox_num': det_outs['bbox_num'], } return det_results elif self.metric == 'MOT': emb_feats = det_outs['emb_feats'] bboxes = det_outs['bbox'] boxes_idx = det_outs['boxes_idx'] nms_keep_idx = det_outs['nms_keep_idx'] pred_dets, pred_embs = self.reid( emb_feats, self.inputs, bboxes=bboxes, boxes_idx=boxes_idx, nms_keep_idx=nms_keep_idx) return pred_dets, pred_embs else: raise ValueError("Unknown metric {} for multi object tracking.". format(self.metric)) def get_loss(self): return self._forward() def get_pred(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/keypoint_hrhrnet.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from scipy.optimize import linear_sum_assignment from collections import abc, defaultdict import numpy as np import paddle from ppdet.core.workspace import register, create, serializable from .meta_arch import BaseArch from .. 
import layers as L from ..keypoint_utils import transpred __all__ = ['HigherHRNet'] @register class HigherHRNet(BaseArch): __category__ = 'architecture' def __init__(self, backbone='HRNet', hrhrnet_head='HrHRNetHead', post_process='HrHRNetPostProcess', eval_flip=True, flip_perm=None, max_num_people=30): """ HigherHRNet network, see https://arxiv.org/abs/1908.10357; HigherHRNet+swahr, see https://arxiv.org/abs/2012.15175 Args: backbone (nn.Layer): backbone instance hrhrnet_head (nn.Layer): keypoint_head instance bbox_post_process (object): `BBoxPostProcess` instance """ super(HigherHRNet, self).__init__() self.backbone = backbone self.hrhrnet_head = hrhrnet_head self.post_process = post_process self.flip = eval_flip self.flip_perm = paddle.to_tensor(flip_perm) self.deploy = False self.interpolate = L.Upsample(2, mode='bilinear') self.pool = L.MaxPool(5, 1, 2) self.max_num_people = max_num_people @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) # head kwargs = {'input_shape': backbone.out_shape} hrhrnet_head = create(cfg['hrhrnet_head'], **kwargs) post_process = create(cfg['post_process']) return { 'backbone': backbone, "hrhrnet_head": hrhrnet_head, "post_process": post_process, } def _forward(self): if self.flip and not self.training and not self.deploy: self.inputs['image'] = paddle.concat( (self.inputs['image'], paddle.flip(self.inputs['image'], [3]))) body_feats = self.backbone(self.inputs) if self.training: return self.hrhrnet_head(body_feats, self.inputs) else: outputs = self.hrhrnet_head(body_feats) if self.flip and not self.deploy: outputs = [paddle.split(o, 2) for o in outputs] output_rflip = [ paddle.flip(paddle.gather(o[1], self.flip_perm, 1), [3]) for o in outputs ] output1 = [o[0] for o in outputs] heatmap = (output1[0] + output_rflip[0]) / 2. 
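# Flip-test ensembling: the input batch was doubled with a horizontally
# flipped copy above, so every head output is split back into an
# (original, flipped) pair. Heatmaps from the two views are averaged,
# while the two tagmaps are kept as separate embedding channels for the
# grouping stage.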
tagmaps = [output1[1], output_rflip[1]] outputs = [heatmap] + tagmaps outputs = self.get_topk(outputs) if self.deploy: return outputs res_lst = [] h = self.inputs['im_shape'][0, 0].numpy().item() w = self.inputs['im_shape'][0, 1].numpy().item() kpts, scores = self.post_process(*outputs, h, w) res_lst.append([kpts, scores]) return res_lst def get_loss(self): return self._forward() def get_pred(self): outputs = {} res_lst = self._forward() outputs['keypoint'] = res_lst return outputs def get_topk(self, outputs): # resize to image size outputs = [self.interpolate(x) for x in outputs] if len(outputs) == 3: tagmap = paddle.concat( (outputs[1].unsqueeze(4), outputs[2].unsqueeze(4)), axis=4) else: tagmap = outputs[1].unsqueeze(4) heatmap = outputs[0] N, J = 1, self.hrhrnet_head.num_joints heatmap_maxpool = self.pool(heatmap) # topk maxmap = heatmap * (heatmap == heatmap_maxpool) maxmap = maxmap.reshape([N, J, -1]) heat_k, inds_k = maxmap.topk(self.max_num_people, axis=2) outputs = [heatmap, tagmap, heat_k, inds_k] return outputs @register @serializable class HrHRNetPostProcess(object): ''' HrHRNet postprocess contain: 1) get topk keypoints in the output heatmap 2) sample the tagmap's value corresponding to each of the topk coordinate 3) match different joints to combine to some people with Hungary algorithm 4) adjust the coordinate by +-0.25 to decrease error std 5) salvage missing joints by check positivity of heatmap - tagdiff_norm Args: max_num_people (int): max number of people support in postprocess heat_thresh (float): value of topk below this threshhold will be ignored tag_thresh (float): coord's value sampled in tagmap below this threshold belong to same people for init inputs(list[heatmap]): the output list of model, [heatmap, heatmap_maxpool, tagmap], heatmap_maxpool used to get topk original_height, original_width (float): the original image size ''' def __init__(self, max_num_people=30, heat_thresh=0.1, tag_thresh=1.): self.max_num_people = max_num_people self.heat_thresh = heat_thresh self.tag_thresh = tag_thresh def lerp(self, j, y, x, heatmap): H, W = heatmap.shape[-2:] left = np.clip(x - 1, 0, W - 1) right = np.clip(x + 1, 0, W - 1) up = np.clip(y - 1, 0, H - 1) down = np.clip(y + 1, 0, H - 1) offset_y = np.where(heatmap[j, down, x] > heatmap[j, up, x], 0.25, -0.25) offset_x = np.where(heatmap[j, y, right] > heatmap[j, y, left], 0.25, -0.25) return offset_y + 0.5, offset_x + 0.5 def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, original_width): N, J, H, W = heatmap.shape assert N == 1, "only support batch size 1" heatmap = heatmap[0].cpu().detach().numpy() tagmap = tagmap[0].cpu().detach().numpy() heats = heat_k[0].cpu().detach().numpy() inds_np = inds_k[0].cpu().detach().numpy() y = inds_np // W x = inds_np % W tags = tagmap[np.arange(J)[None, :].repeat(self.max_num_people), y.flatten(), x.flatten()].reshape(J, -1, tagmap.shape[-1]) coords = np.stack((y, x), axis=2) # threshold mask = heats > self.heat_thresh # cluster cluster = defaultdict(lambda: { 'coords': np.zeros((J, 2), dtype=np.float32), 'scores': np.zeros(J, dtype=np.float32), 'tags': [] }) for jid, m in enumerate(mask): num_valid = m.sum() if num_valid == 0: continue valid_inds = np.where(m)[0] valid_tags = tags[jid, m, :] if len(cluster) == 0: # initialize for i in valid_inds: tag = tags[jid, i] key = tag[0] cluster[key]['tags'].append(tag) cluster[key]['scores'][jid] = heats[jid, i] cluster[key]['coords'][jid] = coords[jid, i] continue candidates = list(cluster.keys())[:self.max_num_people] 
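# Associative-embedding grouping: joints of the current keypoint id are
# matched to existing cluster centroids (mean tag embeddings) with the
# Hungarian algorithm below; joints left unmatched, or farther than
# tag_thresh in tag space, start new clusters keyed by their own tag value.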
centroids = [ np.mean( cluster[k]['tags'], axis=0) for k in candidates ] num_clusters = len(centroids) # shape is (num_valid, num_clusters, tag_dim) dist = valid_tags[:, None, :] - np.array(centroids)[None, ...] l2_dist = np.linalg.norm(dist, ord=2, axis=2) # modulate dist with heat value, see `use_detection_val` cost = np.round(l2_dist) * 100 - heats[jid, m, None] # pad the cost matrix, otherwise new pose are ignored if num_valid > num_clusters: cost = np.pad(cost, ((0, 0), (0, num_valid - num_clusters)), 'constant', constant_values=((0, 0), (0, 1e-10))) rows, cols = linear_sum_assignment(cost) for y, x in zip(rows, cols): tag = tags[jid, y] if y < num_valid and x < num_clusters and \ l2_dist[y, x] < self.tag_thresh: key = candidates[x] # merge to cluster else: key = tag[0] # initialize new cluster cluster[key]['tags'].append(tag) cluster[key]['scores'][jid] = heats[jid, y] cluster[key]['coords'][jid] = coords[jid, y] # shape is [k, J, 2] and [k, J] pose_tags = np.array([cluster[k]['tags'] for k in cluster]) pose_coords = np.array([cluster[k]['coords'] for k in cluster]) pose_scores = np.array([cluster[k]['scores'] for k in cluster]) valid = pose_scores > 0 pose_kpts = np.zeros((pose_scores.shape[0], J, 3), dtype=np.float32) if valid.sum() == 0: return pose_kpts, pose_kpts # refine coords valid_coords = pose_coords[valid].astype(np.int32) y = valid_coords[..., 0].flatten() x = valid_coords[..., 1].flatten() _, j = np.nonzero(valid) offsets = self.lerp(j, y, x, heatmap) pose_coords[valid, 0] += offsets[0] pose_coords[valid, 1] += offsets[1] # mean score before salvage mean_score = pose_scores.mean(axis=1) pose_kpts[valid, 2] = pose_scores[valid] # salvage missing joints if True: for pid, coords in enumerate(pose_coords): tag_mean = np.array(pose_tags[pid]).mean(axis=0) norm = np.sum((tagmap - tag_mean)**2, axis=3)**0.5 score = heatmap - np.round(norm) # (J, H, W) flat_score = score.reshape(J, -1) max_inds = np.argmax(flat_score, axis=1) max_scores = np.max(flat_score, axis=1) salvage_joints = (pose_scores[pid] == 0) & (max_scores > 0) if salvage_joints.sum() == 0: continue y = max_inds[salvage_joints] // W x = max_inds[salvage_joints] % W offsets = self.lerp(salvage_joints.nonzero()[0], y, x, heatmap) y = y.astype(np.float32) + offsets[0] x = x.astype(np.float32) + offsets[1] pose_coords[pid][salvage_joints, 0] = y pose_coords[pid][salvage_joints, 1] = x pose_kpts[pid][salvage_joints, 2] = max_scores[salvage_joints] pose_kpts[..., :2] = transpred(pose_coords[..., :2][..., ::-1], original_height, original_width, min(H, W)) return pose_kpts, mean_score ================================================ FILE: ppdet/modeling/architectures/keypoint_hrnet.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
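# --- Illustrative sketch (hypothetical helper, not from the original
# sources): a minimal numpy restatement of the grouping cost built in
# HrHRNetPostProcess above. The rounded tag distance dominates the cost and
# the heat value only breaks ties (the `use_detection_val` trick).
import numpy as np
from scipy.optimize import linear_sum_assignment

def match_joints_to_clusters(valid_tags, centroids, heats):
    # valid_tags: (num_valid, tag_dim), centroids: (num_clusters, tag_dim),
    # heats: (num_valid,) heat values of the candidate joints.
    l2 = np.linalg.norm(valid_tags[:, None, :] - centroids[None, :, :], axis=2)
    cost = np.round(l2) * 100 - heats[:, None]
    rows, cols = linear_sum_assignment(cost)
    return rows, cols, l2
# ---------------------------------------------------------------------------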
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import numpy as np import math import cv2 from ppdet.core.workspace import register, create from .meta_arch import BaseArch from ..keypoint_utils import transform_preds from .. import layers as L from paddle.nn import functional as F __all__ = ['TopDownHRNet', 'TinyPose3DHRNet', 'TinyPose3DHRHeatmapNet'] @register class TopDownHRNet(BaseArch): __category__ = 'architecture' __inject__ = ['loss'] def __init__(self, width, num_joints, backbone='HRNet', loss='KeyPointMSELoss', post_process='HRNetPostProcess', flip_perm=None, flip=True, shift_heatmap=True, use_dark=True): """ HRNet network, see https://arxiv.org/abs/1902.09212 Args: backbone (nn.Layer): backbone instance post_process (object): `HRNetPostProcess` instance flip_perm (list): The left-right joints exchange order list use_dark(bool): Whether to use DARK in post processing """ super(TopDownHRNet, self).__init__() self.backbone = backbone self.post_process = HRNetPostProcess(use_dark) self.loss = loss self.flip_perm = flip_perm self.flip = flip self.final_conv = L.Conv2d(width, num_joints, 1, 1, 0, bias=True) self.shift_heatmap = shift_heatmap self.deploy = False @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) return {'backbone': backbone, } def _forward(self): feats = self.backbone(self.inputs) hrnet_outputs = self.final_conv(feats[0]) if self.training: return self.loss(hrnet_outputs, self.inputs) elif self.deploy: outshape = hrnet_outputs.shape max_idx = paddle.argmax( hrnet_outputs.reshape( (outshape[0], outshape[1], outshape[2] * outshape[3])), axis=-1) return hrnet_outputs, max_idx else: if self.flip: self.inputs['image'] = self.inputs['image'].flip([3]) feats = self.backbone(self.inputs) output_flipped = self.final_conv(feats[0]) output_flipped = self.flip_back(output_flipped.numpy(), self.flip_perm) output_flipped = paddle.to_tensor(output_flipped.copy()) if self.shift_heatmap: output_flipped[:, :, :, 1:] = output_flipped.clone( )[:, :, :, 0:-1] hrnet_outputs = (hrnet_outputs + output_flipped) * 0.5 imshape = (self.inputs['im_shape'].numpy() )[:, ::-1] if 'im_shape' in self.inputs else None center = self.inputs['center'].numpy( ) if 'center' in self.inputs else np.round(imshape / 2.) scale = self.inputs['scale'].numpy( ) if 'scale' in self.inputs else imshape / 200. 
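# center/scale follow the COCO top-down convention (box scale normalized by
# a factor of 200); when the batch carries no crop metadata, the fallbacks
# above treat the whole image as a single person box.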
outputs = self.post_process(hrnet_outputs, center, scale) return outputs def get_loss(self): return self._forward() def get_pred(self): res_lst = self._forward() outputs = {'keypoint': res_lst} return outputs def flip_back(self, output_flipped, matched_parts): assert output_flipped.ndim == 4,\ 'output_flipped should be [batch_size, num_joints, height, width]' output_flipped = output_flipped[:, :, :, ::-1] for pair in matched_parts: tmp = output_flipped[:, pair[0], :, :].copy() output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :] output_flipped[:, pair[1], :, :] = tmp return output_flipped class HRNetPostProcess(object): def __init__(self, use_dark=True): self.use_dark = use_dark def get_max_preds(self, heatmaps): '''get predictions from score maps Args: heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) Returns: preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords maxvals: numpy.ndarray([batch_size, num_joints, 2]), the maximum confidence of the keypoints ''' assert isinstance(heatmaps, np.ndarray), 'heatmaps should be numpy.ndarray' assert heatmaps.ndim == 4, 'batch_images should be 4-ndim' batch_size = heatmaps.shape[0] num_joints = heatmaps.shape[1] width = heatmaps.shape[3] heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1)) idx = np.argmax(heatmaps_reshaped, 2) maxvals = np.amax(heatmaps_reshaped, 2) maxvals = maxvals.reshape((batch_size, num_joints, 1)) idx = idx.reshape((batch_size, num_joints, 1)) preds = np.tile(idx, (1, 1, 2)).astype(np.float32) preds[:, :, 0] = (preds[:, :, 0]) % width preds[:, :, 1] = np.floor((preds[:, :, 1]) / width) pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2)) pred_mask = pred_mask.astype(np.float32) preds *= pred_mask return preds, maxvals def gaussian_blur(self, heatmap, kernel): border = (kernel - 1) // 2 batch_size = heatmap.shape[0] num_joints = heatmap.shape[1] height = heatmap.shape[2] width = heatmap.shape[3] for i in range(batch_size): for j in range(num_joints): origin_max = np.max(heatmap[i, j]) dr = np.zeros((height + 2 * border, width + 2 * border)) dr[border:-border, border:-border] = heatmap[i, j].copy() dr = cv2.GaussianBlur(dr, (kernel, kernel), 0) heatmap[i, j] = dr[border:-border, border:-border].copy() heatmap[i, j] *= origin_max / np.max(heatmap[i, j]) return heatmap def dark_parse(self, hm, coord): heatmap_height = hm.shape[0] heatmap_width = hm.shape[1] px = int(coord[0]) py = int(coord[1]) if 1 < px < heatmap_width - 2 and 1 < py < heatmap_height - 2: dx = 0.5 * (hm[py][px + 1] - hm[py][px - 1]) dy = 0.5 * (hm[py + 1][px] - hm[py - 1][px]) dxx = 0.25 * (hm[py][px + 2] - 2 * hm[py][px] + hm[py][px - 2]) dxy = 0.25 * (hm[py+1][px+1] - hm[py-1][px+1] - hm[py+1][px-1] \ + hm[py-1][px-1]) dyy = 0.25 * ( hm[py + 2 * 1][px] - 2 * hm[py][px] + hm[py - 2 * 1][px]) derivative = np.matrix([[dx], [dy]]) hessian = np.matrix([[dxx, dxy], [dxy, dyy]]) if dxx * dyy - dxy**2 != 0: hessianinv = hessian.I offset = -hessianinv * derivative offset = np.squeeze(np.array(offset.T), axis=0) coord += offset return coord def dark_postprocess(self, hm, coords, kernelsize): '''DARK postpocessing, Zhang et al. Distribution-Aware Coordinate Representation for Human Pose Estimation (CVPR 2020). 
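    The refinement fits a second-order Taylor expansion of the log-heatmap
    around the integer argmax: offset = -H^{-1} * g, with gradient g and
    Hessian H estimated by finite differences in dark_parse above.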
''' hm = self.gaussian_blur(hm, kernelsize) hm = np.maximum(hm, 1e-10) hm = np.log(hm) for n in range(coords.shape[0]): for p in range(coords.shape[1]): coords[n, p] = self.dark_parse(hm[n][p], coords[n][p]) return coords def get_final_preds(self, heatmaps, center, scale, kernelsize=3): """the highest heatvalue location with a quarter offset in the direction from the highest response to the second highest response. Args: heatmaps (numpy.ndarray): The predicted heatmaps center (numpy.ndarray): The boxes center scale (numpy.ndarray): The scale factor Returns: preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints """ coords, maxvals = self.get_max_preds(heatmaps) heatmap_height = heatmaps.shape[2] heatmap_width = heatmaps.shape[3] if self.use_dark: coords = self.dark_postprocess(heatmaps, coords, kernelsize) else: for n in range(coords.shape[0]): for p in range(coords.shape[1]): hm = heatmaps[n][p] px = int(math.floor(coords[n][p][0] + 0.5)) py = int(math.floor(coords[n][p][1] + 0.5)) if 1 < px < heatmap_width - 1 and 1 < py < heatmap_height - 1: diff = np.array([ hm[py][px + 1] - hm[py][px - 1], hm[py + 1][px] - hm[py - 1][px] ]) coords[n][p] += np.sign(diff) * .25 preds = coords.copy() # Transform back for i in range(coords.shape[0]): preds[i] = transform_preds(coords[i], center[i], scale[i], [heatmap_width, heatmap_height]) return preds, maxvals def __call__(self, output, center, scale): preds, maxvals = self.get_final_preds(output.numpy(), center, scale) outputs = [[ np.concatenate( (preds, maxvals), axis=-1), np.mean( maxvals, axis=1) ]] return outputs class TinyPose3DPostProcess(object): def __init__(self): pass def __call__(self, output, center, scale): """ Args: output (numpy.ndarray): numpy.ndarray([batch_size, num_joints, 3]), keypoints coords scale (numpy.ndarray): The scale factor Returns: preds: numpy.ndarray([batch_size, num_joints, 3]), keypoints coords """ preds = output.numpy().copy() # Transform back for i in range(output.shape[0]): # batch_size preds[i][:, 0] = preds[i][:, 0] * scale[i][0] preds[i][:, 1] = preds[i][:, 1] * scale[i][1] return preds def soft_argmax(heatmaps, joint_num): dims = heatmaps.shape depth_dim = (int)(dims[1] / joint_num) heatmaps = heatmaps.reshape((-1, joint_num, depth_dim * dims[2] * dims[3])) heatmaps = F.softmax(heatmaps, 2) heatmaps = heatmaps.reshape((-1, joint_num, depth_dim, dims[2], dims[3])) accu_x = heatmaps.sum(axis=(2, 3)) accu_y = heatmaps.sum(axis=(2, 4)) accu_z = heatmaps.sum(axis=(3, 4)) accu_x = accu_x * paddle.arange(1, 33) accu_y = accu_y * paddle.arange(1, 33) accu_z = accu_z * paddle.arange(1, 33) accu_x = accu_x.sum(axis=2, keepdim=True) - 1 accu_y = accu_y.sum(axis=2, keepdim=True) - 1 accu_z = accu_z.sum(axis=2, keepdim=True) - 1 coord_out = paddle.concat( (accu_x, accu_y, accu_z), axis=2) # [batch_size, joint_num, 3] return coord_out @register class TinyPose3DHRHeatmapNet(BaseArch): __category__ = 'architecture' __inject__ = ['loss'] def __init__( self, width, # 40, backbone输出的channel数目 num_joints, backbone='HRNet', loss='KeyPointRegressionMSELoss', post_process=TinyPose3DPostProcess): """ Args: backbone (nn.Layer): backbone instance post_process (object): post process instance """ super(TinyPose3DHRHeatmapNet, self).__init__() self.backbone = backbone self.post_process = TinyPose3DPostProcess() self.loss = loss self.deploy = False self.num_joints = num_joints self.final_conv = L.Conv2d(width, num_joints * 32, 1, 1, 0, 
bias=True) @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) return {'backbone': backbone, } def _forward(self): feats = self.backbone(self.inputs) # feats:[[batch_size, 40, 32, 24]] hrnet_outputs = self.final_conv(feats[0]) res = soft_argmax(hrnet_outputs, self.num_joints) return res def get_loss(self): pose3d = self._forward() loss = self.loss(pose3d, None, self.inputs) outputs = {'loss': loss} return outputs def get_pred(self): res_lst = self._forward() outputs = {'pose3d': res_lst} return outputs def flip_back(self, output_flipped, matched_parts): assert output_flipped.ndim == 4,\ 'output_flipped should be [batch_size, num_joints, height, width]' output_flipped = output_flipped[:, :, :, ::-1] for pair in matched_parts: tmp = output_flipped[:, pair[0], :, :].copy() output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :] output_flipped[:, pair[1], :, :] = tmp return output_flipped @register class TinyPose3DHRNet(BaseArch): __category__ = 'architecture' __inject__ = ['loss'] def __init__(self, width, num_joints, fc_channel=768, backbone='HRNet', loss='KeyPointRegressionMSELoss', post_process=TinyPose3DPostProcess): """ Args: backbone (nn.Layer): backbone instance post_process (object): post process instance """ super(TinyPose3DHRNet, self).__init__() self.backbone = backbone self.post_process = TinyPose3DPostProcess() self.loss = loss self.deploy = False self.num_joints = num_joints self.final_conv = L.Conv2d(width, num_joints, 1, 1, 0, bias=True) self.flatten = paddle.nn.Flatten(start_axis=2, stop_axis=3) self.fc1 = paddle.nn.Linear(fc_channel, 256) self.act1 = paddle.nn.ReLU() self.fc2 = paddle.nn.Linear(256, 64) self.act2 = paddle.nn.ReLU() self.fc3 = paddle.nn.Linear(64, 3) @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) return {'backbone': backbone, } def _forward(self): ''' self.inputs is a dict ''' feats = self.backbone( self.inputs) # feats:[[batch_size, 40, width/4, height/4]] hrnet_outputs = self.final_conv( feats[0]) # hrnet_outputs: [batch_size, num_joints*32,32,32] flatten_res = self.flatten( hrnet_outputs) # [batch_size,num_joints*32,32*32] res = self.fc1(flatten_res) res = self.act1(res) res = self.fc2(res) res = self.act2(res) res = self.fc3(res) if self.training: return self.loss(res, self.inputs) else: # export model need return res def get_loss(self): return self._forward() def get_pred(self): res_lst = self._forward() outputs = {'pose3d': res_lst} return outputs def flip_back(self, output_flipped, matched_parts): assert output_flipped.ndim == 4,\ 'output_flipped should be [batch_size, num_joints, height, width]' output_flipped = output_flipped[:, :, :, ::-1] for pair in matched_parts: tmp = output_flipped[:, pair[0], :, :].copy() output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :] output_flipped[:, pair[1], :, :] = tmp return output_flipped ================================================ FILE: ppdet/modeling/architectures/keypoint_petr.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ this code is base on https://github.com/hikvision-research/opera/blob/main/opera/models/detectors/petr.py """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register from .meta_arch import BaseArch from .. import layers as L __all__ = ['PETR'] @register class PETR(BaseArch): __category__ = 'architecture' __inject__ = ['backbone', 'neck', 'bbox_head'] def __init__(self, backbone='ResNet', neck='ChannelMapper', bbox_head='PETRHead'): """ PETR, see https://openaccess.thecvf.com/content/CVPR2022/papers/Shi_End-to-End_Multi-Person_Pose_Estimation_With_Transformers_CVPR_2022_paper.pdf Args: backbone (nn.Layer): backbone instance neck (nn.Layer): neck between backbone and head bbox_head (nn.Layer): model output and loss """ super(PETR, self).__init__() self.backbone = backbone if neck is not None: self.with_neck = True self.neck = neck self.bbox_head = bbox_head self.deploy = False def extract_feat(self, img): """Directly extract features from the backbone+neck.""" x = self.backbone(img) if self.with_neck: x = self.neck(x) return x def get_inputs(self): img_metas = [] gt_bboxes = [] gt_labels = [] gt_keypoints = [] gt_areas = [] pad_gt_mask = self.inputs['pad_gt_mask'].astype("bool").squeeze(-1) for idx, im_shape in enumerate(self.inputs['im_shape']): img_meta = { 'img_shape': im_shape.astype("int32").tolist() + [1, ], 'batch_input_shape': self.inputs['image'].shape[-2:], 'image_name': self.inputs['image_file'][idx] } img_metas.append(img_meta) if (not pad_gt_mask[idx].any()): gt_keypoints.append(self.inputs['gt_joints'][idx][:1]) gt_labels.append(self.inputs['gt_class'][idx][:1]) gt_bboxes.append(self.inputs['gt_bbox'][idx][:1]) gt_areas.append(self.inputs['gt_areas'][idx][:1]) continue gt_keypoints.append(self.inputs['gt_joints'][idx][pad_gt_mask[idx]]) gt_labels.append(self.inputs['gt_class'][idx][pad_gt_mask[idx]]) gt_bboxes.append(self.inputs['gt_bbox'][idx][pad_gt_mask[idx]]) gt_areas.append(self.inputs['gt_areas'][idx][pad_gt_mask[idx]]) return img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas def get_loss(self): """ Args: img (Tensor): Input images of shape (N, C, H, W). Typically these should be mean centered and std scaled. img_metas (list[dict]): A List of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see :class:`mmdet.datasets.pipelines.Collect`. gt_bboxes (list[Tensor]): Each item are the truth boxes for each image in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): Class indices corresponding to each box. gt_keypoints (list[Tensor]): Each item are the truth keypoints for each image in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, p^{K}_y, p^{K}_v] format. gt_areas (list[Tensor]): mask areas corresponding to each box. gt_bboxes_ignore (None | list[Tensor]): Specify which bounding boxes can be ignored when computing the loss. Returns: dict[str, Tensor]: A dictionary of loss components. 
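    For example, with the K=17 COCO keypoints each gt_keypoints row in this
    format has length 3 * 17 = 51 (an (x, y, visibility) triplet per joint).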
""" img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas = self.get_inputs( ) gt_bboxes_ignore = getattr(self.inputs, 'gt_bboxes_ignore', None) x = self.extract_feat(self.inputs) losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas, gt_bboxes_ignore) loss = 0 for k, v in losses.items(): loss += v losses['loss'] = loss return losses def get_pred_numpy(self): """Used for computing network flops. """ img = self.inputs['image'] batch_size, _, height, width = img.shape dummy_img_metas = [ dict( batch_input_shape=(height, width), img_shape=(height, width, 3), scale_factor=(1., 1., 1., 1.)) for _ in range(batch_size) ] x = self.extract_feat(img) outs = self.bbox_head(x, img_metas=dummy_img_metas) bbox_list = self.bbox_head.get_bboxes( *outs, dummy_img_metas, rescale=True) return bbox_list def get_pred(self): """ """ img = self.inputs['image'] batch_size, _, height, width = img.shape img_metas = [ dict( batch_input_shape=(height, width), img_shape=(height, width, 3), scale_factor=self.inputs['scale_factor'][i]) for i in range(batch_size) ] kptpred = self.simple_test( self.inputs, img_metas=img_metas, rescale=True) keypoints = kptpred[0][1][0] bboxs = kptpred[0][0][0] keypoints[..., 2] = bboxs[:, None, 4] res_lst = [[keypoints, bboxs[:, 4]]] outputs = {'keypoint': res_lst} return outputs def simple_test(self, inputs, img_metas, rescale=False): """Test function without test time augmentation. Args: inputs (list[paddle.Tensor]): List of multiple images. img_metas (list[dict]): List of image information. rescale (bool, optional): Whether to rescale the results. Defaults to False. Returns: list[list[np.ndarray]]: BBox and keypoint results of each image and classes. The outer list corresponds to each image. The inner list corresponds to each class. """ batch_size = len(img_metas) assert batch_size == 1, 'Currently only batch_size 1 for inference ' \ f'mode is supported. Found batch_size {batch_size}.' feat = self.extract_feat(inputs) results_list = self.bbox_head.simple_test( feat, img_metas, rescale=rescale) bbox_kpt_results = [ self.bbox_kpt2result(det_bboxes, det_labels, det_kpts, self.bbox_head.num_classes) for det_bboxes, det_labels, det_kpts in results_list ] return bbox_kpt_results def bbox_kpt2result(self, bboxes, labels, kpts, num_classes): """Convert detection results to a list of numpy arrays. Args: bboxes (paddle.Tensor | np.ndarray): shape (n, 5). labels (paddle.Tensor | np.ndarray): shape (n, ). kpts (paddle.Tensor | np.ndarray): shape (n, K, 3). num_classes (int): class number, including background class. Returns: list(ndarray): bbox and keypoint results of each class. """ if bboxes.shape[0] == 0: return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)], \ [np.zeros((0, kpts.size(1), 3), dtype=np.float32) for i in range(num_classes)] else: if isinstance(bboxes, paddle.Tensor): bboxes = bboxes.numpy() labels = labels.numpy() kpts = kpts.numpy() return [bboxes[labels == i, :] for i in range(num_classes)], \ [kpts[labels == i, :, :] for i in range(num_classes)] ================================================ FILE: ppdet/modeling/architectures/keypoint_vitpose.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import numpy as np import math import cv2 from ppdet.core.workspace import register, create, serializable from .meta_arch import BaseArch from ..keypoint_utils import transform_preds from .. import layers as L __all__ = ['VitPose_TopDown', 'VitPosePostProcess'] @register class VitPose_TopDown(BaseArch): __category__ = 'architecture' __inject__ = ['loss'] def __init__(self, backbone, head, loss, post_process, flip_test): """ VitPose network, see https://arxiv.org/pdf/2204.12484v2.pdf Args: backbone (nn.Layer): backbone instance post_process (object): `HRNetPostProcess` instance """ super(VitPose_TopDown, self).__init__() self.backbone = backbone self.head = head self.loss = loss self.post_process = post_process self.flip_test = flip_test @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) #head head = create(cfg['head']) #post_process post_process = create(cfg['post_process']) return { 'backbone': backbone, 'head': head, 'post_process': post_process } def _forward_train(self): feats = self.backbone.forward_features(self.inputs['image']) vitpost_output = self.head(feats) return self.loss(vitpost_output, self.inputs) def _forward_test(self): feats = self.backbone.forward_features(self.inputs['image']) output_heatmap = self.head(feats) if self.flip_test: img_flipped = self.inputs['image'].flip(3) features_flipped = self.backbone.forward_features(img_flipped) output_flipped_heatmap = self.head.inference_model(features_flipped, self.flip_test) output_heatmap = (output_heatmap + output_flipped_heatmap) * 0.5 imshape = (self.inputs['im_shape'].numpy() )[:, ::-1] if 'im_shape' in self.inputs else None center = self.inputs['center'].numpy( ) if 'center' in self.inputs else np.round(imshape / 2.) scale = self.inputs['scale'].numpy( ) if 'scale' in self.inputs else imshape / 200. 
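# As in TopDownHRNet, decoding happens on numpy heatmaps: argmax plus
# sub-pixel refinement, then a map back to original image coordinates
# through the (center, scale) crop metadata prepared above.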
result = self.post_process(output_heatmap.cpu().numpy(), center, scale) return result def get_loss(self): return self._forward_train() def get_pred(self): res_lst = self._forward_test() outputs = {'keypoint': res_lst} return outputs @register @serializable class VitPosePostProcess(object): def __init__(self, use_dark=False): self.use_dark = use_dark def get_max_preds(self, heatmaps): '''get predictions from score maps Args: heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) Returns: preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords maxvals: numpy.ndarray([batch_size, num_joints, 2]), the maximum confidence of the keypoints ''' assert isinstance(heatmaps, np.ndarray), 'heatmaps should be numpy.ndarray' assert heatmaps.ndim == 4, 'batch_images should be 4-ndim' batch_size = heatmaps.shape[0] num_joints = heatmaps.shape[1] width = heatmaps.shape[3] heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1)) idx = np.argmax(heatmaps_reshaped, 2) maxvals = np.amax(heatmaps_reshaped, 2) maxvals = maxvals.reshape((batch_size, num_joints, 1)) idx = idx.reshape((batch_size, num_joints, 1)) preds = np.tile(idx, (1, 1, 2)).astype(np.float32) preds[:, :, 0] = (preds[:, :, 0]) % width preds[:, :, 1] = np.floor((preds[:, :, 1]) // width) pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2)) pred_mask = pred_mask.astype(np.float32) preds *= pred_mask return preds, maxvals def post_datk_udp(self, coords, batch_heatmaps, kernel=3): """DARK post-pocessing. Implemented by udp. Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020). Zhang et al. Distribution-Aware Coordinate Representation for Human Pose Estimation (CVPR 2020). Note: - batch size: B - num keypoints: K - num persons: N - height of heatmaps: H - width of heatmaps: W B=1 for bottom_up paradigm where all persons share the same heatmap. B=N for top_down paradigm where each person has its own heatmaps. Args: coords (np.ndarray[N, K, 2]): Initial coordinates of human pose. batch_heatmaps (np.ndarray[B, K, H, W]): batch_heatmaps kernel (int): Gaussian kernel size (K) for modulation. Returns: np.ndarray([N, K, 2]): Refined coordinates. 
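    The heatmaps are Gaussian-smoothed, clipped and log-transformed below so
    that the local quadratic (Taylor) model used by the DARK refinement is
    well-conditioned.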
""" if not isinstance(batch_heatmaps, np.ndarray): batch_heatmaps = batch_heatmaps.cpu().numpy() B, K, H, W = batch_heatmaps.shape N = coords.shape[0] assert (B == 1 or B == N) for heatmaps in batch_heatmaps: for heatmap in heatmaps: cv2.GaussianBlur(heatmap, (kernel, kernel), 0, heatmap) np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps) np.log(batch_heatmaps, batch_heatmaps) batch_heatmaps_pad = np.pad(batch_heatmaps, ((0, 0), (0, 0), (1, 1), (1, 1)), mode='edge').flatten() index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (W + 2) index += (W + 2) * (H + 2) * np.arange(0, B * K).reshape(-1, K) index = index.astype(int).reshape(-1, 1) i_ = batch_heatmaps_pad[index] ix1 = batch_heatmaps_pad[index + 1] iy1 = batch_heatmaps_pad[index + W + 2] ix1y1 = batch_heatmaps_pad[index + W + 3] ix1_y1_ = batch_heatmaps_pad[index - W - 3] ix1_ = batch_heatmaps_pad[index - 1] iy1_ = batch_heatmaps_pad[index - 2 - W] dx = 0.5 * (ix1 - ix1_) dy = 0.5 * (iy1 - iy1_) derivative = np.concatenate([dx, dy], axis=1) derivative = derivative.reshape(N, K, 2, 1) dxx = ix1 - 2 * i_ + ix1_ dyy = iy1 - 2 * i_ + iy1_ dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_) hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1) hessian = hessian.reshape(N, K, 2, 2) hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2)) coords -= np.einsum('ijmn,ijnk->ijmk', hessian, derivative).squeeze() return coords def transform_preds_udp(self, coords, center, scale, output_size, use_udp=True): """Get final keypoint predictions from heatmaps and apply scaling and translation to map them back to the image. Note: num_keypoints: K Args: coords (np.ndarray[K, ndims]): * If ndims=2, corrds are predicted keypoint location. * If ndims=4, corrds are composed of (x, y, scores, tags) * If ndims=5, corrds are composed of (x, y, scores, tags, flipped_tags) center (np.ndarray[2, ]): Center of the bounding box (x, y). scale (np.ndarray[2, ]): Scale of the bounding box wrt [width, height]. output_size (np.ndarray[2, ] | list(2,)): Size of the destination heatmaps. use_udp (bool): Use unbiased data processing Returns: np.ndarray: Predicted coordinates in the images. """ assert coords.shape[1] in (2, 4, 5) assert len(center) == 2 assert len(scale) == 2 assert len(output_size) == 2 # Recover the scale which is normalized by a factor of 200. scale = scale * 200.0 if use_udp: scale_x = scale[0] / (output_size[0] - 1.0) scale_y = scale[1] / (output_size[1] - 1.0) else: scale_x = scale[0] / output_size[0] scale_y = scale[1] / output_size[1] target_coords = np.ones_like(coords) target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[ 0] * 0.5 target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[ 1] * 0.5 return target_coords def get_final_preds(self, heatmaps, center, scale, kernelsize=11): """the highest heatvalue location with a quarter offset in the direction from the highest response to the second highest response. 
Args: heatmaps (numpy.ndarray): The predicted heatmaps center (numpy.ndarray): The boxes center scale (numpy.ndarray): The scale factor Returns: preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints """ coords, maxvals = self.get_max_preds(heatmaps) N, K, H, W = heatmaps.shape if self.use_dark: coords = self.post_datk_udp(coords, heatmaps, kernelsize) preds = coords.copy() # Transform back to the image for i in range(N): preds[i] = self.transform_preds_udp(preds[i], center[i], scale[i], [W, H]) else: for n in range(coords.shape[0]): for p in range(coords.shape[1]): hm = heatmaps[n][p] px = int(math.floor(coords[n][p][0] + 0.5)) py = int(math.floor(coords[n][p][1] + 0.5)) if 1 < px < W - 1 and 1 < py < H - 1: diff = np.array([ hm[py][px + 1] - hm[py][px - 1], hm[py + 1][px] - hm[py - 1][px] ]) coords[n][p] += np.sign(diff) * .25 preds = coords.copy() # Transform back for i in range(coords.shape[0]): preds[i] = transform_preds(coords[i], center[i], scale[i], [W, H]) return preds, maxvals def __call__(self, output, center, scale): preds, maxvals = self.get_final_preds(output, center, scale) outputs = [[ np.concatenate( (preds, maxvals), axis=-1), np.mean( maxvals, axis=1) ]] return outputs ================================================ FILE: ppdet/modeling/architectures/mask_rcnn.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
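# --- Illustrative sketch (hypothetical, not from the original sources): a
# minimal numpy restatement of the UDP coordinate mapping used by
# VitPosePostProcess.transform_preds_udp above. Heatmap coordinates are
# scaled by scale*200/(output_size - 1) and re-centered on the person box.
import numpy as np

def udp_to_image(coords, center, scale, output_size):
    # coords: (K, 2) heatmap coords; scale is stored normalized by 200.
    scale = np.asarray(scale, dtype=np.float32) * 200.0
    sx = scale[0] / (output_size[0] - 1.0)
    sy = scale[1] / (output_size[1] - 1.0)
    out = np.asarray(coords, dtype=np.float32).copy()
    out[:, 0] = out[:, 0] * sx + center[0] - scale[0] * 0.5
    out[:, 1] = out[:, 1] * sy + center[1] - scale[1] * 0.5
    return out
# ---------------------------------------------------------------------------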
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['MaskRCNN'] @register class MaskRCNN(BaseArch): """ Mask R-CNN network, see https://arxiv.org/abs/1703.06870 Args: backbone (object): backbone instance rpn_head (object): `RPNHead` instance bbox_head (object): `BBoxHead` instance mask_head (object): `MaskHead` instance bbox_post_process (object): `BBoxPostProcess` instance mask_post_process (object): `MaskPostProcess` instance neck (object): 'FPN' instance """ __category__ = 'architecture' __inject__ = [ 'bbox_post_process', 'mask_post_process', ] def __init__(self, backbone, rpn_head, bbox_head, mask_head, bbox_post_process, mask_post_process, neck=None): super(MaskRCNN, self).__init__() self.backbone = backbone self.neck = neck self.rpn_head = rpn_head self.bbox_head = bbox_head self.mask_head = mask_head self.bbox_post_process = bbox_post_process self.mask_post_process = mask_post_process @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = cfg['neck'] and create(cfg['neck'], **kwargs) out_shape = neck and neck.out_shape or backbone.out_shape kwargs = {'input_shape': out_shape} rpn_head = create(cfg['rpn_head'], **kwargs) bbox_head = create(cfg['bbox_head'], **kwargs) out_shape = neck and out_shape or bbox_head.get_head().out_shape kwargs = {'input_shape': out_shape} mask_head = create(cfg['mask_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "rpn_head": rpn_head, "bbox_head": bbox_head, "mask_head": mask_head, } def _forward(self): body_feats = self.backbone(self.inputs) if self.neck is not None: body_feats = self.neck(body_feats) if self.training: rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs) bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num, self.inputs) rois, rois_num = self.bbox_head.get_assigned_rois() bbox_targets = self.bbox_head.get_assigned_targets() # Mask Head needs bbox_feat in Mask RCNN mask_loss = self.mask_head(body_feats, rois, rois_num, self.inputs, bbox_targets, bbox_feat) return rpn_loss, bbox_loss, mask_loss else: rois, rois_num, _ = self.rpn_head(body_feats, self.inputs) preds, feat_func = self.bbox_head(body_feats, rois, rois_num, None) im_shape = self.inputs['im_shape'] scale_factor = self.inputs['scale_factor'] bbox, bbox_num, nms_keep_idx = self.bbox_post_process( preds, (rois, rois_num), im_shape, scale_factor) mask_out = self.mask_head( body_feats, bbox, bbox_num, self.inputs, feat_func=feat_func) # rescale the prediction back to origin image bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred( bbox, bbox_num, im_shape, scale_factor) origin_shape = self.bbox_post_process.get_origin_shape() mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num, origin_shape) if self.use_extra_data: extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx """extra_data:{ 'scores': predict scores, 'nms_keep_idx': bbox index before nms, } """ extra_data['scores'] = preds[1] # predict scores (probability) # Todo: get logits output extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms return bbox_pred, bbox_num, mask_pred, extra_data else: return bbox_pred, bbox_num, mask_pred def get_loss(self, ): bbox_loss, mask_loss, rpn_loss = self._forward() loss = {} loss.update(rpn_loss) loss.update(bbox_loss) 
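# NOTE: _forward() actually returns (rpn_loss, bbox_loss, mask_loss), so the
# unpacking names in get_loss above are transposed; since all three are dicts
# merged into `loss` before paddle.add_n, the summed total is unaffected.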
loss.update(mask_loss) total_loss = paddle.add_n(list(loss.values())) loss.update({'loss': total_loss}) return loss def get_pred(self): if self.use_extra_data: bbox_pred, bbox_num, mask_pred, extra_data = self._forward() output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred, 'extra_data': extra_data} else: bbox_pred, bbox_num, mask_pred = self._forward() output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred} return output ================================================ FILE: ppdet/modeling/architectures/meta_arch.py ================================================ from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import paddle import paddle.nn as nn import typing from ppdet.core.workspace import register from ppdet.modeling.post_process import nms __all__ = ['BaseArch'] @register class BaseArch(nn.Layer): def __init__(self, data_format='NCHW', use_extra_data=False): super(BaseArch, self).__init__() self.data_format = data_format self.inputs = {} self.fuse_norm = False self.use_extra_data = use_extra_data def load_meanstd(self, cfg_transform): scale = 1. mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) std = np.array([0.229, 0.224, 0.225], dtype=np.float32) for item in cfg_transform: if 'NormalizeImage' in item: mean = np.array( item['NormalizeImage']['mean'], dtype=np.float32) std = np.array(item['NormalizeImage']['std'], dtype=np.float32) if item['NormalizeImage'].get('is_scale', True): scale = 1. / 255. break if self.data_format == 'NHWC': self.scale = paddle.to_tensor(scale / std).reshape((1, 1, 1, 3)) self.bias = paddle.to_tensor(-mean / std).reshape((1, 1, 1, 3)) else: self.scale = paddle.to_tensor(scale / std).reshape((1, 3, 1, 1)) self.bias = paddle.to_tensor(-mean / std).reshape((1, 3, 1, 1)) def forward(self, inputs): if self.data_format == 'NHWC': image = inputs['image'] inputs['image'] = paddle.transpose(image, [0, 2, 3, 1]) if self.fuse_norm: image = inputs['image'] self.inputs['image'] = image * self.scale + self.bias self.inputs['im_shape'] = inputs['im_shape'] self.inputs['scale_factor'] = inputs['scale_factor'] else: self.inputs = inputs self.model_arch() if self.training: out = self.get_loss() else: inputs_list = [] # multi-scale input if not isinstance(inputs, typing.Sequence): inputs_list.append(inputs) else: inputs_list.extend(inputs) outs = [] for inp in inputs_list: if self.fuse_norm: self.inputs['image'] = inp['image'] * self.scale + self.bias self.inputs['im_shape'] = inp['im_shape'] self.inputs['scale_factor'] = inp['scale_factor'] else: self.inputs = inp outs.append(self.get_pred()) # multi-scale test if len(outs) > 1: out = self.merge_multi_scale_predictions(outs) else: out = outs[0] return out def merge_multi_scale_predictions(self, outs): # default values for architectures not included in following list num_classes = 80 nms_threshold = 0.5 keep_top_k = 100 if self.__class__.__name__ in ('CascadeRCNN', 'FasterRCNN', 'MaskRCNN'): num_classes = self.bbox_head.num_classes keep_top_k = self.bbox_post_process.nms.keep_top_k nms_threshold = self.bbox_post_process.nms.nms_threshold else: raise Exception( "Multi scale test only supports CascadeRCNN, FasterRCNN and MaskRCNN for now" ) final_boxes = [] all_scale_outs = paddle.concat([o['bbox'] for o in outs]).numpy() for c in range(num_classes): idxs = all_scale_outs[:, 0] == c if np.count_nonzero(idxs) == 0: continue r = nms(all_scale_outs[idxs, 1:], nms_threshold) final_boxes.append( 
np.concatenate([np.full((r.shape[0], 1), c), r], 1)) out = np.concatenate(final_boxes) out = np.concatenate(sorted( out, key=lambda e: e[1])[-keep_top_k:]).reshape((-1, 6)) out = { 'bbox': paddle.to_tensor(out), 'bbox_num': paddle.to_tensor(np.array([out.shape[0], ])) } return out def build_inputs(self, data, input_def): inputs = {} for i, k in enumerate(input_def): inputs[k] = data[i] return inputs def model_arch(self, ): pass def get_loss(self, ): raise NotImplementedError("Should implement get_loss method!") def get_pred(self, ): raise NotImplementedError("Should implement get_pred method!") ================================================ FILE: ppdet/modeling/architectures/multi_stream_detector.py ================================================ from typing import Dict from collections import OrderedDict from ppdet.modeling.architectures.meta_arch import BaseArch class MultiSteamDetector(BaseArch): def __init__(self, model: Dict[str, BaseArch], train_cfg=None, test_cfg=None): super(MultiSteamDetector, self).__init__() self.submodules = list(model.keys()) for k, v in model.items(): setattr(self, k, v) self.train_cfg = train_cfg self.test_cfg = test_cfg self.inference_on = self.test_cfg.get("inference_on", self.submodules[0]) self.first_load = True def forward(self, inputs, return_loss=True, **kwargs): """Calls either :func:`forward_train` or :func:`forward_test` depending on whether ``return_loss`` is ``True``. Note this setting will change the expected inputs. When ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor and List[dict]), and when ``resturn_loss=False``, img and img_meta should be double nested (i.e. List[Tensor], List[List[dict]]), with the outer list indicating test time augmentations. """ if return_loss: return self.forward_train(inputs, **kwargs) else: return self.forward_test(inputs, **kwargs) def get_loss(self, **kwargs): # losses = self(**data) return self.forward_train(self, **kwargs) def model(self, **kwargs) -> BaseArch: if "submodule" in kwargs: assert (kwargs["submodule"] in self.submodules ), "Detector does not contain submodule {}".format(kwargs[ "submodule"]) model: BaseArch = getattr(self, kwargs["submodule"]) else: model: BaseArch = getattr(self, self.inference_on) return model def freeze(self, model_ref: str): assert model_ref in self.submodules model = getattr(self, model_ref) model.eval() for param in model.parameters(): param.stop_gradient = True def update_ema_model(self, momentum=0.9996): # print(momentum) model_dict = self.student.state_dict() new_dict = OrderedDict() for key, value in self.teacher.state_dict().items(): if key in model_dict.keys(): new_dict[key] = (model_dict[key] * (1 - momentum) + value * momentum) else: raise Exception("{} is not found in student model".format(key)) self.teacher.set_dict(new_dict) ================================================ FILE: ppdet/modeling/architectures/picodet.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['PicoDet'] @register class PicoDet(BaseArch): """ Generalized Focal Loss network, see https://arxiv.org/abs/2006.04388 Args: backbone (object): backbone instance neck (object): 'FPN' instance head (object): 'PicoHead' instance """ __category__ = 'architecture' def __init__(self, backbone, neck, head='PicoHead', nms_cpu=False): super(PicoDet, self).__init__() self.backbone = backbone self.neck = neck self.head = head self.export_post_process = True self.export_nms = True self.nms_cpu = nms_cpu @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} head = create(cfg['head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "head": head, } def _forward(self): body_feats = self.backbone(self.inputs) fpn_feats = self.neck(body_feats) head_outs = self.head(fpn_feats, self.export_post_process) if self.training or not self.export_post_process: return head_outs, None else: scale_factor = self.inputs['scale_factor'] bboxes, bbox_num = self.head.post_process( head_outs, scale_factor, export_nms=self.export_nms, nms_cpu=self.nms_cpu) return bboxes, bbox_num def get_loss(self, ): loss = {} head_outs, _ = self._forward() loss_gfl = self.head.get_loss(head_outs, self.inputs) loss.update(loss_gfl) total_loss = paddle.add_n(list(loss.values())) loss.update({'loss': total_loss}) return loss def get_pred(self): if not self.export_post_process: return {'picodet': self._forward()[0]} elif self.export_nms: bbox_pred, bbox_num = self._forward() output = {'bbox': bbox_pred, 'bbox_num': bbox_num} return output else: bboxes, mlvl_scores = self._forward() output = {'bbox': bboxes, 'scores': mlvl_scores} return output ================================================ FILE: ppdet/modeling/architectures/pose3d_metro.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, create from .meta_arch import BaseArch from .. 
import layers as L __all__ = ['METRO_Body'] def orthographic_projection(X, camera): """Perform orthographic projection of 3D points X using the camera parameters Args: X: size = [B, N, 3] camera: size = [B, 3] Returns: Projected 2D points -- size = [B, N, 2] """ camera = camera.reshape((-1, 1, 3)) X_trans = X[:, :, :2] + camera[:, :, 1:] shape = X_trans.shape X_2d = (camera[:, :, 0] * X_trans.reshape((shape[0], -1))).reshape(shape) return X_2d @register class METRO_Body(BaseArch): __category__ = 'architecture' __inject__ = ['loss'] def __init__( self, num_joints, backbone='HRNet', trans_encoder='', loss='Pose3DLoss', ): """ Modified from METRO network, see https://arxiv.org/abs/2012.09760 Args: backbone (nn.Layer): backbone instance """ super(METRO_Body, self).__init__() self.num_joints = num_joints self.backbone = backbone self.loss = loss self.deploy = False self.trans_encoder = trans_encoder self.conv_learn_tokens = paddle.nn.Conv1D(49, num_joints + 10, 1) self.cam_param_fc = paddle.nn.Linear(3, 2) @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) trans_encoder = create(cfg['trans_encoder']) return {'backbone': backbone, 'trans_encoder': trans_encoder} def _forward(self): batch_size = self.inputs['image'].shape[0] image_feat = self.backbone(self.inputs) image_feat_flatten = image_feat.reshape((batch_size, 2048, 49)) image_feat_flatten = image_feat_flatten.transpose(perm=(0, 2, 1)) # and apply a conv layer to learn image token for each 3d joint/vertex position features = self.conv_learn_tokens(image_feat_flatten) # (B, J, C) if self.training: # apply mask vertex/joint modeling # meta_masks is a tensor of all the masks, randomly generated in dataloader # we pre-define a [MASK] token, which is a floating-value vector with 0.01s meta_masks = self.inputs['mjm_mask'].expand((-1, -1, 2048)) constant_tensor = paddle.ones_like(features) * 0.01 features = features * meta_masks + constant_tensor * (1 - meta_masks ) pred_out = self.trans_encoder(features) pred_3d_joints = pred_out[:, :self.num_joints, :] cam_features = pred_out[:, self.num_joints:, :] # learn camera parameters pred_2d_joints = self.cam_param_fc(cam_features) return pred_3d_joints, pred_2d_joints def get_loss(self): preds_3d, preds_2d = self._forward() loss = self.loss(preds_3d, preds_2d, self.inputs) output = {'loss': loss} return output def get_pred(self): preds_3d, preds_2d = self._forward() outputs = {'pose3d': preds_3d, 'pose2d': preds_2d} return outputs ================================================ FILE: ppdet/modeling/architectures/ppyoloe.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
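# --- Illustrative sketch (hypothetical, not from the original sources): a
# minimal numpy restatement of orthographic_projection used by METRO_Body
# above. A weak-perspective camera (s, tx, ty) scales the translated x/y
# components of the 3D joints.
import numpy as np

def weak_perspective(X, camera):
    # X: (B, N, 3) 3D points, camera: (B, 3) = (scale, tx, ty)
    s = camera[:, None, :1]            # (B, 1, 1) per-sample scale
    t = camera[:, None, 1:]            # (B, 1, 2) per-sample translation
    return s * (X[:, :, :2] + t)       # (B, N, 2) projected 2D points
# ---------------------------------------------------------------------------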
from __future__ import absolute_import from __future__ import division from __future__ import print_function import copy import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['PPYOLOE', 'PPYOLOEWithAuxHead'] # PP-YOLOE and PP-YOLOE+ are recommended to use this architecture, especially when use distillation or aux head # PP-YOLOE and PP-YOLOE+ can also use the same architecture of YOLOv3 in yolo.py when not use distillation or aux head @register class PPYOLOE(BaseArch): """ PPYOLOE network, see https://arxiv.org/abs/2203.16250 Args: backbone (nn.Layer): backbone instance neck (nn.Layer): neck instance yolo_head (nn.Layer): anchor_head instance post_process (object): `BBoxPostProcess` instance ssod_loss (object): 'SSODPPYOLOELoss' instance, only used for semi-det(ssod) for_distill (bool): whether for distillation feat_distill_place (str): distill which feature for distillation for_mot (bool): whether return other features for multi-object tracking models, default False in pure object detection models. """ __category__ = 'architecture' __shared__ = ['for_distill'] __inject__ = ['post_process', 'ssod_loss'] def __init__(self, backbone='CSPResNet', neck='CustomCSPPAN', yolo_head='PPYOLOEHead', post_process='BBoxPostProcess', ssod_loss='SSODPPYOLOELoss', for_distill=False, feat_distill_place='neck_feats', with_mask=False, for_mot=False): super(PPYOLOE, self).__init__() self.backbone = backbone self.neck = neck self.yolo_head = yolo_head self.post_process = post_process self.for_mot = for_mot self.with_mask = with_mask # for ssod, semi-det self.is_teacher = False self.ssod_loss = ssod_loss # distill self.for_distill = for_distill self.feat_distill_place = feat_distill_place if for_distill: assert feat_distill_place in ['backbone_feats', 'neck_feats'] @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} yolo_head = create(cfg['yolo_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "yolo_head": yolo_head, } def _forward(self): body_feats = self.backbone(self.inputs) neck_feats = self.neck(body_feats, self.for_mot) self.is_teacher = self.inputs.get('is_teacher', False) # for semi-det if self.training or self.is_teacher: yolo_losses = self.yolo_head(neck_feats, self.inputs) if self.for_distill: if self.feat_distill_place == 'backbone_feats': self.yolo_head.distill_pairs['backbone_feats'] = body_feats elif self.feat_distill_place == 'neck_feats': self.yolo_head.distill_pairs['neck_feats'] = neck_feats else: raise ValueError return yolo_losses else: yolo_head_outs = self.yolo_head(neck_feats) if self.post_process is not None: bbox, bbox_num, nms_keep_idx = self.post_process( yolo_head_outs, self.yolo_head.mask_anchors, self.inputs['im_shape'], self.inputs['scale_factor']) else: if not self.with_mask: bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process( yolo_head_outs, self.inputs['scale_factor']) else: bbox, bbox_num, mask, nms_keep_idx = self.yolo_head.post_process( yolo_head_outs, im_shape=self.inputs['im_shape'], scale_factor=self.inputs['scale_factor'], infer_shape=self.inputs['image'].shape[2:]) if not self.with_mask: output = {'bbox': bbox, 'bbox_num': bbox_num} else: output = {'bbox': bbox, 'bbox_num': bbox_num, 'mask': mask} if self.with_mask: output['mask'] = mask return output def get_loss(self): return self._forward() def get_pred(self): return self._forward() 
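# NOTE: training and inference share _forward(); which branch runs depends on
# self.training and the 'is_teacher' flag injected by the SSOD trainer, so
# get_loss() and get_pred() above are thin wrappers around it.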
def get_loss_keys(self): return ['loss_cls', 'loss_iou', 'loss_dfl', 'loss_contrast'] def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg): ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs, train_cfg) return ssod_losses @register class PPYOLOEWithAuxHead(BaseArch): __category__ = 'architecture' __inject__ = ['post_process'] def __init__(self, backbone='CSPResNet', neck='CustomCSPPAN', yolo_head='PPYOLOEHead', aux_head='SimpleConvHead', post_process='BBoxPostProcess', for_mot=False, detach_epoch=5): """ PPYOLOE network, see https://arxiv.org/abs/2203.16250 Args: backbone (nn.Layer): backbone instance neck (nn.Layer): neck instance yolo_head (nn.Layer): anchor_head instance post_process (object): `BBoxPostProcess` instance for_mot (bool): whether return other features for multi-object tracking models, default False in pure object detection models. """ super(PPYOLOEWithAuxHead, self).__init__() self.backbone = backbone self.neck = neck self.aux_neck = copy.deepcopy(self.neck) self.yolo_head = yolo_head self.aux_head = aux_head self.post_process = post_process self.for_mot = for_mot self.detach_epoch = detach_epoch @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) # fpn kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) aux_neck = copy.deepcopy(neck) # head kwargs = {'input_shape': neck.out_shape} yolo_head = create(cfg['yolo_head'], **kwargs) aux_head = create(cfg['aux_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "yolo_head": yolo_head, 'aux_head': aux_head, } def _forward(self): body_feats = self.backbone(self.inputs) neck_feats = self.neck(body_feats, self.for_mot) if self.training: if self.inputs['epoch_id'] >= self.detach_epoch: aux_neck_feats = self.aux_neck([f.detach() for f in body_feats]) dual_neck_feats = (paddle.concat( [f.detach(), aux_f], axis=1) for f, aux_f in zip(neck_feats, aux_neck_feats)) else: aux_neck_feats = self.aux_neck(body_feats) dual_neck_feats = (paddle.concat( [f, aux_f], axis=1) for f, aux_f in zip(neck_feats, aux_neck_feats)) aux_cls_scores, aux_bbox_preds = self.aux_head(dual_neck_feats) loss = self.yolo_head( neck_feats, self.inputs, aux_pred=[aux_cls_scores, aux_bbox_preds]) return loss else: yolo_head_outs = self.yolo_head(neck_feats) if self.post_process is not None: bbox, bbox_num = self.post_process( yolo_head_outs, self.yolo_head.mask_anchors, self.inputs['im_shape'], self.inputs['scale_factor']) else: bbox, bbox_num = self.yolo_head.post_process( yolo_head_outs, self.inputs['scale_factor']) output = {'bbox': bbox, 'bbox_num': bbox_num} return output def get_loss(self): return self._forward() def get_pred(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/queryinst.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
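# --- Illustrative sketch (not from the repository) -------------------------
# A minimal, registry-free sketch of the dual-neck trick in PPYOLOEWithAuxHead
# above: past `detach_epoch`, the main branch is detached before the
# channel-wise concat, so the auxiliary head stops backpropagating into it.
# All shapes and the 96-channel width are stand-ins, not values from the repo.

import paddle

neck_feats = [paddle.rand([2, 96, 20, 20]), paddle.rand([2, 96, 10, 10])]
aux_neck_feats = [paddle.rand([2, 96, 20, 20]), paddle.rand([2, 96, 10, 10])]

epoch_id, detach_epoch = 6, 5
detach = epoch_id >= detach_epoch
dual_neck_feats = [
    paddle.concat([f.detach() if detach else f, aux_f], axis=1)
    for f, aux_f in zip(neck_feats, aux_neck_feats)
]
assert dual_neck_feats[0].shape == [2, 192, 20, 20]  # channels doubled
# --- end of sketch ----------------------------------------------------------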
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['QueryInst'] @register class QueryInst(BaseArch): __category__ = 'architecture' __inject__ = ['post_process'] def __init__(self, backbone, neck, rpn_head, roi_head, post_process='SparsePostProcess'): super(QueryInst, self).__init__() self.backbone = backbone self.neck = neck self.rpn_head = rpn_head self.roi_head = roi_head self.post_process = post_process @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} rpn_head = create(cfg['rpn_head'], **kwargs) roi_head = create(cfg['roi_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, 'rpn_head': rpn_head, "roi_head": roi_head } def _forward(self, targets=None): features = self.backbone(self.inputs) features = self.neck(features) proposal_bboxes, proposal_features = self.rpn_head(self.inputs[ 'img_whwh']) outputs = self.roi_head(features, proposal_bboxes, proposal_features, targets) if self.training: return outputs else: bbox_pred, bbox_num, mask_pred = self.post_process( outputs['class_logits'], outputs['bbox_pred'], self.inputs['scale_factor_whwh'], self.inputs['ori_shape'], outputs['mask_logits']) return bbox_pred, bbox_num, mask_pred def get_loss(self): targets = [] for i in range(len(self.inputs['img_whwh'])): boxes = self.inputs['gt_bbox'][i] labels = self.inputs['gt_class'][i].squeeze(-1) img_whwh = self.inputs['img_whwh'][i] if boxes.shape[0] != 0: img_whwh_tgt = img_whwh.unsqueeze(0).tile([boxes.shape[0], 1]) else: img_whwh_tgt = paddle.zeros_like(boxes) gt_segm = self.inputs['gt_segm'][i].astype('float32') targets.append({ 'boxes': boxes, 'labels': labels, 'img_whwh': img_whwh, 'img_whwh_tgt': img_whwh_tgt, 'gt_segm': gt_segm }) losses = self._forward(targets) losses.update({'loss': sum(losses.values())}) return losses def get_pred(self): bbox_pred, bbox_num, mask_pred = self._forward() return {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred} ================================================ FILE: ppdet/modeling/architectures/retinanet.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
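# --- Illustrative sketch (not from the repository) -------------------------
# QueryInst.get_loss above tiles each image's `img_whwh` ([w, h, w, h]) once
# per ground-truth box, so every target carries its own normalizer. The same
# step in isolation, with dummy boxes and labels:

import paddle

boxes = paddle.to_tensor([[10., 20., 50., 80.], [0., 0., 30., 30.]])  # (k, 4)
labels = paddle.to_tensor([3, 7])
img_whwh = paddle.to_tensor([640., 480., 640., 480.])

if boxes.shape[0] != 0:
    img_whwh_tgt = img_whwh.unsqueeze(0).tile([boxes.shape[0], 1])  # (k, 4)
else:
    img_whwh_tgt = paddle.zeros_like(boxes)

target = {'boxes': boxes, 'labels': labels,
          'img_whwh': img_whwh, 'img_whwh_tgt': img_whwh_tgt}
assert target['img_whwh_tgt'].shape == [2, 4]
# --- end of sketch ----------------------------------------------------------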
from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch import paddle import paddle.nn.functional as F __all__ = ['RetinaNet'] @register class RetinaNet(BaseArch): __category__ = 'architecture' def __init__(self, backbone, neck, head): super(RetinaNet, self).__init__() self.backbone = backbone self.neck = neck self.head = head @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} head = create(cfg['head'], **kwargs) return { 'backbone': backbone, 'neck': neck, 'head': head, } def _forward(self): body_feats = self.backbone(self.inputs) neck_feats = self.neck(body_feats) if self.training: return self.head(neck_feats, self.inputs) else: head_outs = self.head(neck_feats) bbox, bbox_num, nms_keep_idx = self.head.post_process( head_outs, self.inputs['im_shape'], self.inputs['scale_factor']) if self.use_extra_data: extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx """extra_data:{ 'scores': predict scores, 'nms_keep_idx': bbox index before nms, } """ preds_logits = self.head.decode_cls_logits(head_outs[0]) preds_scores = F.sigmoid(preds_logits) extra_data['logits'] = preds_logits extra_data['scores'] = preds_scores extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms return {'bbox': bbox, 'bbox_num': bbox_num, "extra_data": extra_data} else: return {'bbox': bbox, 'bbox_num': bbox_num} def get_loss(self): return self._forward() def get_pred(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/rtdetrv3.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
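# --- Illustrative sketch (not from the repository) -------------------------
# RetinaNet.from_config above is the cleanest instance of the pattern every
# architecture in this directory uses: a component's `out_shape` becomes the
# next component's `input_shape`. A registry-free mock of that threading;
# class names and channel counts are invented:

class _Backbone:
    out_shape = [512, 1024, 2048]          # per-level channels, stand-ins

class _Neck:
    def __init__(self, input_shape):
        self.out_shape = [256] * len(input_shape)

class _Head:
    def __init__(self, input_shape):
        self.num_levels = len(input_shape)

backbone = _Backbone()
neck = _Neck(input_shape=backbone.out_shape)   # mirrors create(cfg['neck'], **kwargs)
head = _Head(input_shape=neck.out_shape)       # mirrors create(cfg['head'], **kwargs)
assert head.num_levels == 3
# --- end of sketch ----------------------------------------------------------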
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle

from .meta_arch import BaseArch
from ppdet.core.workspace import register, create

__all__ = ['RTDETRV3']
# Deformable DETR and DINO use the same architecture as DETR


@register
class RTDETRV3(BaseArch):
    __category__ = 'architecture'
    __inject__ = ['post_process', 'post_process_semi']
    __shared__ = ['with_mask', 'exclude_post_process']

    def __init__(self,
                 backbone,
                 transformer='DETRTransformer',
                 detr_head='DETRHead',
                 neck=None,
                 aux_o2m_head=None,
                 post_process='DETRPostProcess',
                 post_process_semi=None,
                 with_mask=False,
                 exclude_post_process=False):
        super(RTDETRV3, self).__init__()
        self.backbone = backbone
        self.transformer = transformer
        self.detr_head = detr_head
        self.neck = neck
        self.aux_o2m_head = aux_o2m_head
        self.post_process = post_process
        self.with_mask = with_mask
        self.exclude_post_process = exclude_post_process
        self.post_process_semi = post_process_semi

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # backbone
        backbone = create(cfg['backbone'])
        # neck
        kwargs = {'input_shape': backbone.out_shape}
        neck = create(cfg['neck'], **kwargs) if cfg['neck'] else None
        # transformer
        if neck is not None:
            kwargs = {'input_shape': neck.out_shape}
        transformer = create(cfg['transformer'], **kwargs)
        # head
        kwargs = {
            'hidden_dim': transformer.hidden_dim,
            'nhead': transformer.nhead,
            'input_shape': backbone.out_shape
        }
        detr_head = create(cfg['detr_head'], **kwargs)
        kwargs = {'input_shape': neck.out_shape}
        aux_o2m_head = create(cfg['aux_o2m_head'], **kwargs)
        return {
            'backbone': backbone,
            'transformer': transformer,
            "detr_head": detr_head,
            "neck": neck,
            "aux_o2m_head": aux_o2m_head
        }

    def _forward(self):
        # Backbone
        body_feats = self.backbone(self.inputs)
        # Neck
        if self.neck is not None:
            body_feats = self.neck(body_feats)
        # Transformer
        pad_mask = self.inputs.get('pad_mask', None)
        out_transformer = self.transformer(body_feats, pad_mask, self.inputs)
        # DETR Head
        if self.training:
            detr_losses = self.detr_head(out_transformer, body_feats,
                                         self.inputs)
            detr_losses.update({
                'loss': paddle.add_n(
                    [v for k, v in detr_losses.items() if 'log' not in k])
            })
            if self.aux_o2m_head is not None:
                aux_o2m_losses = self.aux_o2m_head(body_feats, self.inputs)
                for k, v in aux_o2m_losses.items():
                    if k == 'loss':
                        detr_losses[k] += v
                    k = k + '_aux_o2m'
                    detr_losses[k] = v
            return detr_losses
        else:
            preds = self.detr_head(out_transformer, body_feats)
            if self.exclude_post_process:
                bbox, bbox_num, mask = preds
            else:
                # pad shape is the trailing (H, W) of the padded input batch
                bbox, bbox_num, mask = self.post_process(
                    preds, self.inputs['im_shape'],
                    self.inputs['scale_factor'],
                    self.inputs['image'].shape[2:])
            # aux_o2m_outs = self.aux_o2m_head(body_feats)
            # bbox, bbox_num, nms_keep_idx = self.aux_o2m_head.post_process(
            #     aux_o2m_outs, self.inputs['scale_factor'])
            output = {'bbox': bbox, 'bbox_num': bbox_num}
            if self.with_mask:
                output['mask'] = mask
            return output

    def get_loss(self):
        return self._forward()

    def get_pred(self):
        return self._forward()

================================================ FILE: ppdet/modeling/architectures/s2anet.py ================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['S2ANet'] @register class S2ANet(BaseArch): __category__ = 'architecture' __inject__ = ['head'] def __init__(self, backbone, neck, head): """ S2ANet, see https://arxiv.org/pdf/2008.09397.pdf Args: backbone (object): backbone instance neck (object): `FPN` instance head (object): `Head` instance """ super(S2ANet, self).__init__() self.backbone = backbone self.neck = neck self.s2anet_head = head @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = cfg['neck'] and create(cfg['neck'], **kwargs) out_shape = neck and neck.out_shape or backbone.out_shape kwargs = {'input_shape': out_shape} head = create(cfg['head'], **kwargs) return {'backbone': backbone, 'neck': neck, "head": head} def _forward(self): body_feats = self.backbone(self.inputs) if self.neck is not None: body_feats = self.neck(body_feats) if self.training: loss = self.s2anet_head(body_feats, self.inputs) return loss else: head_outs = self.s2anet_head(body_feats) # post_process bboxes, bbox_num = self.s2anet_head.get_bboxes(head_outs) # rescale the prediction back to origin image im_shape = self.inputs['im_shape'] scale_factor = self.inputs['scale_factor'] bboxes = self.s2anet_head.get_pred(bboxes, bbox_num, im_shape, scale_factor) # output output = {'bbox': bboxes, 'bbox_num': bbox_num} return output def get_loss(self, ): loss = self._forward() return loss def get_pred(self): output = self._forward() return output ================================================ FILE: ppdet/modeling/architectures/solov2.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
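# --- Illustrative sketch (not from the repository) -------------------------
# Back in RTDETRV3._forward, the auxiliary one-to-many head's losses are
# folded into the DETR loss dict: the aux total is added into 'loss', and
# every aux entry is also kept under an '_aux_o2m' suffix for logging. The
# same bookkeeping with plain dicts and dummy values:

detr_losses = {'loss_class': 1.0, 'loss_bbox': 2.0, 'loss': 3.0}
aux_o2m_losses = {'loss_cls': 0.5, 'loss_iou': 0.2, 'loss': 0.7}

for k, v in aux_o2m_losses.items():
    if k == 'loss':
        detr_losses[k] += v          # fold the aux total into the main loss
    detr_losses[k + '_aux_o2m'] = v  # keep the aux entry for logging

assert abs(detr_losses['loss'] - 3.7) < 1e-9
assert detr_losses['loss_iou_aux_o2m'] == 0.2
# --- end of sketch ----------------------------------------------------------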
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['SOLOv2'] @register class SOLOv2(BaseArch): """ SOLOv2 network, see https://arxiv.org/abs/2003.10152 Args: backbone (object): an backbone instance solov2_head (object): an `SOLOv2Head` instance mask_head (object): an `SOLOv2MaskHead` instance neck (object): neck of network, such as feature pyramid network instance """ __category__ = 'architecture' def __init__(self, backbone, solov2_head, mask_head, neck=None): super(SOLOv2, self).__init__() self.backbone = backbone self.neck = neck self.solov2_head = solov2_head self.mask_head = mask_head @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} solov2_head = create(cfg['solov2_head'], **kwargs) mask_head = create(cfg['mask_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, 'solov2_head': solov2_head, 'mask_head': mask_head, } def model_arch(self): body_feats = self.backbone(self.inputs) body_feats = self.neck(body_feats) self.seg_pred = self.mask_head(body_feats) self.cate_pred_list, self.kernel_pred_list = self.solov2_head( body_feats) def get_loss(self, ): loss = {} # get gt_ins_labels, gt_cate_labels, etc. gt_ins_labels, gt_cate_labels, gt_grid_orders = [], [], [] fg_num = self.inputs['fg_num'] for i in range(len(self.solov2_head.seg_num_grids)): ins_label = 'ins_label{}'.format(i) if ins_label in self.inputs: gt_ins_labels.append(self.inputs[ins_label]) cate_label = 'cate_label{}'.format(i) if cate_label in self.inputs: gt_cate_labels.append(self.inputs[cate_label]) grid_order = 'grid_order{}'.format(i) if grid_order in self.inputs: gt_grid_orders.append(self.inputs[grid_order]) loss_solov2 = self.solov2_head.get_loss( self.cate_pred_list, self.kernel_pred_list, self.seg_pred, gt_ins_labels, gt_cate_labels, gt_grid_orders, fg_num) loss.update(loss_solov2) total_loss = paddle.add_n(list(loss.values())) loss.update({'loss': total_loss}) return loss def get_pred(self): seg_masks, cate_labels, cate_scores, bbox_num = self.solov2_head.get_prediction( self.cate_pred_list, self.kernel_pred_list, self.seg_pred, self.inputs['im_shape'], self.inputs['scale_factor']) outs = { "segm": seg_masks, "bbox_num": bbox_num, 'cate_label': cate_labels, 'cate_score': cate_scores } return outs ================================================ FILE: ppdet/modeling/architectures/sparse_rcnn.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
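# --- Illustrative sketch (not from the repository) -------------------------
# SOLOv2.get_loss above looks up per-level targets by formatted key
# ('ins_label0', 'cate_label0', ...), skipping levels that are absent. A
# stand-alone rendering of that loop over a fake inputs dict:

seg_num_grids = [40, 36, 24]   # one entry per FPN level; values are dummies
inputs = {'ins_label0': 'I0', 'cate_label0': 'C0', 'grid_order0': 'G0',
          'ins_label1': 'I1', 'cate_label1': 'C1', 'grid_order1': 'G1'}

gt_ins, gt_cate, gt_grid = [], [], []
for i in range(len(seg_num_grids)):
    for name, dst in (('ins_label', gt_ins), ('cate_label', gt_cate),
                      ('grid_order', gt_grid)):
        key = '{}{}'.format(name, i)
        if key in inputs:          # a level may carry no targets
            dst.append(inputs[key])

assert gt_ins == ['I0', 'I1']      # level 2 had no targets in this dummy dict
# --- end of sketch ----------------------------------------------------------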
from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ["SparseRCNN"] @register class SparseRCNN(BaseArch): __category__ = 'architecture' __inject__ = ["postprocess"] def __init__(self, backbone, neck, head="SparsercnnHead", postprocess="SparsePostProcess"): super(SparseRCNN, self).__init__() self.backbone = backbone self.neck = neck self.head = head self.postprocess = postprocess @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'roi_input_shape': neck.out_shape} head = create(cfg['head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "head": head, } def _forward(self): body_feats = self.backbone(self.inputs) fpn_feats = self.neck(body_feats) head_outs = self.head(fpn_feats, self.inputs["img_whwh"]) if not self.training: bbox_pred, bbox_num = self.postprocess( head_outs["pred_logits"], head_outs["pred_boxes"], self.inputs["scale_factor_whwh"], self.inputs["ori_shape"]) return bbox_pred, bbox_num else: return head_outs def get_loss(self): batch_gt_class = self.inputs["gt_class"] batch_gt_box = self.inputs["gt_bbox"] batch_whwh = self.inputs["img_whwh"] targets = [] for i in range(len(batch_gt_class)): boxes = batch_gt_box[i] labels = batch_gt_class[i].squeeze(-1) img_whwh = batch_whwh[i] img_whwh_tgt = img_whwh.unsqueeze(0).tile([int(boxes.shape[0]), 1]) targets.append({ "boxes": boxes, "labels": labels, "img_whwh": img_whwh, "img_whwh_tgt": img_whwh_tgt }) outputs = self._forward() loss_dict = self.head.get_loss(outputs, targets) acc = loss_dict["acc"] loss_dict.pop("acc") total_loss = sum(loss_dict.values()) loss_dict.update({"loss": total_loss, "acc": acc}) return loss_dict def get_pred(self): bbox_pred, bbox_num = self._forward() output = {'bbox': bbox_pred, 'bbox_num': bbox_num} return output ================================================ FILE: ppdet/modeling/architectures/ssd.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
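# --- Illustrative sketch (not from the repository) -------------------------
# SparseRCNN.get_loss above sums every term except the bookkeeping 'acc'
# entry, then restores 'acc' for logging. With dummy numbers:

loss_dict = {'loss_ce': 1.0, 'loss_bbox': 2.0, 'loss_giou': 0.5, 'acc': 0.9}

acc = loss_dict.pop('acc')             # a metric, not a loss term
total_loss = sum(loss_dict.values())
loss_dict.update({'loss': total_loss, 'acc': acc})

assert loss_dict['loss'] == 3.5 and loss_dict['acc'] == 0.9
# --- end of sketch ----------------------------------------------------------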
from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch import paddle import paddle.nn.functional as F __all__ = ['SSD'] @register class SSD(BaseArch): """ Single Shot MultiBox Detector, see https://arxiv.org/abs/1512.02325 Args: backbone (nn.Layer): backbone instance ssd_head (nn.Layer): `SSDHead` instance post_process (object): `BBoxPostProcess` instance """ __category__ = 'architecture' __inject__ = ['post_process'] def __init__(self, backbone, ssd_head, post_process, r34_backbone=False): super(SSD, self).__init__() self.backbone = backbone self.ssd_head = ssd_head self.post_process = post_process self.r34_backbone = r34_backbone if self.r34_backbone: from ppdet.modeling.backbones.resnet import ResNet assert isinstance(self.backbone, ResNet) and \ self.backbone.depth == 34, \ "If you set r34_backbone=True, please use ResNet-34 as backbone." self.backbone.res_layers[2].blocks[0].branch2a.conv._stride = [1, 1] self.backbone.res_layers[2].blocks[0].short.conv._stride = [1, 1] @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) # head kwargs = {'input_shape': backbone.out_shape} ssd_head = create(cfg['ssd_head'], **kwargs) return { 'backbone': backbone, "ssd_head": ssd_head, } def _forward(self): # Backbone body_feats = self.backbone(self.inputs) # SSD Head if self.training: return self.ssd_head(body_feats, self.inputs['image'], self.inputs['gt_bbox'], self.inputs['gt_class']) else: preds, anchors = self.ssd_head(body_feats, self.inputs['image']) bbox, bbox_num, nms_keep_idx = self.post_process( preds, anchors, self.inputs['im_shape'], self.inputs['scale_factor']) if self.use_extra_data: extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx """extra_data:{ 'scores': predict scores, 'nms_keep_idx': bbox index before nms, } """ preds_logits = preds[1] # [[1xNumBBoxNumClass]] extra_data['scores'] = F.softmax(paddle.concat( preds_logits, axis=1)).transpose([0, 2, 1]) extra_data['logits'] = paddle.concat( preds_logits, axis=1).transpose([0, 2, 1]) extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms return bbox, bbox_num, extra_data else: return bbox, bbox_num def get_loss(self, ): return {"loss": self._forward()} def get_pred(self): if self.use_extra_data: bbox_pred, bbox_num, extra_data = self._forward() output = { "bbox": bbox_pred, "bbox_num": bbox_num, "extra_data": extra_data } else: bbox_pred, bbox_num = self._forward() output = { "bbox": bbox_pred, "bbox_num": bbox_num, } return output ================================================ FILE: ppdet/modeling/architectures/tood.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
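# --- Illustrative sketch (not from the repository) -------------------------
# SSD's `use_extra_data` branch above concatenates the per-level class
# logits, softmaxes over the class axis, and transposes to
# (batch, classes, anchors). A shape-only sketch; the per-level anchor
# counts and the 21 classes are made up:

import paddle
import paddle.nn.functional as F

# two levels of per-anchor class logits: (B, N_i, num_classes)
preds_logits = [paddle.randn([1, 100, 21]), paddle.randn([1, 25, 21])]

logits = paddle.concat(preds_logits, axis=1)        # (1, 125, 21)
scores = F.softmax(logits).transpose([0, 2, 1])     # (1, 21, 125)
assert scores.shape == [1, 21, 125]
# --- end of sketch ----------------------------------------------------------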
from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['TOOD'] @register class TOOD(BaseArch): """ TOOD: Task-aligned One-stage Object Detection, see https://arxiv.org/abs/2108.07755 Args: backbone (nn.Layer): backbone instance neck (nn.Layer): 'FPN' instance head (nn.Layer): 'TOODHead' instance """ __category__ = 'architecture' def __init__(self, backbone, neck, head): super(TOOD, self).__init__() self.backbone = backbone self.neck = neck self.head = head @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} head = create(cfg['head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "head": head, } def _forward(self): body_feats = self.backbone(self.inputs) fpn_feats = self.neck(body_feats) head_outs = self.head(fpn_feats) if not self.training: bboxes, bbox_num = self.head.post_process( head_outs, self.inputs['im_shape'], self.inputs['scale_factor']) return bboxes, bbox_num else: loss = self.head.get_loss(head_outs, self.inputs) return loss def get_loss(self): return self._forward() def get_pred(self): bbox_pred, bbox_num = self._forward() output = {'bbox': bbox_pred, 'bbox_num': bbox_num} return output ================================================ FILE: ppdet/modeling/architectures/ttfnet.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
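# --- Illustrative sketch (not from the repository) -------------------------
# TOOD.get_pred above emits the result dict shared by the architectures in
# this directory. By PaddleDetection convention, 'bbox' is an (N, 6) tensor
# of [class_id, score, x1, y1, x2, y2] rows for the whole batch and
# 'bbox_num' gives the per-image counts used to split it. A dummy instance
# of that contract:

import paddle

bbox = paddle.to_tensor([[0., 0.9, 10., 10., 50., 50.],
                         [2., 0.7, 20., 20., 60., 80.],
                         [1., 0.8,  5.,  5., 15., 15.]])
bbox_num = paddle.to_tensor([2, 1])   # image 0 has 2 boxes, image 1 has 1

output = {'bbox': bbox, 'bbox_num': bbox_num}
assert int(output['bbox_num'].sum()) == output['bbox'].shape[0]
# --- end of sketch ----------------------------------------------------------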
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['TTFNet'] @register class TTFNet(BaseArch): """ TTFNet network, see https://arxiv.org/abs/1909.00700 Args: backbone (object): backbone instance neck (object): 'TTFFPN' instance ttf_head (object): 'TTFHead' instance post_process (object): 'BBoxPostProcess' instance """ __category__ = 'architecture' __inject__ = ['post_process'] def __init__(self, backbone='DarkNet', neck='TTFFPN', ttf_head='TTFHead', post_process='BBoxPostProcess'): super(TTFNet, self).__init__() self.backbone = backbone self.neck = neck self.ttf_head = ttf_head self.post_process = post_process @classmethod def from_config(cls, cfg, *args, **kwargs): backbone = create(cfg['backbone']) kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) kwargs = {'input_shape': neck.out_shape} ttf_head = create(cfg['ttf_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "ttf_head": ttf_head, } def _forward(self): body_feats = self.backbone(self.inputs) body_feats = self.neck(body_feats) hm, wh = self.ttf_head(body_feats) if self.training: return hm, wh else: bbox, bbox_num = self.post_process(hm, wh, self.inputs['im_shape'], self.inputs['scale_factor']) return bbox, bbox_num def get_loss(self, ): loss = {} heatmap = self.inputs['ttf_heatmap'] box_target = self.inputs['ttf_box_target'] reg_weight = self.inputs['ttf_reg_weight'] hm, wh = self._forward() head_loss = self.ttf_head.get_loss(hm, wh, heatmap, box_target, reg_weight) loss.update(head_loss) total_loss = paddle.add_n(list(loss.values())) loss.update({'loss': total_loss}) return loss def get_pred(self): bbox_pred, bbox_num = self._forward() output = { "bbox": bbox_pred, "bbox_num": bbox_num, } return output ================================================ FILE: ppdet/modeling/architectures/yolo.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
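# --- Illustrative sketch (not from the repository) -------------------------
# TTFNet.get_loss above (and SOLOv2 earlier) totals the loss dict with
# paddle.add_n over its values before inserting the 'loss' key. The idiom in
# isolation, with dummy single-element tensors:

import paddle

loss = {'hm_loss': paddle.to_tensor([1.5]), 'wh_loss': paddle.to_tensor([0.5])}
loss['loss'] = paddle.add_n(list(loss.values()))  # values read before insert
assert float(loss['loss']) == 2.0
# --- end of sketch ----------------------------------------------------------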
from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch from ..post_process import JDEBBoxPostProcess __all__ = ['YOLOv3'] # YOLOv3,PP-YOLO,PP-YOLOv2,PP-YOLOE,PP-YOLOE+ use the same architecture as YOLOv3 # PP-YOLOE and PP-YOLOE+ are recommended to use PPYOLOE architecture in ppyoloe.py, especially when use distillation or aux head @register class YOLOv3(BaseArch): __category__ = 'architecture' __shared__ = ['data_format'] __inject__ = ['post_process'] def __init__(self, backbone='DarkNet', neck='YOLOv3FPN', yolo_head='YOLOv3Head', post_process='BBoxPostProcess', data_format='NCHW', for_mot=False): """ YOLOv3 network, see https://arxiv.org/abs/1804.02767 Args: backbone (nn.Layer): backbone instance neck (nn.Layer): neck instance yolo_head (nn.Layer): anchor_head instance bbox_post_process (object): `BBoxPostProcess` instance data_format (str): data format, NCHW or NHWC for_mot (bool): whether return other features for multi-object tracking models, default False in pure object detection models. """ super(YOLOv3, self).__init__(data_format=data_format) self.backbone = backbone self.neck = neck self.yolo_head = yolo_head self.post_process = post_process self.for_mot = for_mot self.return_idx = isinstance(post_process, JDEBBoxPostProcess) @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) # fpn kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) # head kwargs = {'input_shape': neck.out_shape} yolo_head = create(cfg['yolo_head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "yolo_head": yolo_head, } def _forward(self): body_feats = self.backbone(self.inputs) if self.for_mot: neck_feats = self.neck(body_feats, self.for_mot) else: neck_feats = self.neck(body_feats) if isinstance(neck_feats, dict): assert self.for_mot == True emb_feats = neck_feats['emb_feats'] neck_feats = neck_feats['yolo_feats'] if self.training: yolo_losses = self.yolo_head(neck_feats, self.inputs) if self.for_mot: return {'det_losses': yolo_losses, 'emb_feats': emb_feats} else: return yolo_losses else: yolo_head_outs = self.yolo_head(neck_feats) if self.for_mot: # the detection part of JDE MOT model boxes_idx, bbox, bbox_num, nms_keep_idx = self.post_process( yolo_head_outs, self.yolo_head.mask_anchors) output = { 'bbox': bbox, 'bbox_num': bbox_num, 'boxes_idx': boxes_idx, 'nms_keep_idx': nms_keep_idx, 'emb_feats': emb_feats, } else: if self.return_idx: # the detection part of JDE MOT model _, bbox, bbox_num, nms_keep_idx = self.post_process( yolo_head_outs, self.yolo_head.mask_anchors) elif self.post_process is not None: # anchor based YOLOs: YOLOv3,PP-YOLO,PP-YOLOv2 use mask_anchors bbox, bbox_num, nms_keep_idx = self.post_process( yolo_head_outs, self.yolo_head.mask_anchors, self.inputs['im_shape'], self.inputs['scale_factor']) else: # anchor free YOLOs: PP-YOLOE, PP-YOLOE+ bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process( yolo_head_outs, self.inputs['scale_factor']) if self.use_extra_data: extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx """extra_data:{ 'scores': predict scores, 'nms_keep_idx': bbox index before nms, } """ extra_data['scores'] = yolo_head_outs[0] # predict scores (probability) # Todo: get logits output extra_data['nms_keep_idx'] = nms_keep_idx # Todo support for mask_anchors yolo output = {'bbox': bbox, 'bbox_num': bbox_num, 
'extra_data': extra_data} else: output = {'bbox': bbox, 'bbox_num': bbox_num} return output def get_loss(self): return self._forward() def get_pred(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/yolof.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch __all__ = ['YOLOF'] @register class YOLOF(BaseArch): __category__ = 'architecture' def __init__(self, backbone='ResNet', neck='DilatedEncoder', head='YOLOFHead', for_mot=False): """ YOLOF network, see https://arxiv.org/abs/2103.09460 Args: backbone (nn.Layer): backbone instance neck (nn.Layer): DilatedEncoder instance head (nn.Layer): YOLOFHead instance for_mot (bool): whether return other features for multi-object tracking models, default False in pure object detection models. """ super(YOLOF, self).__init__() self.backbone = backbone self.neck = neck self.head = head self.for_mot = for_mot @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) # fpn kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) # head kwargs = {'input_shape': neck.out_shape} head = create(cfg['head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "head": head, } def _forward(self): body_feats = self.backbone(self.inputs) neck_feats = self.neck(body_feats, self.for_mot) if self.training: yolo_losses = self.head(neck_feats, self.inputs) return yolo_losses else: yolo_head_outs = self.head(neck_feats) bbox, bbox_num = self.head.post_process(yolo_head_outs, self.inputs['im_shape'], self.inputs['scale_factor']) output = {'bbox': bbox, 'bbox_num': bbox_num} return output def get_loss(self): return self._forward() def get_pred(self): return self._forward() ================================================ FILE: ppdet/modeling/architectures/yolox.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
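# --- Illustrative sketch (not from the repository) -------------------------
# When YOLOv3 above serves as the detection half of a JDE tracker, the neck
# returns a dict carrying both detection and embedding features; the
# unpacking amounts to the following (placeholder strings stand in for
# feature tensors):

neck_out = {'yolo_feats': ['P3', 'P4', 'P5'], 'emb_feats': ['E3', 'E4', 'E5']}

if isinstance(neck_out, dict):        # only happens when for_mot is True
    emb_feats = neck_out['emb_feats']
    neck_feats = neck_out['yolo_feats']
else:
    neck_feats = neck_out

assert emb_feats[0] == 'E3' and neck_feats[0] == 'P3'
# --- end of sketch ----------------------------------------------------------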
from __future__ import absolute_import from __future__ import division from __future__ import print_function from ppdet.core.workspace import register, create from .meta_arch import BaseArch import random import paddle import paddle.nn.functional as F import paddle.distributed as dist __all__ = ['YOLOX'] @register class YOLOX(BaseArch): """ YOLOX network, see https://arxiv.org/abs/2107.08430 Args: backbone (nn.Layer): backbone instance neck (nn.Layer): neck instance head (nn.Layer): head instance for_mot (bool): whether used for MOT or not input_size (list[int]): initial scale, will be reset by self._preprocess() size_stride (int): stride of the size range size_range (list[int]): multi-scale range for training random_interval (int): interval of iter to change self._input_size """ __category__ = 'architecture' def __init__(self, backbone='CSPDarkNet', neck='YOLOCSPPAN', head='YOLOXHead', for_mot=False, input_size=[640, 640], size_stride=32, size_range=[15, 25], random_interval=10): super(YOLOX, self).__init__() self.backbone = backbone self.neck = neck self.head = head self.for_mot = for_mot self.input_size = input_size self._input_size = paddle.to_tensor(input_size) self.size_stride = size_stride self.size_range = size_range self.random_interval = random_interval self._step = 0 @classmethod def from_config(cls, cfg, *args, **kwargs): # backbone backbone = create(cfg['backbone']) # fpn kwargs = {'input_shape': backbone.out_shape} neck = create(cfg['neck'], **kwargs) # head kwargs = {'input_shape': neck.out_shape} head = create(cfg['head'], **kwargs) return { 'backbone': backbone, 'neck': neck, "head": head, } def _forward(self): if self.training: self._preprocess() body_feats = self.backbone(self.inputs) neck_feats = self.neck(body_feats, self.for_mot) if self.training: yolox_losses = self.head(neck_feats, self.inputs) yolox_losses.update({'size': self._input_size[0]}) return yolox_losses else: head_outs = self.head(neck_feats) bbox, bbox_num = self.head.post_process( head_outs, self.inputs['im_shape'], self.inputs['scale_factor']) return {'bbox': bbox, 'bbox_num': bbox_num} def get_loss(self): return self._forward() def get_pred(self): return self._forward() def _preprocess(self): # YOLOX multi-scale training, interpolate resize before inputs of the network. self._get_size() scale_y = self._input_size[0] / self.input_size[0] scale_x = self._input_size[1] / self.input_size[1] if scale_x != 1 or scale_y != 1: self.inputs['image'] = F.interpolate( self.inputs['image'], size=self._input_size, mode='bilinear', align_corners=False) gt_bboxes = self.inputs['gt_bbox'] for i in range(len(gt_bboxes)): if len(gt_bboxes[i]) > 0: gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y self.inputs['gt_bbox'] = gt_bboxes def _get_size(self): # random_interval = 10 as default, every 10 iters to change self._input_size image_ratio = self.input_size[1] * 1.0 / self.input_size[0] if self._step % self.random_interval == 0: size_factor = random.randint(*self.size_range) size = [ self.size_stride * size_factor, self.size_stride * int(size_factor * image_ratio) ] self._input_size = paddle.to_tensor(size) self._step += 1 ================================================ FILE: ppdet/modeling/assigners/__init__.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import utils from . import task_aligned_assigner from . import atss_assigner from . import simota_assigner from . import max_iou_assigner from . import fcosr_assigner from . import rotated_task_aligned_assigner from . import task_aligned_assigner_cr from . import uniform_assigner from .utils import * from .task_aligned_assigner import * from .atss_assigner import * from .simota_assigner import * from .max_iou_assigner import * from .fcosr_assigner import * from .rotated_task_aligned_assigner import * from .task_aligned_assigner_cr import * from .uniform_assigner import * from .hungarian_assigner import * from .pose_utils import * ================================================ FILE: ppdet/modeling/assigners/atss_assigner.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
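# --- Illustrative sketch (not from the repository) -------------------------
# YOLOX._get_size above redraws the training resolution every
# `random_interval` steps as a random multiple of `size_stride` within
# `size_range`, scaling the second dimension by the original aspect ratio.
# The arithmetic in isolation, using the defaults from the class:

import random

input_size, size_stride, size_range = [640, 640], 32, [15, 25]
image_ratio = input_size[1] * 1.0 / input_size[0]

size_factor = random.randint(*size_range)           # e.g. 15..25
size = [size_stride * size_factor,
        size_stride * int(size_factor * image_ratio)]

assert size[0] % size_stride == 0 and size[1] % size_stride == 0
assert 480 <= size[0] <= 800                        # 32*15 .. 32*25
# --- end of sketch ----------------------------------------------------------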
from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..bbox_utils import iou_similarity, batch_iou_similarity from ..bbox_utils import bbox_center from .utils import (check_points_inside_bboxes, compute_max_iou_anchor, compute_max_iou_gt) __all__ = ['ATSSAssigner'] @register class ATSSAssigner(nn.Layer): """Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection """ __shared__ = ['num_classes'] def __init__(self, topk=9, num_classes=80, force_gt_matching=False, eps=1e-9, sm_use=False): super(ATSSAssigner, self).__init__() self.topk = topk self.num_classes = num_classes self.force_gt_matching = force_gt_matching self.eps = eps self.sm_use = sm_use def _gather_topk_pyramid(self, gt2anchor_distances, num_anchors_list, pad_gt_mask): gt2anchor_distances_list = paddle.split( gt2anchor_distances, num_anchors_list, axis=-1) num_anchors_index = np.cumsum(num_anchors_list).tolist() num_anchors_index = [0, ] + num_anchors_index[:-1] is_in_topk_list = [] topk_idxs_list = [] for distances, anchors_index in zip(gt2anchor_distances_list, num_anchors_index): num_anchors = distances.shape[-1] _, topk_idxs = paddle.topk( distances, self.topk, axis=-1, largest=False) topk_idxs_list.append(topk_idxs + anchors_index) is_in_topk = F.one_hot(topk_idxs, num_anchors).sum( axis=-2).astype(gt2anchor_distances.dtype) is_in_topk_list.append(is_in_topk * pad_gt_mask) is_in_topk_list = paddle.concat(is_in_topk_list, axis=-1) topk_idxs_list = paddle.concat(topk_idxs_list, axis=-1) return is_in_topk_list, topk_idxs_list @paddle.no_grad() def forward(self, anchor_bboxes, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index, gt_scores=None, pred_bboxes=None): r"""This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py The assignment is done in following steps 1. compute iou between all bbox (bbox of all pyramid levels) and gt 2. compute center distance between all bbox and gt 3. on each pyramid level, for each gt, select k bbox whose center are closest to the gt center, so we total select k*l bbox as candidates for each gt 4. get corresponding iou for the these candidates, and compute the mean and std, set mean + std as the iou threshold 5. select these candidates whose iou are greater than or equal to the threshold as positive 6. limit the positive sample's center in gt 7. if an anchor box is assigned to multiple gts, the one with the highest iou will be selected. 
Args: anchor_bboxes (Tensor, float32): pre-defined anchors, shape(L, 4), "xmin, xmax, ymin, ymax" format num_anchors_list (List): num of anchors in each level gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) bg_index (int): background index gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1), if None, then it will initialize with one_hot label pred_bboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 4) Returns: assigned_labels (Tensor): (B, L) assigned_bboxes (Tensor): (B, L, 4) assigned_scores (Tensor): (B, L, C), if pred_bboxes is not None, then output ious """ assert gt_labels.ndim == gt_bboxes.ndim and \ gt_bboxes.ndim == 3 num_anchors, _ = anchor_bboxes.shape batch_size, num_max_boxes, _ = gt_bboxes.shape # negative batch if num_max_boxes == 0: assigned_labels = paddle.full( [batch_size, num_anchors], bg_index, dtype='int32') assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4]) assigned_scores = paddle.zeros( [batch_size, num_anchors, self.num_classes]) return assigned_labels, assigned_bboxes, assigned_scores # 1. compute iou between gt and anchor bbox, [B, n, L] ious = iou_similarity(gt_bboxes.reshape([-1, 4]), anchor_bboxes) ious = ious.reshape([batch_size, -1, num_anchors]) # 2. compute center distance between all anchors and gt, [B, n, L] gt_centers = bbox_center(gt_bboxes.reshape([-1, 4])).unsqueeze(1) anchor_centers = bbox_center(anchor_bboxes) gt2anchor_distances = (gt_centers - anchor_centers.unsqueeze(0)) \ .norm(2, axis=-1).reshape([batch_size, -1, num_anchors]) # 3. on each pyramid level, selecting topk closest candidates # based on the center distance, [B, n, L] is_in_topk, topk_idxs = self._gather_topk_pyramid( gt2anchor_distances, num_anchors_list, pad_gt_mask) # 4. get corresponding iou for the these candidates, and compute the # mean and std, 5. set mean + std as the iou threshold iou_candidates = ious * is_in_topk iou_threshold = paddle.index_sample( iou_candidates.flatten(stop_axis=-2), topk_idxs.flatten(stop_axis=-2)) iou_threshold = iou_threshold.reshape([batch_size, num_max_boxes, -1]) iou_threshold = iou_threshold.mean(axis=-1, keepdim=True) + \ iou_threshold.std(axis=-1, keepdim=True) is_in_topk = paddle.where(iou_candidates > iou_threshold, is_in_topk, paddle.zeros_like(is_in_topk)) # 6. check the positive sample's center in gt, [B, n, L] if self.sm_use: is_in_gts = check_points_inside_bboxes( anchor_centers, gt_bboxes, sm_use=True) else: is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes) # select positive sample, [B, n, L] mask_positive = is_in_topk * is_in_gts * pad_gt_mask # 7. if an anchor box is assigned to multiple gts, # the one with the highest iou will be selected. mask_positive_sum = mask_positive.sum(axis=-2) if mask_positive_sum.max() > 1: mask_multiple_gts = ( mask_positive_sum.unsqueeze(1) > 1).astype('int32').tile( [1, num_max_boxes, 1]).astype('bool') if self.sm_use: is_max_iou = compute_max_iou_anchor(ious * mask_positive) else: is_max_iou = compute_max_iou_anchor(ious) mask_positive = paddle.where(mask_multiple_gts, is_max_iou, mask_positive) mask_positive_sum = mask_positive.sum(axis=-2) # 8. 
make sure every gt_bbox matches the anchor if self.force_gt_matching: is_max_iou = compute_max_iou_gt(ious) * pad_gt_mask mask_max_iou = (is_max_iou.sum(-2, keepdim=True) == 1).tile( [1, num_max_boxes, 1]) mask_positive = paddle.where(mask_max_iou, is_max_iou, mask_positive) mask_positive_sum = mask_positive.sum(axis=-2) assigned_gt_index = mask_positive.argmax(axis=-2) # assigned target batch_ind = paddle.arange( end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype) assigned_labels = paddle.gather( gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) assigned_labels = paddle.where( mask_positive_sum > 0, assigned_labels, paddle.full_like(assigned_labels, bg_index)) assigned_bboxes = paddle.gather( gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0) assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4]) assigned_scores = F.one_hot(assigned_labels, self.num_classes + 1) ind = list(range(self.num_classes + 1)) ind.remove(bg_index) assigned_scores = paddle.index_select( assigned_scores, paddle.to_tensor(ind), axis=-1) if pred_bboxes is not None: # assigned iou ious = batch_iou_similarity(gt_bboxes, pred_bboxes) * mask_positive ious = ious.max(axis=-2).unsqueeze(-1) assigned_scores *= ious elif gt_scores is not None: gather_scores = paddle.gather( gt_scores.flatten(), assigned_gt_index.flatten(), axis=0) gather_scores = gather_scores.reshape([batch_size, num_anchors]) gather_scores = paddle.where(mask_positive_sum > 0, gather_scores, paddle.zeros_like(gather_scores)) assigned_scores *= gather_scores.unsqueeze(-1) return assigned_labels, assigned_bboxes, assigned_scores ================================================ FILE: ppdet/modeling/assigners/clrnet_assigner.py ================================================ import paddle import paddle.nn.functional as F from ppdet.modeling.losses.clrnet_line_iou_loss import line_iou def distance_cost(predictions, targets, img_w): """ repeat predictions and targets to generate all combinations use the abs distance as the new distance cost """ num_priors = predictions.shape[0] num_targets = targets.shape[0] predictions = paddle.repeat_interleave( predictions, num_targets, axis=0)[..., 6:] targets = paddle.concat(x=num_priors * [targets])[..., 6:] invalid_masks = (targets < 0) | (targets >= img_w) lengths = (~invalid_masks).sum(axis=1) distances = paddle.abs(x=targets - predictions) distances[invalid_masks] = 0.0 distances = distances.sum(axis=1) / (lengths.cast("float32") + 1e-09) distances = distances.reshape([num_priors, num_targets]) return distances def focal_cost(cls_pred, gt_labels, alpha=0.25, gamma=2, eps=1e-12): """ Args: cls_pred (Tensor): Predicted classification logits, shape [num_query, num_class]. gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). Returns: torch.Tensor: cls_cost value """ cls_pred = F.sigmoid(cls_pred) neg_cost = -(1 - cls_pred + eps).log() * (1 - alpha) * cls_pred.pow(gamma) pos_cost = -(cls_pred + eps).log() * alpha * (1 - cls_pred).pow(gamma) cls_cost = pos_cost.index_select( gt_labels, axis=1) - neg_cost.index_select( gt_labels, axis=1) return cls_cost def dynamic_k_assign(cost, pair_wise_ious): """ Assign grouth truths with priors dynamically. Args: cost: the assign cost. pair_wise_ious: iou of grouth truth and priors. Returns: prior_idx: the index of assigned prior. gt_idx: the corresponding ground truth index. 
""" matching_matrix = paddle.zeros_like(cost) ious_matrix = pair_wise_ious ious_matrix[ious_matrix < 0] = 0.0 n_candidate_k = 4 topk_ious, _ = paddle.topk(ious_matrix, n_candidate_k, axis=0) dynamic_ks = paddle.clip(x=topk_ious.sum(0).cast("int32"), min=1) num_gt = cost.shape[1] for gt_idx in range(num_gt): _, pos_idx = paddle.topk( x=cost[:, gt_idx], k=dynamic_ks[gt_idx].item(), largest=False) matching_matrix[pos_idx, gt_idx] = 1.0 del topk_ious, dynamic_ks, pos_idx matched_gt = matching_matrix.sum(axis=1) if (matched_gt > 1).sum() > 0: matched_gt_indices = paddle.nonzero(matched_gt > 1)[:, 0] cost_argmin = paddle.argmin( cost.index_select(matched_gt_indices), axis=1) matching_matrix[matched_gt_indices][0] *= 0.0 matching_matrix[matched_gt_indices, cost_argmin] = 1.0 prior_idx = matching_matrix.sum(axis=1).nonzero() gt_idx = matching_matrix[prior_idx].argmax(axis=-1) return prior_idx.flatten(), gt_idx.flatten() def cdist_paddle(x1, x2, p=2): assert x1.shape[1] == x2.shape[1] B, M = x1.shape # if p == np.inf: # dist = np.max(np.abs(x1[:, np.newaxis, :] - x2[np.newaxis, :, :]), axis=-1) if p == 1: dist = paddle.sum( paddle.abs(x1.unsqueeze(axis=1) - x2.unsqueeze(axis=0)), axis=-1) else: dist = paddle.pow(paddle.sum(paddle.pow( paddle.abs(x1.unsqueeze(axis=1) - x2.unsqueeze(axis=0)), p), axis=-1), 1 / p) return dist def assign(predictions, targets, img_w, img_h, distance_cost_weight=3.0, cls_cost_weight=1.0): """ computes dynamicly matching based on the cost, including cls cost and lane similarity cost Args: predictions (Tensor): predictions predicted by each stage, shape: (num_priors, 78) targets (Tensor): lane targets, shape: (num_targets, 78) return: matched_row_inds (Tensor): matched predictions, shape: (num_targets) matched_col_inds (Tensor): matched targets, shape: (num_targets) """ predictions = predictions.detach().clone() predictions[:, 3] *= img_w - 1 predictions[:, 6:] *= img_w - 1 targets = targets.detach().clone() distances_score = distance_cost(predictions, targets, img_w) distances_score = 1 - distances_score / paddle.max(x=distances_score) + 0.01 cls_score = focal_cost(predictions[:, :2], targets[:, 1].cast('int64')) num_priors = predictions.shape[0] num_targets = targets.shape[0] target_start_xys = targets[:, 2:4] target_start_xys[..., 0] *= (img_h - 1) prediction_start_xys = predictions[:, 2:4] prediction_start_xys[..., 0] *= (img_h - 1) start_xys_score = cdist_paddle( prediction_start_xys, target_start_xys, p=2).reshape([num_priors, num_targets]) start_xys_score = 1 - start_xys_score / paddle.max(x=start_xys_score) + 0.01 target_thetas = targets[:, 4].unsqueeze(axis=-1) theta_score = cdist_paddle( predictions[:, 4].unsqueeze(axis=-1), target_thetas, p=1).reshape([num_priors, num_targets]) * 180 theta_score = 1 - theta_score / paddle.max(x=theta_score) + 0.01 cost = -(distances_score * start_xys_score * theta_score )**2 * distance_cost_weight + cls_score * cls_cost_weight iou = line_iou(predictions[..., 6:], targets[..., 6:], img_w, aligned=False) matched_row_inds, matched_col_inds = dynamic_k_assign(cost, iou) return matched_row_inds, matched_col_inds ================================================ FILE: ppdet/modeling/assigners/fcosr_assigner.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling.rbox_utils import box2corners, check_points_in_polys, paddle_gather __all__ = ['FCOSRAssigner'] EPS = 1e-9 @register class FCOSRAssigner(nn.Layer): """ FCOSR Assigner, refer to https://arxiv.org/abs/2111.10780 for details 1. compute normalized gaussian distribution score and refined gaussian distribution score 2. refer to ellipse center sampling, sample points whose normalized gaussian distribution score is greater than threshold 3. refer to multi-level sampling, assign ground truth to feature map which follows two conditions. i). first, the ratio between the short edge of the target and the stride of the feature map is less than 2. ii). second, the long edge of minimum bounding rectangle of the target is larger than the acceptance range of feature map 4. refer to fuzzy sample label assignment, the points satisfying 2 and 3 will be assigned to the ground truth according to gaussian distribution score """ __shared__ = ['num_classes'] def __init__(self, num_classes=80, factor=12, threshold=0.23, boundary=[[-1, 128], [128, 320], [320, 10000]], score_type='iou'): super(FCOSRAssigner, self).__init__() self.num_classes = num_classes self.factor = factor self.threshold = threshold self.boundary = [ paddle.to_tensor( l, dtype=paddle.float32).reshape([1, 1, 2]) for l in boundary ] self.score_type = score_type def get_gaussian_distribution_score(self, points, gt_rboxes, gt_polys): # projecting points to coordinate system defined by each rbox # [B, N, 4, 2] -> 4 * [B, N, 1, 2] a, b, c, d = gt_polys.split(4, axis=2) # [1, L, 2] -> [1, 1, L, 2] points = points.unsqueeze(0) ab = b - a ad = d - a # [B, N, 5] -> [B, N, 2], [B, N, 2], [B, N, 1] xy, wh, angle = gt_rboxes.split([2, 2, 1], axis=-1) # [B, N, 2] -> [B, N, 1, 2] xy = xy.unsqueeze(2) # vector of points to center [B, N, L, 2] vec = points - xy # = |ab| * |vec| * cos(theta) [B, N, L] vec_dot_ab = paddle.sum(vec * ab, axis=-1) # = |ad| * |vec| * cos(theta) [B, N, L] vec_dot_ad = paddle.sum(vec * ad, axis=-1) # norm_ab [B, N, L] norm_ab = paddle.sum(ab * ab, axis=-1).sqrt() # norm_ad [B, N, L] norm_ad = paddle.sum(ad * ad, axis=-1).sqrt() # min(h, w), [B, N, 1] min_edge = paddle.min(wh, axis=-1, keepdim=True) # delta_x, delta_y [B, N, L] delta_x = vec_dot_ab.pow(2) / (norm_ab.pow(3) * min_edge + EPS) delta_y = vec_dot_ad.pow(2) / (norm_ad.pow(3) * min_edge + EPS) # score [B, N, L] norm_score = paddle.exp(-0.5 * self.factor * (delta_x + delta_y)) # simplified calculation sigma = min_edge / self.factor refined_score = norm_score / (2 * np.pi * sigma + EPS) return norm_score, refined_score def get_rotated_inside_mask(self, points, gt_polys, scores): inside_mask = check_points_in_polys(points, gt_polys) center_mask = scores >= self.threshold return (inside_mask & center_mask).cast(paddle.float32) def get_inside_range_mask(self, points, gt_bboxes, gt_rboxes, stride_tensor, regress_range): # [1, L, 2] 
-> [1, 1, L, 2] points = points.unsqueeze(0) # [B, n, 4] -> [B, n, 1, 4] x1y1, x2y2 = gt_bboxes.unsqueeze(2).split(2, axis=-1) # [B, n, L, 2] lt = points - x1y1 rb = x2y2 - points # [B, n, L, 4] ltrb = paddle.concat([lt, rb], axis=-1) # [B, n, L, 4] -> [B, n, L] inside_mask = paddle.min(ltrb, axis=-1) > EPS # regress_range [1, L, 2] -> [1, 1, L, 2] regress_range = regress_range.unsqueeze(0) # stride_tensor [1, L, 1] -> [1, 1, L] stride_tensor = stride_tensor.transpose((0, 2, 1)) # fcos range # [B, n, L, 4] -> [B, n, L] ltrb_max = paddle.max(ltrb, axis=-1) # [1, 1, L, 2] -> [1, 1, L] low, high = regress_range[..., 0], regress_range[..., 1] # [B, n, L] regress_mask = (ltrb_max >= low) & (ltrb_max <= high) # mask for rotated # [B, n, 1] min_edge = paddle.min(gt_rboxes[..., 2:4], axis=-1, keepdim=True) # [B, n , L] rotated_mask = ((min_edge / stride_tensor) < 2.0) & (ltrb_max > high) mask = inside_mask & (regress_mask | rotated_mask) return mask.cast(paddle.float32) @paddle.no_grad() def forward(self, anchor_points, stride_tensor, num_anchors_list, gt_labels, gt_bboxes, gt_rboxes, pad_gt_mask, bg_index, pred_rboxes=None): r""" Args: anchor_points (Tensor, float32): pre-defined anchor points, shape(1, L, 2), "x, y" format stride_tensor (Tensor, float32): stride tensor, shape (1, L, 1) num_anchors_list (List): num of anchors in each level gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) gt_rboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 5) pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) bg_index (int): background index pred_rboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 5) Returns: assigned_labels (Tensor): (B, L) assigned_rboxes (Tensor): (B, L, 5) assigned_scores (Tensor): (B, L, C), if pred_rboxes is not None, then output ious """ _, num_anchors, _ = anchor_points.shape batch_size, num_max_boxes, _ = gt_rboxes.shape if num_max_boxes == 0: assigned_labels = paddle.full( [batch_size, num_anchors], bg_index, dtype=gt_labels.dtype) assigned_rboxes = paddle.zeros([batch_size, num_anchors, 5]) assigned_scores = paddle.zeros( [batch_size, num_anchors, self.num_classes]) return assigned_labels, assigned_rboxes, assigned_scores # get normalized gaussian distribution score and refined distribution score gt_polys = box2corners(gt_rboxes) score, refined_score = self.get_gaussian_distribution_score( anchor_points, gt_rboxes, gt_polys) inside_mask = self.get_rotated_inside_mask(anchor_points, gt_polys, score) regress_ranges = [] for num, bound in zip(num_anchors_list, self.boundary): regress_ranges.append(bound.tile((1, num, 1))) regress_ranges = paddle.concat(regress_ranges, axis=1) regress_mask = self.get_inside_range_mask( anchor_points, gt_bboxes, gt_rboxes, stride_tensor, regress_ranges) # [B, n, L] mask_positive = inside_mask * regress_mask * pad_gt_mask refined_score = refined_score * mask_positive - (1. 
- mask_positive) argmax_refined_score = refined_score.argmax(axis=-2) max_refined_score = refined_score.max(axis=-2) assigned_gt_index = argmax_refined_score # assigned target batch_ind = paddle.arange( end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype) assigned_labels = paddle.gather( gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) assigned_labels = paddle.where( max_refined_score > 0, assigned_labels, paddle.full_like(assigned_labels, bg_index)) assigned_rboxes = paddle.gather( gt_rboxes.reshape([-1, 5]), assigned_gt_index.flatten(), axis=0) assigned_rboxes = assigned_rboxes.reshape([batch_size, num_anchors, 5]) assigned_scores = F.one_hot(assigned_labels, self.num_classes + 1) ind = list(range(self.num_classes + 1)) ind.remove(bg_index) assigned_scores = paddle.index_select( assigned_scores, paddle.to_tensor(ind), axis=-1) if self.score_type == 'gaussian': selected_scores = paddle_gather( score, 1, argmax_refined_score.unsqueeze(-2)).squeeze(-2) assigned_scores = assigned_scores * selected_scores.unsqueeze(-1) elif self.score_type == 'iou': assert pred_rboxes is not None, 'If score type is iou, pred_rboxes should not be None' from ext_op import matched_rbox_iou b, l = pred_rboxes.shape[:2] iou_score = matched_rbox_iou( pred_rboxes.reshape((-1, 5)), assigned_rboxes.reshape( (-1, 5))).reshape((b, l, 1)) assigned_scores = assigned_scores * iou_score return assigned_labels, assigned_rboxes, assigned_scores ================================================ FILE: ppdet/modeling/assigners/hungarian_assigner.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function try: from scipy.optimize import linear_sum_assignment except ImportError: linear_sum_assignment = None import paddle from ppdet.core.workspace import register __all__ = ['PoseHungarianAssigner', 'PseudoSampler'] class AssignResult: """Stores assignments between predicted and truth boxes. Attributes: num_gts (int): the number of truth boxes considered when computing this assignment gt_inds (LongTensor): for each predicted box indicates the 1-based index of the assigned truth box. 0 means unassigned and -1 means ignore. max_overlaps (FloatTensor): the iou between the predicted box and its assigned truth box. labels (None | LongTensor): If specified, for each predicted box indicates the category label of the assigned truth box. 
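    Example (an illustrative sketch; every tensor below is hypothetical)::

        import paddle
        # 2 gts, 4 predictions: gt_inds uses 0 for background, -1 for
        # ignore, and positive values as 1-based gt indices.
        gt_inds = paddle.to_tensor([0, 2, -1, 1])
        labels = paddle.to_tensor([-1, 7, -1, 3])
        result = AssignResult(2, gt_inds, max_overlaps=None, labels=labels)
        assert result.num_preds == 4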
""" def __init__(self, num_gts, gt_inds, max_overlaps, labels=None): self.num_gts = num_gts self.gt_inds = gt_inds self.max_overlaps = max_overlaps self.labels = labels # Interface for possible user-defined properties self._extra_properties = {} @property def num_preds(self): """int: the number of predictions in this assignment""" return len(self.gt_inds) def set_extra_property(self, key, value): """Set user-defined new property.""" assert key not in self.info self._extra_properties[key] = value def get_extra_property(self, key): """Get user-defined property.""" return self._extra_properties.get(key, None) @property def info(self): """dict: a dictionary of info about the object""" basic_info = { 'num_gts': self.num_gts, 'num_preds': self.num_preds, 'gt_inds': self.gt_inds, 'max_overlaps': self.max_overlaps, 'labels': self.labels, } basic_info.update(self._extra_properties) return basic_info @register class PoseHungarianAssigner: """Computes one-to-one matching between predictions and ground truth. This class computes an assignment between the targets and the predictions based on the costs. The costs are weighted sum of three components: classification cost, regression L1 cost and regression oks cost. The targets don't include the no_object, so generally there are more predictions than targets. After the one-to-one matching, the un-matched are treated as backgrounds. Thus each query prediction will be assigned with `0` or a positive integer indicating the ground truth index: - 0: negative sample, no assigned gt. - positive integer: positive sample, index (1-based) of assigned gt. Args: cls_weight (int | float, optional): The scale factor for classification cost. Default 1.0. kpt_weight (int | float, optional): The scale factor for regression L1 cost. Default 1.0. oks_weight (int | float, optional): The scale factor for regression oks cost. Default 1.0. """ __inject__ = ['cls_cost', 'kpt_cost', 'oks_cost'] def __init__(self, cls_cost='ClassificationCost', kpt_cost='KptL1Cost', oks_cost='OksCost'): self.cls_cost = cls_cost self.kpt_cost = kpt_cost self.oks_cost = oks_cost def assign(self, cls_pred, kpt_pred, gt_labels, gt_keypoints, gt_areas, img_meta, eps=1e-7): """Computes one-to-one matching based on the weighted costs. This method assign each query prediction to a ground truth or background. The `assigned_gt_inds` with -1 means don't care, 0 means negative sample, and positive number is the index (1-based) of assigned gt. The assignment is done in the following steps, the order matters. 1. assign every prediction to -1 2. compute the weighted costs 3. do Hungarian matching on CPU based on the costs 4. assign all to 0 (background) first, then for each matched pair between predictions and gts, treat this prediction as foreground and assign the corresponding gt index (plus 1) to it. Args: cls_pred (Tensor): Predicted classification logits, shape [num_query, num_class]. kpt_pred (Tensor): Predicted keypoints with normalized coordinates (x_{i}, y_{i}), which are all in range [0, 1]. Shape [num_query, K*2]. gt_labels (Tensor): Label of `gt_keypoints`, shape (num_gt,). gt_keypoints (Tensor): Ground truth keypoints with unnormalized coordinates [p^{1}_x, p^{1}_y, p^{1}_v, ..., \ p^{K}_x, p^{K}_y, p^{K}_v]. Shape [num_gt, K*3]. gt_areas (Tensor): Ground truth mask areas, shape (num_gt,). img_meta (dict): Meta information for current image. eps (int | float, optional): A value added to the denominator for numerical stability. Default 1e-7. Returns: :obj:`AssignResult`: The assigned result. 
""" num_gts, num_kpts = gt_keypoints.shape[0], kpt_pred.shape[0] if not gt_keypoints.astype('bool').any(): num_gts = 0 # 1. assign -1 by default assigned_gt_inds = paddle.full((num_kpts, ), -1, dtype="int64") assigned_labels = paddle.full((num_kpts, ), -1, dtype="int64") if num_gts == 0 or num_kpts == 0: # No ground truth or keypoints, return empty assignment if num_gts == 0: # No ground truth, assign all to background assigned_gt_inds[:] = 0 return AssignResult( num_gts, assigned_gt_inds, None, labels=assigned_labels) img_h, img_w, _ = img_meta['img_shape'] factor = paddle.to_tensor( [img_w, img_h, img_w, img_h], dtype=gt_keypoints.dtype).reshape( (1, -1)) # 2. compute the weighted costs # classification cost cls_cost = self.cls_cost(cls_pred, gt_labels) # keypoint regression L1 cost gt_keypoints_reshape = gt_keypoints.reshape((gt_keypoints.shape[0], -1, 3)) valid_kpt_flag = gt_keypoints_reshape[..., -1] kpt_pred_tmp = kpt_pred.clone().detach().reshape((kpt_pred.shape[0], -1, 2)) normalize_gt_keypoints = gt_keypoints_reshape[ ..., :2] / factor[:, :2].unsqueeze(0) kpt_cost = self.kpt_cost(kpt_pred_tmp, normalize_gt_keypoints, valid_kpt_flag) # keypoint OKS cost kpt_pred_tmp = kpt_pred.clone().detach().reshape((kpt_pred.shape[0], -1, 2)) kpt_pred_tmp = kpt_pred_tmp * factor[:, :2].unsqueeze(0) oks_cost = self.oks_cost(kpt_pred_tmp, gt_keypoints_reshape[..., :2], valid_kpt_flag, gt_areas) # weighted sum of above three costs cost = cls_cost + kpt_cost + oks_cost # 3. do Hungarian matching on CPU using linear_sum_assignment cost = cost.detach().cpu() if linear_sum_assignment is None: raise ImportError('Please run "pip install scipy" ' 'to install scipy first.') matched_row_inds, matched_col_inds = linear_sum_assignment(cost) matched_row_inds = paddle.to_tensor(matched_row_inds) matched_col_inds = paddle.to_tensor(matched_col_inds) # 4. assign backgrounds and foregrounds # assign all indices to backgrounds first assigned_gt_inds[:] = 0 # assign foregrounds based on matching results assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 assigned_labels[matched_row_inds] = gt_labels[matched_col_inds][ ..., 0].astype("int64") return AssignResult( num_gts, assigned_gt_inds, None, labels=assigned_labels) class SamplingResult: """Bbox sampling result. 
""" def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, gt_flags): self.pos_inds = pos_inds self.neg_inds = neg_inds if pos_inds.size > 0: self.pos_bboxes = bboxes[pos_inds] self.neg_bboxes = bboxes[neg_inds] self.pos_is_gt = gt_flags[pos_inds] self.num_gts = gt_bboxes.shape[0] self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 if gt_bboxes.numel() == 0: # hack for index error case assert self.pos_assigned_gt_inds.numel() == 0 self.pos_gt_bboxes = paddle.zeros( gt_bboxes.shape, dtype=gt_bboxes.dtype).reshape((-1, 4)) else: if len(gt_bboxes.shape) < 2: gt_bboxes = gt_bboxes.reshape((-1, 4)) self.pos_gt_bboxes = paddle.index_select( gt_bboxes, self.pos_assigned_gt_inds.astype('int64'), axis=0) if assign_result.labels is not None: self.pos_gt_labels = assign_result.labels[pos_inds] else: self.pos_gt_labels = None @property def bboxes(self): """paddle.Tensor: concatenated positive and negative boxes""" return paddle.concat([self.pos_bboxes, self.neg_bboxes]) def __nice__(self): data = self.info.copy() data['pos_bboxes'] = data.pop('pos_bboxes').shape data['neg_bboxes'] = data.pop('neg_bboxes').shape parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] body = ' ' + ',\n '.join(parts) return '{\n' + body + '\n}' @property def info(self): """Returns a dictionary of info about the object.""" return { 'pos_inds': self.pos_inds, 'neg_inds': self.neg_inds, 'pos_bboxes': self.pos_bboxes, 'neg_bboxes': self.neg_bboxes, 'pos_is_gt': self.pos_is_gt, 'num_gts': self.num_gts, 'pos_assigned_gt_inds': self.pos_assigned_gt_inds, } @register class PseudoSampler: """A pseudo sampler that does not do sampling actually.""" def __init__(self, **kwargs): pass def _sample_pos(self, **kwargs): """Sample positive samples.""" raise NotImplementedError def _sample_neg(self, **kwargs): """Sample negative samples.""" raise NotImplementedError def sample(self, assign_result, bboxes, gt_bboxes, *args, **kwargs): """Directly returns the positive and negative indices of samples. Args: assign_result (:obj:`AssignResult`): Assigned results bboxes (paddle.Tensor): Bounding boxes gt_bboxes (paddle.Tensor): Ground truth boxes Returns: :obj:`SamplingResult`: sampler results """ pos_inds = paddle.nonzero( assign_result.gt_inds > 0, as_tuple=False).squeeze(-1) neg_inds = paddle.nonzero( assign_result.gt_inds == 0, as_tuple=False).squeeze(-1) gt_flags = paddle.zeros([bboxes.shape[0]], dtype='int32') sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, gt_flags) return sampling_result ================================================ FILE: ppdet/modeling/assigners/max_iou_assigner.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from ppdet.core.workspace import register
from ppdet.modeling.proposal_generator.target import label_box

__all__ = ['MaxIoUAssigner']


@register
class MaxIoUAssigner(object):
    """A standard bbox assigner based on max IoU, using ppdet's label_box
    as backend.

    Args:
        positive_overlap (float): threshold for defining positive samples
        negative_overlap (float): threshold for defining negative samples
        allow_low_quality (bool): whether to lower the IoU threshold if a GT
            poorly overlaps with candidate bboxes
    """

    def __init__(self, positive_overlap, negative_overlap,
                 allow_low_quality=True):
        self.positive_overlap = positive_overlap
        self.negative_overlap = negative_overlap
        self.allow_low_quality = allow_low_quality

    def __call__(self, bboxes, gt_bboxes):
        matches, match_labels = label_box(
            bboxes,
            gt_bboxes,
            positive_overlap=self.positive_overlap,
            negative_overlap=self.negative_overlap,
            allow_low_quality=self.allow_low_quality,
            ignore_thresh=-1,
            is_crowd=None,
            assign_on_cpu=False)
        return matches, match_labels


================================================
FILE: ppdet/modeling/assigners/pose_utils.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import paddle
import paddle.nn.functional as F

from ppdet.core.workspace import register

__all__ = ['KptL1Cost', 'OksCost', 'ClassificationCost']


def masked_fill(x, mask, value):
    y = paddle.full(x.shape, value, x.dtype)
    return paddle.where(mask, y, x)


@register
class KptL1Cost(object):
    """KptL1Cost.

    This function is based on:
    https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py

    Args:
        weight (int | float, optional): loss_weight.
    """

    def __init__(self, weight=1.0):
        self.weight = weight

    def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag):
        """
        Args:
            kpt_pred (Tensor): Predicted keypoints with normalized
                coordinates (x_{i}, y_{i}), which are all in range [0, 1].
                Shape [num_query, K, 2].
            gt_keypoints (Tensor): Ground truth keypoints with normalized
                coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2].
            valid_kpt_flag (Tensor): valid flag of ground truth keypoints.
                Shape [num_gt, K].

        Returns:
            paddle.Tensor: kpt_cost value with weight.
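
        Example (shape sketch only; all values hypothetical)::

            import paddle
            kpt_pred = paddle.rand([3, 17, 2])      # 3 queries, K = 17
            gt_keypoints = paddle.rand([2, 17, 2])  # 2 gts
            valid = paddle.ones([2, 17])
            cost = KptL1Cost(weight=1.0)(kpt_pred, gt_keypoints, valid)
            # cost: [3, 2] -- per (query, gt) L1 distance, averaged over
            # 2 * num_valid_keypoints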
""" kpt_cost = [] for i in range(len(gt_keypoints)): if gt_keypoints[i].size == 0: kpt_cost.append(kpt_pred.sum() * 0) kpt_pred_tmp = kpt_pred.clone() valid_flag = valid_kpt_flag[i] > 0 valid_flag_expand = valid_flag.unsqueeze(0).unsqueeze(-1).expand_as( kpt_pred_tmp) if not valid_flag_expand.all(): kpt_pred_tmp = masked_fill(kpt_pred_tmp, ~valid_flag_expand, 0) cost = F.pairwise_distance( kpt_pred_tmp.reshape((kpt_pred_tmp.shape[0], -1)), gt_keypoints[i].reshape((-1, )).unsqueeze(0), p=1, keepdim=True) avg_factor = paddle.clip( valid_flag.astype('float32').sum() * 2, 1.0) cost = cost / avg_factor kpt_cost.append(cost) kpt_cost = paddle.concat(kpt_cost, axis=1) return kpt_cost * self.weight @register class OksCost(object): """OksCost. this function based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py Args: num_keypoints (int): number of keypoints weight (int | float, optional): loss_weight. """ def __init__(self, num_keypoints=17, weight=1.0): self.weight = weight if num_keypoints == 17: self.sigmas = np.array( [ .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89 ], dtype=np.float32) / 10.0 elif num_keypoints == 14: self.sigmas = np.array( [ .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, .79, .79 ], dtype=np.float32) / 10.0 else: raise ValueError(f'Unsupported keypoints number {num_keypoints}') def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag, gt_areas): """ Args: kpt_pred (Tensor): Predicted keypoints with unnormalized coordinates (x_{i}, y_{i}). Shape [num_query, K, 2]. gt_keypoints (Tensor): Ground truth keypoints with unnormalized coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2]. valid_kpt_flag (Tensor): valid flag of ground truth keypoints. Shape [num_gt, K]. gt_areas (Tensor): Ground truth mask areas. Shape [num_gt,]. Returns: paddle.Tensor: oks_cost value with weight. """ sigmas = paddle.to_tensor(self.sigmas) variances = (sigmas * 2)**2 oks_cost = [] assert len(gt_keypoints) == len(gt_areas) for i in range(len(gt_keypoints)): if gt_keypoints[i].size == 0: oks_cost.append(kpt_pred.sum() * 0) squared_distance = \ (kpt_pred[:, :, 0] - gt_keypoints[i, :, 0].unsqueeze(0)) ** 2 + \ (kpt_pred[:, :, 1] - gt_keypoints[i, :, 1].unsqueeze(0)) ** 2 vis_flag = (valid_kpt_flag[i] > 0).astype('int') vis_ind = vis_flag.nonzero(as_tuple=False)[:, 0] num_vis_kpt = vis_ind.shape[0] # assert num_vis_kpt > 0 if num_vis_kpt == 0: oks_cost.append(paddle.zeros((squared_distance.shape[0], 1))) continue area = gt_areas[i] squared_distance0 = squared_distance / (area * variances * 2) squared_distance0 = paddle.index_select( squared_distance0, vis_ind, axis=1) squared_distance1 = paddle.exp(-squared_distance0).sum(axis=1, keepdim=True) oks = squared_distance1 / num_vis_kpt # The 1 is a constant that doesn't change the matching, so omitted. oks_cost.append(-oks) oks_cost = paddle.concat(oks_cost, axis=1) return oks_cost * self.weight @register class ClassificationCost: """ClsSoftmaxCost. Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.): self.weight = weight def __call__(self, cls_pred, gt_labels): """ Args: cls_pred (Tensor): Predicted classification logits, shape (num_query, num_class). gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). Returns: paddle.Tensor: cls_cost value with weight """ # Following the official DETR repo, contrary to the loss that # NLL is used, we approximate it in 1 - cls_score[gt_label]. 
        # The 1 is a constant that doesn't change the matching,
        # so it can be omitted.
        cls_score = cls_pred.softmax(-1)
        cls_cost = -cls_score[:, gt_labels]
        return cls_cost * self.weight


@register
class FocalLossCost:
    """FocalLossCost.

    Args:
        weight (int | float, optional): loss_weight
        alpha (int | float, optional): focal_loss alpha
        gamma (int | float, optional): focal_loss gamma
        eps (float, optional): default 1e-12
        binary_input (bool, optional): Whether the input is binary,
            default False.
    """

    def __init__(self,
                 weight=1.,
                 alpha=0.25,
                 gamma=2,
                 eps=1e-12,
                 binary_input=False):
        self.weight = weight
        self.alpha = alpha
        self.gamma = gamma
        self.eps = eps
        self.binary_input = binary_input

    def _focal_loss_cost(self, cls_pred, gt_labels):
        """
        Args:
            cls_pred (Tensor): Predicted classification logits, shape
                (num_query, num_class).
            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).

        Returns:
            paddle.Tensor: cls_cost value with weight
        """
        if gt_labels.size == 0:
            return cls_pred.sum() * 0
        cls_pred = F.sigmoid(cls_pred)
        neg_cost = -(1 - cls_pred + self.eps).log() * (
            1 - self.alpha) * cls_pred.pow(self.gamma)
        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
            1 - cls_pred).pow(self.gamma)

        cls_cost = paddle.index_select(
            pos_cost, gt_labels, axis=1) - paddle.index_select(
                neg_cost, gt_labels, axis=1)
        return cls_cost * self.weight

    def _mask_focal_loss_cost(self, cls_pred, gt_labels):
        """
        Args:
            cls_pred (Tensor): Predicted classification logits in shape
                (num_query, d1, ..., dn), dtype=paddle.float32.
            gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn),
                dtype=paddle.long. Labels should be binary.

        Returns:
            Tensor: Focal cost matrix with weight in shape
                (num_query, num_gt).
        """
        cls_pred = cls_pred.flatten(1)
        gt_labels = gt_labels.flatten(1).float()
        n = cls_pred.shape[1]
        cls_pred = F.sigmoid(cls_pred)
        neg_cost = -(1 - cls_pred + self.eps).log() * (
            1 - self.alpha) * cls_pred.pow(self.gamma)
        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
            1 - cls_pred).pow(self.gamma)

        cls_cost = paddle.einsum('nc,mc->nm', pos_cost, gt_labels) + \
            paddle.einsum('nc,mc->nm', neg_cost, (1 - gt_labels))
        return cls_cost / n * self.weight

    def __call__(self, cls_pred, gt_labels):
        """
        Args:
            cls_pred (Tensor): Predicted classification logits.
            gt_labels (Tensor): Labels.

        Returns:
            Tensor: Focal cost matrix with weight in shape
                (num_query, num_gt).
        """
        if self.binary_input:
            return self._mask_focal_loss_cost(cls_pred, gt_labels)
        else:
            return self._focal_loss_cost(cls_pred, gt_labels)


================================================
FILE: ppdet/modeling/assigners/rotated_task_aligned_assigner.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
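
# Illustrative sketch (hypothetical helper; not used by the assigner below):
# the core of task-aligned assignment is the alignment metric
#     alignment = score**alpha * iou**beta,
# from which the top-k anchors per gt become positive candidates.
def _alignment_topk_sketch(cls_scores, ious, topk=13, alpha=1.0, beta=6.0):
    import numpy as np
    # cls_scores, ious: [num_gt, num_anchors], both in [0, 1]
    metric = cls_scores**alpha * ious**beta
    topk_idx = np.argsort(-metric, axis=1)[:, :topk]  # top-k anchors per gt
    return metric, topk_idx
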
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..rbox_utils import rotated_iou_similarity, check_points_in_rotated_boxes from .utils import gather_topk_anchors, compute_max_iou_anchor __all__ = ['RotatedTaskAlignedAssigner'] @register class RotatedTaskAlignedAssigner(nn.Layer): """TOOD: Task-aligned One-stage Object Detection """ def __init__(self, topk=13, alpha=1.0, beta=6.0, eps=1e-9): super(RotatedTaskAlignedAssigner, self).__init__() self.topk = topk self.alpha = alpha self.beta = beta self.eps = eps @paddle.no_grad() def forward(self, pred_scores, pred_bboxes, anchor_points, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index, gt_scores=None): r"""This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py The assignment is done in following steps 1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt 2. select top-k bbox as candidates for each gt 3. limit the positive sample's center in gt (because the anchor-free detector only can predict positive distance) 4. if an anchor box is assigned to multiple gts, the one with the highest iou will be selected. Args: pred_scores (Tensor, float32): predicted class probability, shape(B, L, C) pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 5) anchor_points (Tensor, float32): pre-defined anchors, shape(1, L, 2), "cxcy" format num_anchors_list (List): num of anchors in each level, shape(L) gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 5) pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) bg_index (int): background index gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1) Returns: assigned_labels (Tensor): (B, L) assigned_bboxes (Tensor): (B, L, 5) assigned_scores (Tensor): (B, L, C) """ assert pred_scores.ndim == pred_bboxes.ndim assert gt_labels.ndim == gt_bboxes.ndim and \ gt_bboxes.ndim == 3 batch_size, num_anchors, num_classes = pred_scores.shape _, num_max_boxes, _ = gt_bboxes.shape # negative batch if num_max_boxes == 0: assigned_labels = paddle.full( [batch_size, num_anchors], bg_index, dtype=gt_labels.dtype) assigned_bboxes = paddle.zeros([batch_size, num_anchors, 5]) assigned_scores = paddle.zeros( [batch_size, num_anchors, num_classes]) return assigned_labels, assigned_bboxes, assigned_scores # compute iou between gt and pred bbox, [B, n, L] ious = rotated_iou_similarity(gt_bboxes, pred_bboxes) ious = paddle.where(ious > 1 + self.eps, paddle.zeros_like(ious), ious) ious.stop_gradient = True # gather pred bboxes class score pred_scores = pred_scores.transpose([0, 2, 1]) batch_ind = paddle.arange( end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) gt_labels_ind = paddle.stack( [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], axis=-1) bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind) # compute alignment metrics, [B, n, L] alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow( self.beta) # check the positive sample's center in gt, [B, n, L] is_in_gts = check_points_in_rotated_boxes(anchor_points, gt_bboxes) # select topk largest alignment metrics pred bbox as candidates # for each gt, [B, n, L] is_in_topk = gather_topk_anchors( alignment_metrics * is_in_gts.astype(alignment_metrics.dtype), 
self.topk, topk_mask=pad_gt_mask) # select positive sample, [B, n, L] mask_positive = is_in_topk * is_in_gts.astype(is_in_topk.dtype) * pad_gt_mask # if an anchor box is assigned to multiple gts, # the one with the highest iou will be selected, [B, n, L] mask_positive_sum = mask_positive.sum(axis=-2) if mask_positive_sum.max() > 1: mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile( [1, num_max_boxes, 1]) is_max_iou = compute_max_iou_anchor(ious) mask_positive = paddle.where(mask_multiple_gts, is_max_iou, mask_positive) mask_positive_sum = mask_positive.sum(axis=-2) assigned_gt_index = mask_positive.argmax(axis=-2) # assigned target assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype) assigned_labels = paddle.gather( gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) assigned_labels = paddle.where( mask_positive_sum > 0, assigned_labels, paddle.full_like(assigned_labels, bg_index)) assigned_bboxes = paddle.gather( gt_bboxes.reshape([-1, 5]), assigned_gt_index.flatten(), axis=0) assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 5]) assigned_scores = F.one_hot(assigned_labels, num_classes + 1) ind = list(range(num_classes + 1)) ind.remove(bg_index) assigned_scores = paddle.index_select( assigned_scores, paddle.to_tensor(ind), axis=-1) # rescale alignment metrics alignment_metrics *= mask_positive max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True) max_ious_per_instance = (ious * mask_positive).max(axis=-1, keepdim=True) alignment_metrics = alignment_metrics / ( max_metrics_per_instance + self.eps) * max_ious_per_instance alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1) assigned_scores = assigned_scores * alignment_metrics assigned_bboxes.stop_gradient = True assigned_scores.stop_gradient = True assigned_labels.stop_gradient = True return assigned_labels, assigned_bboxes, assigned_scores ================================================ FILE: ppdet/modeling/assigners/simota_assigner.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The code is based on: # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/sim_ota_assigner.py import paddle import numpy as np import paddle.nn.functional as F from ppdet.modeling.losses.varifocal_loss import varifocal_loss from ppdet.modeling.bbox_utils import batch_bbox_overlaps from ppdet.core.workspace import register @register class SimOTAAssigner(object): """Computes matching between predictions and ground truth. Args: center_radius (int | float, optional): Ground truth center size to judge whether a prior is in center. Default 2.5. candidate_topk (int, optional): The candidate top-k which used to get top-k ious to calculate dynamic-k. Default 10. iou_weight (int | float, optional): The scale factor for regression iou cost. Default 3.0. 
cls_weight (int | float, optional): The scale factor for classification cost. Default 1.0. num_classes (int): The num_classes of dataset. use_vfl (int): Whether to use varifocal_loss when calculating the cost matrix. """ __shared__ = ['num_classes'] def __init__(self, center_radius=2.5, candidate_topk=10, iou_weight=3.0, cls_weight=1.0, num_classes=80, use_vfl=True): self.center_radius = center_radius self.candidate_topk = candidate_topk self.iou_weight = iou_weight self.cls_weight = cls_weight self.num_classes = num_classes self.use_vfl = use_vfl def get_in_gt_and_in_center_info(self, flatten_center_and_stride, gt_bboxes): num_gt = gt_bboxes.shape[0] flatten_x = flatten_center_and_stride[:, 0].unsqueeze(1).tile( [1, num_gt]) flatten_y = flatten_center_and_stride[:, 1].unsqueeze(1).tile( [1, num_gt]) flatten_stride_x = flatten_center_and_stride[:, 2].unsqueeze(1).tile( [1, num_gt]) flatten_stride_y = flatten_center_and_stride[:, 3].unsqueeze(1).tile( [1, num_gt]) # is prior centers in gt bboxes, shape: [n_center, n_gt] l_ = flatten_x - gt_bboxes[:, 0] t_ = flatten_y - gt_bboxes[:, 1] r_ = gt_bboxes[:, 2] - flatten_x b_ = gt_bboxes[:, 3] - flatten_y deltas = paddle.stack([l_, t_, r_, b_], axis=1) is_in_gts = deltas.min(axis=1) > 0 is_in_gts_all = is_in_gts.sum(axis=1) > 0 # is prior centers in gt centers gt_center_xs = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 gt_center_ys = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 ct_bound_l = gt_center_xs - self.center_radius * flatten_stride_x ct_bound_t = gt_center_ys - self.center_radius * flatten_stride_y ct_bound_r = gt_center_xs + self.center_radius * flatten_stride_x ct_bound_b = gt_center_ys + self.center_radius * flatten_stride_y cl_ = flatten_x - ct_bound_l ct_ = flatten_y - ct_bound_t cr_ = ct_bound_r - flatten_x cb_ = ct_bound_b - flatten_y ct_deltas = paddle.stack([cl_, ct_, cr_, cb_], axis=1) is_in_cts = ct_deltas.min(axis=1) > 0 is_in_cts_all = is_in_cts.sum(axis=1) > 0 # in any of gts or gt centers, shape: [n_center] is_in_gts_or_centers_all = paddle.logical_or(is_in_gts_all, is_in_cts_all) is_in_gts_or_centers_all_inds = paddle.nonzero( is_in_gts_or_centers_all).squeeze(1) # both in gts and gt centers, shape: [num_fg, num_gt] is_in_gts_and_centers = paddle.logical_and( paddle.gather( is_in_gts.cast('int'), is_in_gts_or_centers_all_inds, axis=0).cast('bool'), paddle.gather( is_in_cts.cast('int'), is_in_gts_or_centers_all_inds, axis=0).cast('bool')) return is_in_gts_or_centers_all, is_in_gts_or_centers_all_inds, is_in_gts_and_centers def dynamic_k_matching(self, cost_matrix, pairwise_ious, num_gt): match_matrix = np.zeros_like(cost_matrix.numpy()) # select candidate topk ious for dynamic-k calculation topk_ious, _ = paddle.topk( pairwise_ious, min(self.candidate_topk, pairwise_ious.shape[0]), axis=0) # calculate dynamic k for each gt dynamic_ks = paddle.clip(topk_ious.sum(0).cast('int'), min=1) for gt_idx in range(num_gt): _, pos_idx = paddle.topk( cost_matrix[:, gt_idx], k=dynamic_ks[gt_idx], largest=False) match_matrix[:, gt_idx][pos_idx.numpy()] = 1.0 del topk_ious, dynamic_ks, pos_idx # match points more than two gts extra_match_gts_mask = match_matrix.sum(1) > 1 if extra_match_gts_mask.sum() > 0: cost_matrix = cost_matrix.numpy() cost_argmin = np.argmin( cost_matrix[extra_match_gts_mask, :], axis=1) match_matrix[extra_match_gts_mask, :] *= 0.0 match_matrix[extra_match_gts_mask, cost_argmin] = 1.0 # get foreground mask match_fg_mask_inmatrix = match_matrix.sum(1) > 0 match_gt_inds_to_fg = match_matrix[match_fg_mask_inmatrix, 
:].argmax(1) return match_gt_inds_to_fg, match_fg_mask_inmatrix def get_sample(self, assign_gt_inds, gt_bboxes): pos_inds = np.unique(np.nonzero(assign_gt_inds > 0)[0]) neg_inds = np.unique(np.nonzero(assign_gt_inds == 0)[0]) pos_assigned_gt_inds = assign_gt_inds[pos_inds] - 1 if gt_bboxes.size == 0: # hack for index error case assert pos_assigned_gt_inds.size == 0 pos_gt_bboxes = np.empty_like(gt_bboxes).reshape(-1, 4) else: if len(gt_bboxes.shape) < 2: gt_bboxes = gt_bboxes.resize(-1, 4) pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :] return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds def __call__(self, flatten_cls_pred_scores, flatten_center_and_stride, flatten_bboxes, gt_bboxes, gt_labels, eps=1e-7): """Assign gt to priors using SimOTA. TODO: add comment. Returns: assign_result: The assigned result. """ num_gt = gt_bboxes.shape[0] num_bboxes = flatten_bboxes.shape[0] if num_gt == 0 or num_bboxes == 0: # No ground truth or boxes label = np.ones([num_bboxes], dtype=np.int64) * self.num_classes label_weight = np.ones([num_bboxes], dtype=np.float32) bbox_target = np.zeros_like(flatten_center_and_stride) return 0, label, label_weight, bbox_target is_in_gts_or_centers_all, is_in_gts_or_centers_all_inds, is_in_boxes_and_center = self.get_in_gt_and_in_center_info( flatten_center_and_stride, gt_bboxes) # bboxes and scores to calculate matrix valid_flatten_bboxes = flatten_bboxes[is_in_gts_or_centers_all_inds] valid_cls_pred_scores = flatten_cls_pred_scores[ is_in_gts_or_centers_all_inds] num_valid_bboxes = valid_flatten_bboxes.shape[0] pairwise_ious = batch_bbox_overlaps(valid_flatten_bboxes, gt_bboxes) # [num_points,num_gts] if self.use_vfl: gt_vfl_labels = gt_labels.squeeze(-1).unsqueeze(0).tile( [num_valid_bboxes, 1]).reshape([-1]) valid_pred_scores = valid_cls_pred_scores.unsqueeze(1).tile( [1, num_gt, 1]).reshape([-1, self.num_classes]) vfl_score = np.zeros(valid_pred_scores.shape) vfl_score[np.arange(0, vfl_score.shape[0]), gt_vfl_labels.numpy( )] = pairwise_ious.reshape([-1]) vfl_score = paddle.to_tensor(vfl_score) losses_vfl = varifocal_loss( valid_pred_scores, vfl_score, use_sigmoid=False).reshape([num_valid_bboxes, num_gt]) losses_giou = batch_bbox_overlaps( valid_flatten_bboxes, gt_bboxes, mode='giou') cost_matrix = ( losses_vfl * self.cls_weight + losses_giou * self.iou_weight + paddle.logical_not(is_in_boxes_and_center).cast('float32') * 100000000) else: iou_cost = -paddle.log(pairwise_ious + eps) gt_onehot_label = (F.one_hot( gt_labels.squeeze(-1).cast(paddle.int64), flatten_cls_pred_scores.shape[-1]).cast('float32').unsqueeze(0) .tile([num_valid_bboxes, 1, 1])) valid_pred_scores = valid_cls_pred_scores.unsqueeze(1).tile( [1, num_gt, 1]) cls_cost = F.binary_cross_entropy( valid_pred_scores, gt_onehot_label, reduction='none').sum(-1) cost_matrix = ( cls_cost * self.cls_weight + iou_cost * self.iou_weight + paddle.logical_not(is_in_boxes_and_center).cast('float32') * 100000000) match_gt_inds_to_fg, match_fg_mask_inmatrix = \ self.dynamic_k_matching( cost_matrix, pairwise_ious, num_gt) # sample and assign results assigned_gt_inds = np.zeros([num_bboxes], dtype=np.int64) match_fg_mask_inall = np.zeros_like(assigned_gt_inds) match_fg_mask_inall[is_in_gts_or_centers_all.numpy( )] = match_fg_mask_inmatrix assigned_gt_inds[match_fg_mask_inall.astype( np.bool_)] = match_gt_inds_to_fg + 1 pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds \ = self.get_sample(assigned_gt_inds, gt_bboxes.numpy()) bbox_target = np.zeros(flatten_bboxes.shape, 
paddle.common_ops_import.convert_dtype(flatten_bboxes.dtype)) bbox_weight = np.zeros_like(bbox_target) label = np.ones([num_bboxes], dtype=np.int64) * self.num_classes label_weight = np.zeros([num_bboxes], dtype=np.float32) if len(pos_inds) > 0: gt_labels = gt_labels.numpy() pos_bbox_targets = pos_gt_bboxes bbox_target[pos_inds, :] = pos_bbox_targets bbox_weight[pos_inds, :] = 1.0 if not np.any(gt_labels): label[pos_inds] = 0 else: label[pos_inds] = gt_labels.squeeze(-1)[pos_assigned_gt_inds] label_weight[pos_inds] = 1.0 if len(neg_inds) > 0: label_weight[neg_inds] = 1.0 pos_num = max(pos_inds.size, 1) return pos_num, label, label_weight, bbox_target ================================================ FILE: ppdet/modeling/assigners/task_aligned_assigner.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..bbox_utils import batch_iou_similarity from .utils import (gather_topk_anchors, check_points_inside_bboxes, compute_max_iou_anchor) __all__ = ['TaskAlignedAssigner'] def is_close_gt(anchor, gt, stride_lst, max_dist=2.0, alpha=2.): """Calculate distance ratio of box1 and box2 in batch for larger stride anchors dist/stride to promote the survive of large distance match Args: anchor (Tensor): box with the shape [L, 2] gt (Tensor): box with the shape [N, M2, 4] Return: dist (Tensor): dist ratio between box1 and box2 with the shape [N, M1, M2] """ center1 = anchor.unsqueeze(0) center2 = (gt[..., :2] + gt[..., -2:]) / 2. center1 = center1.unsqueeze(1) # [N, M1, 2] -> [N, 1, M1, 2] center2 = center2.unsqueeze(2) # [N, M2, 2] -> [N, M2, 1, 2] stride = paddle.concat([ paddle.full([x], 32 / pow(2, idx)) for idx, x in enumerate(stride_lst) ]).unsqueeze(0).unsqueeze(0) dist = paddle.linalg.norm(center1 - center2, p=2, axis=-1) / stride dist_ratio = dist dist_ratio[dist < max_dist] = 1. dist_ratio[dist >= max_dist] = 0. return dist_ratio @register class TaskAlignedAssigner(nn.Layer): """TOOD: Task-aligned One-stage Object Detection """ def __init__(self, topk=13, alpha=1.0, beta=6.0, eps=1e-9, is_close_gt=False): super(TaskAlignedAssigner, self).__init__() self.topk = topk self.alpha = alpha self.beta = beta self.eps = eps self.is_close_gt = is_close_gt @paddle.no_grad() def forward(self, pred_scores, pred_bboxes, anchor_points, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index, gt_segms=None, gt_scores=None): r"""This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py The assignment is done in following steps 1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt 2. select top-k bbox as candidates for each gt 3. 
limit the positive sample's center in gt (because the anchor-free detector only can predict positive distance) 4. if an anchor box is assigned to multiple gts, the one with the highest iou will be selected. Args: pred_scores (Tensor, float32): predicted class probability, shape(B, L, C) pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4) anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format num_anchors_list (List): num of anchors in each level, shape(L) gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) bg_index (int): background index gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1) Returns: assigned_labels (Tensor): (B, L) assigned_bboxes (Tensor): (B, L, 4) assigned_scores (Tensor): (B, L, C) """ assert pred_scores.ndim == pred_bboxes.ndim assert gt_labels.ndim == gt_bboxes.ndim and \ gt_bboxes.ndim == 3 batch_size, num_anchors, num_classes = pred_scores.shape _, num_max_boxes, _ = gt_bboxes.shape # negative batch if num_max_boxes == 0: assigned_labels = paddle.full( [batch_size, num_anchors], bg_index, dtype='int32') assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4]) assigned_scores = paddle.zeros( [batch_size, num_anchors, num_classes]) return assigned_labels, assigned_bboxes, assigned_scores # compute iou between gt and pred bbox, [B, n, L] ious = batch_iou_similarity(gt_bboxes, pred_bboxes) # gather pred bboxes class score pred_scores = pred_scores.transpose([0, 2, 1]) batch_ind = paddle.arange( end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) gt_labels_ind = paddle.stack( [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], axis=-1) bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind) # compute alignment metrics, [B, n, L] alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow( self.beta) # check the positive sample's center in gt, [B, n, L] if self.is_close_gt: is_in_gts = is_close_gt(anchor_points, gt_bboxes, num_anchors_list) else: is_in_gts = check_points_inside_bboxes(anchor_points, gt_bboxes) # select topk largest alignment metrics pred bbox as candidates # for each gt, [B, n, L] is_in_topk = gather_topk_anchors( alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask) # select positive sample, [B, n, L] mask_positive = is_in_topk * is_in_gts * pad_gt_mask # if an anchor box is assigned to multiple gts, # the one with the highest iou will be selected, [B, n, L] mask_positive_sum = mask_positive.sum(axis=-2) if mask_positive_sum.max() > 1: mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile( [1, num_max_boxes, 1]) is_max_iou = compute_max_iou_anchor(ious) mask_positive = paddle.where(mask_multiple_gts, is_max_iou, mask_positive) mask_positive_sum = mask_positive.sum(axis=-2) assigned_gt_index = mask_positive.argmax(axis=-2) # assigned target assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype) assigned_labels = paddle.gather( gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) assigned_labels = paddle.where( mask_positive_sum > 0, assigned_labels, paddle.full_like(assigned_labels, bg_index)) assigned_bboxes = paddle.gather( gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0) assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4]) assigned_scores = 
F.one_hot(assigned_labels, num_classes + 1) ind = list(range(num_classes + 1)) ind.remove(bg_index) assigned_scores = paddle.index_select( assigned_scores, paddle.to_tensor(ind), axis=-1) # rescale alignment metrics alignment_metrics *= mask_positive max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True) max_ious_per_instance = (ious * mask_positive).max(axis=-1, keepdim=True) alignment_metrics = alignment_metrics / ( max_metrics_per_instance + self.eps) * max_ious_per_instance alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1) assigned_scores = assigned_scores * alignment_metrics if gt_segms is not None: return assigned_labels, assigned_bboxes, assigned_scores, assigned_gt_index else: return assigned_labels, assigned_bboxes, assigned_scores ================================================ FILE: ppdet/modeling/assigners/task_aligned_assigner_cr.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..bbox_utils import batch_iou_similarity from .utils import (gather_topk_anchors, check_points_inside_bboxes, compute_max_iou_anchor) __all__ = ['TaskAlignedAssigner_CR'] @register class TaskAlignedAssigner_CR(nn.Layer): """TOOD: Task-aligned One-stage Object Detection with Center R """ def __init__(self, topk=13, alpha=1.0, beta=6.0, center_radius=None, eps=1e-9): super(TaskAlignedAssigner_CR, self).__init__() self.topk = topk self.alpha = alpha self.beta = beta self.center_radius = center_radius self.eps = eps @paddle.no_grad() def forward(self, pred_scores, pred_bboxes, anchor_points, stride_tensor, gt_labels, gt_bboxes, pad_gt_mask, bg_index, gt_scores=None): r"""This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py The assignment is done in following steps 1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt 2. select top-k bbox as candidates for each gt 3. limit the positive sample's center in gt (because the anchor-free detector only can predict positive distance) 4. if an anchor box is assigned to multiple gts, the one with the highest iou will be selected. 
Args: pred_scores (Tensor, float32): predicted class probability, shape(B, L, C) pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4) anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format stride_tensor (Tensor, float32): stride of feature map, shape(L, 1) gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) bg_index (int): background index gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1) Returns: assigned_labels (Tensor): (B, L) assigned_bboxes (Tensor): (B, L, 4) assigned_scores (Tensor): (B, L, C) """ assert pred_scores.ndim == pred_bboxes.ndim assert gt_labels.ndim == gt_bboxes.ndim and \ gt_bboxes.ndim == 3 batch_size, num_anchors, num_classes = pred_scores.shape _, num_max_boxes, _ = gt_bboxes.shape # negative batch if num_max_boxes == 0: assigned_labels = paddle.full( [batch_size, num_anchors], bg_index, dtype='int32') assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4]) assigned_scores = paddle.zeros( [batch_size, num_anchors, num_classes]) return assigned_labels, assigned_bboxes, assigned_scores # compute iou between gt and pred bbox, [B, n, L] ious = batch_iou_similarity(gt_bboxes, pred_bboxes) # gather pred bboxes class score pred_scores = pred_scores.transpose([0, 2, 1]) batch_ind = paddle.arange( end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) gt_labels_ind = paddle.stack( [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], axis=-1) bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind) # compute alignment metrics, [B, n, L] alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow( self.beta) * pad_gt_mask # select positive sample, [B, n, L] if self.center_radius is None: # check the positive sample's center in gt, [B, n, L] is_in_gts = check_points_inside_bboxes( anchor_points, gt_bboxes, sm_use=True) # select topk largest alignment metrics pred bbox as candidates # for each gt, [B, n, L] mask_positive = gather_topk_anchors( alignment_metrics, self.topk, topk_mask=pad_gt_mask) * is_in_gts else: is_in_gts, is_in_center = check_points_inside_bboxes( anchor_points, gt_bboxes, stride_tensor * self.center_radius, sm_use=True) is_in_gts *= pad_gt_mask is_in_center *= pad_gt_mask candidate_metrics = paddle.where( is_in_gts.sum(-1, keepdim=True) == 0, alignment_metrics + is_in_center, alignment_metrics) mask_positive = gather_topk_anchors( candidate_metrics, self.topk, topk_mask=pad_gt_mask) * paddle.cast((is_in_center > 0) | (is_in_gts > 0), 'float32') # if an anchor box is assigned to multiple gts, # the one with the highest iou will be selected, [B, n, L] mask_positive_sum = mask_positive.sum(axis=-2) if mask_positive_sum.max() > 1: mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile( [1, num_max_boxes, 1]) is_max_iou = compute_max_iou_anchor(ious * mask_positive) mask_positive = paddle.where(mask_multiple_gts, is_max_iou, mask_positive) mask_positive_sum = mask_positive.sum(axis=-2) assigned_gt_index = mask_positive.argmax(axis=-2) # assigned target assigned_gt_index = assigned_gt_index + (batch_ind * num_max_boxes).astype(assigned_gt_index.dtype) assigned_labels = paddle.gather( gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) assigned_labels = paddle.where( mask_positive_sum > 0, assigned_labels, paddle.full_like(assigned_labels, 
bg_index)) assigned_bboxes = paddle.gather( gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0) assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4]) assigned_scores = F.one_hot(assigned_labels, num_classes + 1) ind = list(range(num_classes + 1)) ind.remove(bg_index) assigned_scores = paddle.index_select( assigned_scores, paddle.to_tensor(ind), axis=-1) # rescale alignment metrics alignment_metrics *= mask_positive max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True) max_ious_per_instance = (ious * mask_positive).max(axis=-1, keepdim=True) alignment_metrics = alignment_metrics / ( max_metrics_per_instance + self.eps) * max_ious_per_instance alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1) assigned_scores = assigned_scores * alignment_metrics return assigned_labels, assigned_bboxes, assigned_scores ================================================ FILE: ppdet/modeling/assigners/uniform_assigner.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling.bbox_utils import batch_bbox_overlaps from ppdet.modeling.transformers import bbox_xyxy_to_cxcywh __all__ = ['UniformAssigner'] def batch_p_dist(x, y, p=2): """ calculate pairwise p_dist, the first index of x and y are batch return [x.shape[0], y.shape[0]] """ x = x.unsqueeze(1) diff = x - y return paddle.norm(diff, p=p, axis=list(range(2, diff.dim()))) @register class UniformAssigner(nn.Layer): def __init__(self, pos_ignore_thr, neg_ignore_thr, match_times=4): super(UniformAssigner, self).__init__() self.pos_ignore_thr = pos_ignore_thr self.neg_ignore_thr = neg_ignore_thr self.match_times = match_times def forward(self, bbox_pred, anchor, gt_bboxes, gt_labels=None): num_bboxes = bbox_pred.shape[0] num_gts = gt_bboxes.shape[0] match_labels = paddle.full([num_bboxes], -1, dtype=paddle.int32) pred_ious = batch_bbox_overlaps(bbox_pred, gt_bboxes) pred_max_iou = pred_ious.max(axis=1) neg_ignore = pred_max_iou > self.neg_ignore_thr # exclude potential ignored neg samples first, deal with pos samples later #match_labels: -2(ignore), -1(neg) or >=0(pos_inds) match_labels = paddle.where(neg_ignore, paddle.full_like(match_labels, -2), match_labels) bbox_pred_c = bbox_xyxy_to_cxcywh(bbox_pred) anchor_c = bbox_xyxy_to_cxcywh(anchor) gt_bboxes_c = bbox_xyxy_to_cxcywh(gt_bboxes) bbox_pred_dist = batch_p_dist(bbox_pred_c, gt_bboxes_c, p=1) anchor_dist = batch_p_dist(anchor_c, gt_bboxes_c, p=1) top_pred = bbox_pred_dist.topk( k=self.match_times, axis=0, largest=False)[1] top_anchor = anchor_dist.topk( k=self.match_times, axis=0, largest=False)[1] tar_pred = paddle.arange(num_gts).expand([self.match_times, num_gts]) tar_anchor = paddle.arange(num_gts).expand([self.match_times, num_gts]) pos_places = paddle.concat([top_pred, top_anchor]).reshape([-1]) pos_inds = 
paddle.concat([tar_pred, tar_anchor]).reshape([-1]) pos_anchor = anchor[pos_places] pos_tar_bbox = gt_bboxes[pos_inds] pos_ious = batch_bbox_overlaps( pos_anchor, pos_tar_bbox, is_aligned=True) pos_ignore = pos_ious < self.pos_ignore_thr pos_inds = paddle.where(pos_ignore, paddle.full_like(pos_inds, -2), pos_inds) match_labels[pos_places] = pos_inds match_labels.stop_gradient = True pos_keep = ~pos_ignore if pos_keep.sum() > 0: pos_places_keep = pos_places[pos_keep] pos_bbox_pred = bbox_pred[pos_places_keep].reshape([-1, 4]) pos_bbox_tar = pos_tar_bbox[pos_keep].reshape([-1, 4]).detach() else: pos_bbox_pred = None pos_bbox_tar = None return match_labels, pos_bbox_pred, pos_bbox_tar ================================================ FILE: ppdet/modeling/assigners/utils.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn.functional as F __all__ = [ 'pad_gt', 'gather_topk_anchors', 'check_points_inside_bboxes', 'compute_max_iou_anchor', 'compute_max_iou_gt', 'generate_anchors_for_grid_cell' ] def pad_gt(gt_labels, gt_bboxes, gt_scores=None): r""" Pad 0 in gt_labels and gt_bboxes. Args: gt_labels (Tensor|List[Tensor], int64): Label of gt_bboxes, shape is [B, n, 1] or [[n_1, 1], [n_2, 1], ...], here n = sum(n_i) gt_bboxes (Tensor|List[Tensor], float32): Ground truth bboxes, shape is [B, n, 4] or [[n_1, 4], [n_2, 4], ...], here n = sum(n_i) gt_scores (Tensor|List[Tensor]|None, float32): Score of gt_bboxes, shape is [B, n, 1] or [[n_1, 4], [n_2, 4], ...], here n = sum(n_i) Returns: pad_gt_labels (Tensor, int64): shape[B, n, 1] pad_gt_bboxes (Tensor, float32): shape[B, n, 4] pad_gt_scores (Tensor, float32): shape[B, n, 1] pad_gt_mask (Tensor, float32): shape[B, n, 1], 1 means bbox, 0 means no bbox """ if isinstance(gt_labels, paddle.Tensor) and isinstance(gt_bboxes, paddle.Tensor): assert gt_labels.ndim == gt_bboxes.ndim and \ gt_bboxes.ndim == 3 pad_gt_mask = ( gt_bboxes.sum(axis=-1, keepdim=True) > 0).astype(gt_bboxes.dtype) if gt_scores is None: gt_scores = pad_gt_mask.clone() assert gt_labels.ndim == gt_scores.ndim return gt_labels, gt_bboxes, gt_scores, pad_gt_mask elif isinstance(gt_labels, list) and isinstance(gt_bboxes, list): assert len(gt_labels) == len(gt_bboxes), \ 'The number of `gt_labels` and `gt_bboxes` is not equal. 
' num_max_boxes = max([len(a) for a in gt_bboxes]) batch_size = len(gt_bboxes) # pad label and bbox pad_gt_labels = paddle.zeros( [batch_size, num_max_boxes, 1], dtype=gt_labels[0].dtype) pad_gt_bboxes = paddle.zeros( [batch_size, num_max_boxes, 4], dtype=gt_bboxes[0].dtype) pad_gt_scores = paddle.zeros( [batch_size, num_max_boxes, 1], dtype=gt_bboxes[0].dtype) pad_gt_mask = paddle.zeros( [batch_size, num_max_boxes, 1], dtype=gt_bboxes[0].dtype) for i, (label, bbox) in enumerate(zip(gt_labels, gt_bboxes)): if len(label) > 0 and len(bbox) > 0: pad_gt_labels[i, :len(label)] = label pad_gt_bboxes[i, :len(bbox)] = bbox pad_gt_mask[i, :len(bbox)] = 1. if gt_scores is not None: pad_gt_scores[i, :len(gt_scores[i])] = gt_scores[i] if gt_scores is None: pad_gt_scores = pad_gt_mask.clone() return pad_gt_labels, pad_gt_bboxes, pad_gt_scores, pad_gt_mask else: raise ValueError('The input `gt_labels` or `gt_bboxes` is invalid! ') def gather_topk_anchors(metrics, topk, largest=True, topk_mask=None, eps=1e-9): r""" Args: metrics (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors topk (int): The number of top elements to look for along the axis. largest (bool) : largest is a flag, if set to true, algorithm will sort by descending order, otherwise sort by ascending order. Default: True topk_mask (Tensor, float32): shape[B, n, 1], ignore bbox mask, Default: None eps (float): Default: 1e-9 Returns: is_in_topk (Tensor, float32): shape[B, n, L], value=1. means selected """ num_anchors = metrics.shape[-1] topk_metrics, topk_idxs = paddle.topk( metrics, topk, axis=-1, largest=largest) if topk_mask is None: topk_mask = ( topk_metrics.max(axis=-1, keepdim=True) > eps).astype(metrics.dtype) is_in_topk = F.one_hot(topk_idxs, num_anchors).sum( axis=-2).astype(metrics.dtype) return is_in_topk * topk_mask def check_points_inside_bboxes(points, bboxes, center_radius_tensor=None, eps=1e-9, sm_use=False): r""" Args: points (Tensor, float32): shape[L, 2], "xy" format, L: num_anchors bboxes (Tensor, float32): shape[B, n, 4], "xmin, ymin, xmax, ymax" format center_radius_tensor (Tensor, float32): shape [L, 1]. Default: None. eps (float): Default: 1e-9 Returns: is_in_bboxes (Tensor, float32): shape[B, n, L], value=1. means selected """ points = points.unsqueeze([0, 1]) x, y = points.chunk(2, axis=-1) xmin, ymin, xmax, ymax = bboxes.unsqueeze(2).chunk(4, axis=-1) # check whether `points` is in `bboxes` l = x - xmin t = y - ymin r = xmax - x b = ymax - y delta_ltrb = paddle.concat([l, t, r, b], axis=-1) is_in_bboxes = (delta_ltrb.min(axis=-1) > eps) if center_radius_tensor is not None: # check whether `points` is in `center_radius` center_radius_tensor = center_radius_tensor.unsqueeze([0, 1]) cx = (xmin + xmax) * 0.5 cy = (ymin + ymax) * 0.5 l = x - (cx - center_radius_tensor) t = y - (cy - center_radius_tensor) r = (cx + center_radius_tensor) - x b = (cy + center_radius_tensor) - y delta_ltrb_c = paddle.concat([l, t, r, b], axis=-1) is_in_center = (delta_ltrb_c.min(axis=-1) > eps) if sm_use: return is_in_bboxes.astype(bboxes.dtype), is_in_center.astype( bboxes.dtype) else: return (paddle.logical_and(is_in_bboxes, is_in_center), paddle.logical_or(is_in_bboxes, is_in_center)) return is_in_bboxes.astype(bboxes.dtype) def compute_max_iou_anchor(ious): r""" For each anchor, find the GT with the largest IOU. Args: ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors Returns: is_max_iou (Tensor, float32): shape[B, n, L], value=1. 
means selected """ num_max_boxes = ious.shape[-2] max_iou_index = ious.argmax(axis=-2) is_max_iou = F.one_hot(max_iou_index, num_max_boxes).transpose([0, 2, 1]) return is_max_iou.astype(ious.dtype) def compute_max_iou_gt(ious): r""" For each GT, find the anchor with the largest IOU. Args: ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors Returns: is_max_iou (Tensor, float32): shape[B, n, L], value=1. means selected """ num_anchors = ious.shape[-1] max_iou_index = ious.argmax(axis=-1) is_max_iou = F.one_hot(max_iou_index, num_anchors) return is_max_iou.astype(ious.dtype) def generate_anchors_for_grid_cell(feats, fpn_strides, grid_cell_size=5.0, grid_cell_offset=0.5, dtype='float32'): r""" Like ATSS, generate anchors based on grid size. Args: feats (List[Tensor]): shape[s, (b, c, h, w)] fpn_strides (tuple|list): shape[s], stride for each scale feature grid_cell_size (float): anchor size grid_cell_offset (float): The range is between 0 and 1. Returns: anchors (Tensor): shape[l, 4], "xmin, ymin, xmax, ymax" format. anchor_points (Tensor): shape[l, 2], "x, y" format. num_anchors_list (List[int]): shape[s], contains [s_1, s_2, ...]. stride_tensor (Tensor): shape[l, 1], contains the stride for each scale. """ assert len(feats) == len(fpn_strides) anchors = [] anchor_points = [] num_anchors_list = [] stride_tensor = [] for feat, stride in zip(feats, fpn_strides): _, _, h, w = feat.shape cell_half_size = grid_cell_size * stride * 0.5 shift_x = (paddle.arange(end=w) + grid_cell_offset) * stride shift_y = (paddle.arange(end=h) + grid_cell_offset) * stride shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) anchor = paddle.stack( [ shift_x - cell_half_size, shift_y - cell_half_size, shift_x + cell_half_size, shift_y + cell_half_size ], axis=-1).astype(dtype) anchor_point = paddle.stack([shift_x, shift_y], axis=-1).astype(dtype) anchors.append(anchor.reshape([-1, 4])) anchor_points.append(anchor_point.reshape([-1, 2])) num_anchors_list.append(len(anchors[-1])) stride_tensor.append( paddle.full( [num_anchors_list[-1], 1], stride, dtype=dtype)) anchors = paddle.concat(anchors) anchors.stop_gradient = True anchor_points = paddle.concat(anchor_points) anchor_points.stop_gradient = True stride_tensor = paddle.concat(stride_tensor) stride_tensor.stop_gradient = True return anchors, anchor_points, num_anchors_list, stride_tensor ================================================ FILE: ppdet/modeling/backbones/__init__.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import vgg from . import resnet from . import darknet from . import mobilenet_v1 from . import mobilenet_v3 from . import hrnet from . import lite_hrnet from . import blazenet from . import ghostnet from . import senet from . import res2net from . import dla from . import shufflenet_v2 from . import swin_transformer from . import lcnet from . import hardnet from . import esnet from . import cspresnet from . 
import csp_darknet from . import convnext from . import vision_transformer from . import mobileone from . import trans_encoder from . import focalnet from . import vit_mae from . import hgnet_v2 from . import clrnet_resnet from .vgg import * from .resnet import * from .darknet import * from .mobilenet_v1 import * from .mobilenet_v3 import * from .hrnet import * from .lite_hrnet import * from .blazenet import * from .ghostnet import * from .senet import * from .res2net import * from .dla import * from .shufflenet_v2 import * from .swin_transformer import * from .lcnet import * from .hardnet import * from .esnet import * from .cspresnet import * from .csp_darknet import * from .convnext import * from .vision_transformer import * from .mobileone import * from .trans_encoder import * from .focalnet import * from .vitpose import * from .vit_mae import * from .hgnet_v2 import * from .clrnet_resnet import * ================================================ FILE: ppdet/modeling/backbones/blazenet.py ================================================ # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import KaimingNormal from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec __all__ = ['BlazeNet'] def hard_swish(x): return x * F.relu6(x + 3) / 6. 
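# A quick reference for hard_swish above (illustrative only, not used elsewhere
# in this file): relu6 clamps (x + 3) to [0, 6], so the function is 0 for
# x <= -3, equals x for x >= 3, and follows x * (x + 3) / 6 in between.
def _hard_swish_piecewise(x):
    # Piecewise equivalent of hard_swish(x); assumes x is a paddle.Tensor.
    return paddle.where(x <= -3., paddle.zeros_like(x),
                        paddle.where(x >= 3., x, x * (x + 3.) / 6.))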
class ConvBNLayer(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size, stride, padding, num_groups=1, act='relu', conv_lr=0.1, conv_decay=0., norm_decay=0., norm_type='bn', name=None): super(ConvBNLayer, self).__init__() self.act = act self._conv = nn.Conv2D( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=num_groups, weight_attr=ParamAttr( learning_rate=conv_lr, initializer=KaimingNormal()), bias_attr=False) if norm_type in ['bn', 'sync_bn']: self._batch_norm = nn.BatchNorm2D(out_channels) def forward(self, x): x = self._conv(x) x = self._batch_norm(x) if self.act == "relu": x = F.relu(x) elif self.act == "relu6": x = F.relu6(x) elif self.act == 'leaky': x = F.leaky_relu(x) elif self.act == 'hard_swish': x = hard_swish(x) return x class BlazeBlock(nn.Layer): def __init__(self, in_channels, out_channels1, out_channels2, double_channels=None, stride=1, use_5x5kernel=True, act='relu', name=None): super(BlazeBlock, self).__init__() assert stride in [1, 2] self.use_pool = not stride == 1 self.use_double_block = double_channels is not None self.conv_dw = [] if use_5x5kernel: self.conv_dw.append( self.add_sublayer( name + "1_dw", ConvBNLayer( in_channels=in_channels, out_channels=out_channels1, kernel_size=5, stride=stride, padding=2, num_groups=out_channels1, name=name + "1_dw"))) else: self.conv_dw.append( self.add_sublayer( name + "1_dw_1", ConvBNLayer( in_channels=in_channels, out_channels=out_channels1, kernel_size=3, stride=1, padding=1, num_groups=out_channels1, name=name + "1_dw_1"))) self.conv_dw.append( self.add_sublayer( name + "1_dw_2", ConvBNLayer( in_channels=out_channels1, out_channels=out_channels1, kernel_size=3, stride=stride, padding=1, num_groups=out_channels1, name=name + "1_dw_2"))) self.act = act if self.use_double_block else None self.conv_pw = ConvBNLayer( in_channels=out_channels1, out_channels=out_channels2, kernel_size=1, stride=1, padding=0, act=self.act, name=name + "1_sep") if self.use_double_block: self.conv_dw2 = [] if use_5x5kernel: self.conv_dw2.append( self.add_sublayer( name + "2_dw", ConvBNLayer( in_channels=out_channels2, out_channels=out_channels2, kernel_size=5, stride=1, padding=2, num_groups=out_channels2, name=name + "2_dw"))) else: self.conv_dw2.append( self.add_sublayer( name + "2_dw_1", ConvBNLayer( in_channels=out_channels2, out_channels=out_channels2, kernel_size=3, stride=1, padding=1, num_groups=out_channels2, name=name + "1_dw_1"))) self.conv_dw2.append( self.add_sublayer( name + "2_dw_2", ConvBNLayer( in_channels=out_channels2, out_channels=out_channels2, kernel_size=3, stride=1, padding=1, num_groups=out_channels2, name=name + "2_dw_2"))) self.conv_pw2 = ConvBNLayer( in_channels=out_channels2, out_channels=double_channels, kernel_size=1, stride=1, padding=0, name=name + "2_sep") # shortcut if self.use_pool: shortcut_channel = double_channels or out_channels2 self._shortcut = [] self._shortcut.append( self.add_sublayer( name + '_shortcut_pool', nn.MaxPool2D( kernel_size=stride, stride=stride, ceil_mode=True))) self._shortcut.append( self.add_sublayer( name + '_shortcut_conv', ConvBNLayer( in_channels=in_channels, out_channels=shortcut_channel, kernel_size=1, stride=1, padding=0, name="shortcut" + name))) def forward(self, x): y = x for conv_dw_block in self.conv_dw: y = conv_dw_block(y) y = self.conv_pw(y) if self.use_double_block: for conv_dw2_block in self.conv_dw2: y = conv_dw2_block(y) y = self.conv_pw2(y) if self.use_pool: for shortcut in self._shortcut: x = shortcut(x) return 
F.relu(paddle.add(x, y)) @register @serializable class BlazeNet(nn.Layer): """ BlazeFace, see https://arxiv.org/abs/1907.05047 Args: blaze_filters (list): number of filter for each blaze block. double_blaze_filters (list): number of filter for each double_blaze block. use_5x5kernel (bool): whether or not filter size is 5x5 in depth-wise conv. """ def __init__( self, blaze_filters=[[24, 24], [24, 24], [24, 48, 2], [48, 48], [48, 48]], double_blaze_filters=[[48, 24, 96, 2], [96, 24, 96], [96, 24, 96], [96, 24, 96, 2], [96, 24, 96], [96, 24, 96]], use_5x5kernel=True, act=None): super(BlazeNet, self).__init__() conv1_num_filters = blaze_filters[0][0] self.conv1 = ConvBNLayer( in_channels=3, out_channels=conv1_num_filters, kernel_size=3, stride=2, padding=1, name="conv1") in_channels = conv1_num_filters self.blaze_block = [] self._out_channels = [] for k, v in enumerate(blaze_filters): assert len(v) in [2, 3], \ "blaze_filters {} not in [2, 3]" if len(v) == 2: self.blaze_block.append( self.add_sublayer( 'blaze_{}'.format(k), BlazeBlock( in_channels, v[0], v[1], use_5x5kernel=use_5x5kernel, act=act, name='blaze_{}'.format(k)))) elif len(v) == 3: self.blaze_block.append( self.add_sublayer( 'blaze_{}'.format(k), BlazeBlock( in_channels, v[0], v[1], stride=v[2], use_5x5kernel=use_5x5kernel, act=act, name='blaze_{}'.format(k)))) in_channels = v[1] for k, v in enumerate(double_blaze_filters): assert len(v) in [3, 4], \ "blaze_filters {} not in [3, 4]" if len(v) == 3: self.blaze_block.append( self.add_sublayer( 'double_blaze_{}'.format(k), BlazeBlock( in_channels, v[0], v[1], double_channels=v[2], use_5x5kernel=use_5x5kernel, act=act, name='double_blaze_{}'.format(k)))) elif len(v) == 4: self.blaze_block.append( self.add_sublayer( 'double_blaze_{}'.format(k), BlazeBlock( in_channels, v[0], v[1], double_channels=v[2], stride=v[3], use_5x5kernel=use_5x5kernel, act=act, name='double_blaze_{}'.format(k)))) in_channels = v[2] self._out_channels.append(in_channels) def forward(self, inputs): outs = [] y = self.conv1(inputs['image']) for block in self.blaze_block: y = block(y) outs.append(y) return [outs[-4], outs[-1]] @property def out_shape(self): return [ ShapeSpec(channels=c) for c in [self._out_channels[-4], self._out_channels[-1]] ] ================================================ FILE: ppdet/modeling/backbones/clrnet_resnet.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
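# Usage sketch (illustrative; CLRResNet and the ResNet variants it wraps are
# defined below in this file): the backbone consumes a raw image tensor and
# returns the four stage feature maps, e.g.
#
#   backbone = CLRResNet(resnet='resnet18', pretrained=False)
#   feats = backbone(paddle.rand([1, 3, 320, 800]))  # list of 4 stage outputs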
from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn from paddle.utils.download import get_weights_path_from_url from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec __all__ = ['CLRResNet'] model_urls = { 'resnet18': 'https://x2paddle.bj.bcebos.com/vision/models/resnet18-pt.pdparams', 'resnet34': 'https://x2paddle.bj.bcebos.com/vision/models/resnet34-pt.pdparams', 'resnet50': 'https://x2paddle.bj.bcebos.com/vision/models/resnet50-pt.pdparams', 'resnet101': 'https://x2paddle.bj.bcebos.com/vision/models/resnet101-pt.pdparams', 'resnet152': 'https://x2paddle.bj.bcebos.com/vision/models/resnet152-pt.pdparams', 'resnext50_32x4d': 'https://x2paddle.bj.bcebos.com/vision/models/resnext50_32x4d-pt.pdparams', 'resnext101_32x8d': 'https://x2paddle.bj.bcebos.com/vision/models/resnext101_32x8d-pt.pdparams', 'wide_resnet50_2': 'https://x2paddle.bj.bcebos.com/vision/models/wide_resnet50_2-pt.pdparams', 'wide_resnet101_2': 'https://x2paddle.bj.bcebos.com/vision/models/wide_resnet101_2-pt.pdparams', } class BasicBlock(nn.Layer): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None): super(BasicBlock, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2D if dilation > 1: raise NotImplementedError( "Dilation > 1 not supported in BasicBlock") self.conv1 = nn.Conv2D( inplanes, planes, 3, padding=1, stride=stride, bias_attr=False) self.bn1 = norm_layer(planes) self.relu = nn.ReLU() self.conv2 = nn.Conv2D(planes, planes, 3, padding=1, bias_attr=False) self.bn2 = norm_layer(planes) self.downsample = downsample self.stride = stride def forward(self, x): identity = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class BottleneckBlock(nn.Layer): expansion = 4 def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None): super(BottleneckBlock, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2D width = int(planes * (base_width / 64.)) * groups self.conv1 = nn.Conv2D(inplanes, width, 1, bias_attr=False) self.bn1 = norm_layer(width) self.conv2 = nn.Conv2D( width, width, 3, padding=dilation, stride=stride, groups=groups, dilation=dilation, bias_attr=False) self.bn2 = norm_layer(width) self.conv3 = nn.Conv2D( width, planes * self.expansion, 1, bias_attr=False) self.bn3 = norm_layer(planes * self.expansion) self.relu = nn.ReLU() self.downsample = downsample self.stride = stride def forward(self, x): identity = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out = self.relu(out) out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class ResNet(nn.Layer): """ResNet model from `"Deep Residual Learning for Image Recognition" `_. Args: Block (BasicBlock|BottleneckBlock): Block module of model. depth (int, optional): Layers of ResNet, Default: 50. width (int, optional): Base width per convolution group for each convolution block, Default: 64. num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer will not be defined. Default: 1000. with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. 
groups (int, optional): Number of groups for each convolution block, Default: 1. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNet model. Examples: .. code-block:: python import paddle from paddle.vision.models import ResNet from paddle.vision.models.resnet import BottleneckBlock, BasicBlock # build ResNet with 18 layers resnet18 = ResNet(BasicBlock, 18) # build ResNet with 50 layers resnet50 = ResNet(BottleneckBlock, 50) # build Wide ResNet model wide_resnet50_2 = ResNet(BottleneckBlock, 50, width=64*2) # build ResNeXt model resnext50_32x4d = ResNet(BottleneckBlock, 50, width=4, groups=32) x = paddle.rand([1, 3, 224, 224]) out = resnet18(x) print(out.shape) # [1, 1000] """ def __init__(self, block, depth=50, width=64, with_pool=True, groups=1): super(ResNet, self).__init__() layer_cfg = { 18: [2, 2, 2, 2], 34: [3, 4, 6, 3], 50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3] } layers = layer_cfg[depth] self.groups = groups self.base_width = width self.with_pool = with_pool self._norm_layer = nn.BatchNorm2D self.inplanes = 64 self.dilation = 1 self.conv1 = nn.Conv2D( 3, self.inplanes, kernel_size=7, stride=2, padding=3, bias_attr=False) self.bn1 = self._norm_layer(self.inplanes) self.relu = nn.ReLU() self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=2) self.layer4 = self._make_layer(block, 512, layers[3], stride=2) if with_pool: self.avgpool = nn.AdaptiveAvgPool2D((1, 1)) ch_out_list = [64, 128, 256, 512] block = BottleneckBlock if depth >= 50 else BasicBlock self._out_channels = [block.expansion * v for v in ch_out_list] self._out_strides = [4, 8, 16, 32] self.return_idx = [0, 1, 2, 3] def _make_layer(self, block, planes, blocks, stride=1, dilate=False): norm_layer = self._norm_layer downsample = None previous_dilation = self.dilation if dilate: self.dilation *= stride stride = 1 if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( nn.Conv2D( self.inplanes, planes * block.expansion, 1, stride=stride, bias_attr=False), norm_layer(planes * block.expansion), ) layers = [] layers.append( block(self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer)) self.inplanes = planes * block.expansion for _ in range(1, blocks): layers.append( block( self.inplanes, planes, groups=self.groups, base_width=self.base_width, norm_layer=norm_layer)) return nn.Sequential(*layers) @property def out_shape(self): return [ ShapeSpec( channels=self._out_channels[i], stride=self._out_strides[i]) for i in self.return_idx ] def forward(self, x): x = self.conv1(x) x = self.bn1(x) x = self.relu(x) x = self.maxpool(x) out_layers = [] x = self.layer1(x) out_layers.append(x) x = self.layer2(x) out_layers.append(x) x = self.layer3(x) out_layers.append(x) x = self.layer4(x) out_layers.append(x) if self.with_pool: x = self.avgpool(x) return out_layers @register @serializable class CLRResNet(nn.Layer): def __init__(self, resnet='resnet18', pretrained=True, out_conv=False, fea_stride=8, out_channel=128, in_channels=[64, 128, 256, 512], cfg=None): super(CLRResNet, self).__init__() self.cfg = cfg self.in_channels = in_channels self.model = eval(resnet)(pretrained=pretrained) self.out = None if out_conv: out_channel = 512 for chan in reversed(self.in_channels): if chan < 0: continue out_channel = chan break self.out = nn.Conv2D( out_channel * 
self.model.expansion, cfg.featuremap_out_channel, kernel_size=1, bias_attr=False) @property def out_shape(self): return self.model.out_shape def forward(self, x): x = self.model(x) if self.out: x[-1] = self.out(x[-1]) return x def _resnet(arch, Block, depth, pretrained, **kwargs): model = ResNet(Block, depth, **kwargs) if pretrained: assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format( arch) weight_path = get_weights_path_from_url(model_urls[arch]) param = paddle.load(weight_path) model.set_dict(param) return model def resnet18(pretrained=False, **kwargs): """ResNet 18-layer model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNet 18-layer model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnet18 # build model model = resnet18() # build model and load imagenet pretrained weight # model = resnet18(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ return _resnet('resnet18', BasicBlock, 18, pretrained, **kwargs) def resnet34(pretrained=False, **kwargs): """ResNet 34-layer model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNet 34-layer model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnet34 # build model model = resnet34() # build model and load imagenet pretrained weight # model = resnet34(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ return _resnet('resnet34', BasicBlock, 34, pretrained, **kwargs) def resnet50(pretrained=False, **kwargs): """ResNet 50-layer model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNet 50-layer model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnet50 # build model model = resnet50() # build model and load imagenet pretrained weight # model = resnet50(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ return _resnet('resnet50', BottleneckBlock, 50, pretrained, **kwargs) def resnet101(pretrained=False, **kwargs): """ResNet 101-layer model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNet 101-layer. Examples: .. 
code-block:: python import paddle from paddle.vision.models import resnet101 # build model model = resnet101() # build model and load imagenet pretrained weight # model = resnet101(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ return _resnet('resnet101', BottleneckBlock, 101, pretrained, **kwargs) def resnet152(pretrained=False, **kwargs): """ResNet 152-layer model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNet 152-layer model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnet152 # build model model = resnet152() # build model and load imagenet pretrained weight # model = resnet152(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ return _resnet('resnet152', BottleneckBlock, 152, pretrained, **kwargs) def resnext50_32x4d(pretrained=False, **kwargs): """ResNeXt-50 32x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-50 32x4d model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnext50_32x4d # build model model = resnext50_32x4d() # build model and load imagenet pretrained weight # model = resnext50_32x4d(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ kwargs['groups'] = 32 kwargs['width'] = 4 return _resnet('resnext50_32x4d', BottleneckBlock, 50, pretrained, **kwargs) def resnext50_64x4d(pretrained=False, **kwargs): """ResNeXt-50 64x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-50 64x4d model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnext50_64x4d # build model model = resnext50_64x4d() # build model and load imagenet pretrained weight # model = resnext50_64x4d(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ kwargs['groups'] = 64 kwargs['width'] = 4 return _resnet('resnext50_64x4d', BottleneckBlock, 50, pretrained, **kwargs) def resnext101_32x4d(pretrained=False, **kwargs): """ResNeXt-101 32x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-101 32x4d model. Examples: .. 
code-block:: python import paddle from paddle.vision.models import resnext101_32x4d # build model model = resnext101_32x4d() # build model and load imagenet pretrained weight # model = resnext101_32x4d(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ kwargs['groups'] = 32 kwargs['width'] = 4 return _resnet('resnext101_32x4d', BottleneckBlock, 101, pretrained, **kwargs) def resnext101_64x4d(pretrained=False, **kwargs): """ResNeXt-101 64x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-101 64x4d model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnext101_64x4d # build model model = resnext101_64x4d() # build model and load imagenet pretrained weight # model = resnext101_64x4d(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ kwargs['groups'] = 64 kwargs['width'] = 4 return _resnet('resnext101_64x4d', BottleneckBlock, 101, pretrained, **kwargs) def resnext152_32x4d(pretrained=False, **kwargs): """ResNeXt-152 32x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-152 32x4d model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnext152_32x4d # build model model = resnext152_32x4d() # build model and load imagenet pretrained weight # model = resnext152_32x4d(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ kwargs['groups'] = 32 kwargs['width'] = 4 return _resnet('resnext152_32x4d', BottleneckBlock, 152, pretrained, **kwargs) def resnext152_64x4d(pretrained=False, **kwargs): """ResNeXt-152 64x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-152 64x4d model. Examples: .. code-block:: python import paddle from paddle.vision.models import resnext152_64x4d # build model model = resnext152_64x4d() # build model and load imagenet pretrained weight # model = resnext152_64x4d(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ kwargs['groups'] = 64 kwargs['width'] = 4 return _resnet('resnext152_64x4d', BottleneckBlock, 152, pretrained, **kwargs) def wide_resnet50_2(pretrained=False, **kwargs): """Wide ResNet-50-2 model from `"Wide Residual Networks" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of Wide ResNet-50-2 model. Examples: .. 
code-block:: python import paddle from paddle.vision.models import wide_resnet50_2 # build model model = wide_resnet50_2() # build model and load imagenet pretrained weight # model = wide_resnet50_2(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ kwargs['width'] = 64 * 2 return _resnet('wide_resnet50_2', BottleneckBlock, 50, pretrained, **kwargs) def wide_resnet101_2(pretrained=False, **kwargs): """Wide ResNet-101-2 model from `"Wide Residual Networks" `_. Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. Returns: :ref:`api_paddle_nn_Layer`. An instance of Wide ResNet-101-2 model. Examples: .. code-block:: python import paddle from paddle.vision.models import wide_resnet101_2 # build model model = wide_resnet101_2() # build model and load imagenet pretrained weight # model = wide_resnet101_2(pretrained=True) x = paddle.rand([1, 3, 224, 224]) out = model(x) print(out.shape) # [1, 1000] """ kwargs['width'] = 64 * 2 return _resnet('wide_resnet101_2', BottleneckBlock, 101, pretrained, **kwargs) ================================================ FILE: ppdet/modeling/backbones/convnext.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ''' Modified from https://github.com/facebookresearch/ConvNeXt Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. This source code is licensed under the license found in the LICENSE file in the root directory of this source tree. ''' import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Constant import numpy as np from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec from .transformer_utils import DropPath, trunc_normal_, zeros_ __all__ = ['ConvNeXt'] class Block(nn.Layer): r""" ConvNeXt Block. There are two equivalent implementations: (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back We use (2) as we find it slightly faster in Pypaddle Args: dim (int): Number of input channels. drop_path (float): Stochastic depth rate. Default: 0.0 layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 
""" def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): super().__init__() self.dwconv = nn.Conv2D( dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv self.norm = LayerNorm(dim, eps=1e-6) self.pwconv1 = nn.Linear( dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers self.act = nn.GELU() self.pwconv2 = nn.Linear(4 * dim, dim) if layer_scale_init_value > 0: self.gamma = self.create_parameter( shape=(dim, ), attr=ParamAttr(initializer=Constant(layer_scale_init_value))) else: self.gamma = None self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity( ) def forward(self, x): input = x x = self.dwconv(x) x = x.transpose([0, 2, 3, 1]) x = self.norm(x) x = self.pwconv1(x) x = self.act(x) x = self.pwconv2(x) if self.gamma is not None: x = self.gamma * x x = x.transpose([0, 3, 1, 2]) x = input + self.drop_path(x) return x class LayerNorm(nn.Layer): r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height, width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width). """ def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): super().__init__() self.weight = self.create_parameter( shape=(normalized_shape, ), attr=ParamAttr(initializer=Constant(1.))) self.bias = self.create_parameter( shape=(normalized_shape, ), attr=ParamAttr(initializer=Constant(0.))) self.eps = eps self.data_format = data_format if self.data_format not in ["channels_last", "channels_first"]: raise NotImplementedError self.normalized_shape = (normalized_shape, ) def forward(self, x): if self.data_format == "channels_last": return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) elif self.data_format == "channels_first": u = x.mean(1, keepdim=True) s = (x - u).pow(2).mean(1, keepdim=True) x = (x - u) / paddle.sqrt(s + self.eps) x = self.weight[:, None, None] * x + self.bias[:, None, None] return x @register @serializable class ConvNeXt(nn.Layer): r""" ConvNeXt A Pypaddle impl of : `A ConvNet for the 2020s` - https://arxiv.org/pdf/2201.03545.pdf Args: in_chans (int): Number of input image channels. Default: 3 depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] drop_path_rate (float): Stochastic depth rate. Default: 0. layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 
""" arch_settings = { 'tiny': { 'depths': [3, 3, 9, 3], 'dims': [96, 192, 384, 768] }, 'small': { 'depths': [3, 3, 27, 3], 'dims': [96, 192, 384, 768] }, 'base': { 'depths': [3, 3, 27, 3], 'dims': [128, 256, 512, 1024] }, 'large': { 'depths': [3, 3, 27, 3], 'dims': [192, 384, 768, 1536] }, 'xlarge': { 'depths': [3, 3, 27, 3], 'dims': [256, 512, 1024, 2048] }, } def __init__( self, arch='tiny', in_chans=3, drop_path_rate=0., layer_scale_init_value=1e-6, return_idx=[1, 2, 3], norm_output=True, pretrained=None, ): super().__init__() depths = self.arch_settings[arch]['depths'] dims = self.arch_settings[arch]['dims'] self.downsample_layers = nn.LayerList( ) # stem and 3 intermediate downsampling conv layers stem = nn.Sequential( nn.Conv2D( in_chans, dims[0], kernel_size=4, stride=4), LayerNorm( dims[0], eps=1e-6, data_format="channels_first")) self.downsample_layers.append(stem) for i in range(3): downsample_layer = nn.Sequential( LayerNorm( dims[i], eps=1e-6, data_format="channels_first"), nn.Conv2D( dims[i], dims[i + 1], kernel_size=2, stride=2), ) self.downsample_layers.append(downsample_layer) self.stages = nn.LayerList( ) # 4 feature resolution stages, each consisting of multiple residual blocks dp_rates = [x for x in np.linspace(0, drop_path_rate, sum(depths))] cur = 0 for i in range(4): stage = nn.Sequential(* [ Block( dim=dims[i], drop_path=dp_rates[cur + j], layer_scale_init_value=layer_scale_init_value) for j in range(depths[i]) ]) self.stages.append(stage) cur += depths[i] self.return_idx = return_idx self.dims = [dims[i] for i in return_idx] # [::-1] self.norm_output = norm_output if norm_output: self.norms = nn.LayerList([ LayerNorm( c, eps=1e-6, data_format="channels_first") for c in self.dims ]) self.apply(self._init_weights) if pretrained is not None: if 'http' in pretrained: #URL path = paddle.utils.download.get_weights_path_from_url( pretrained) else: #model in local path path = pretrained self.set_state_dict(paddle.load(path)) def _init_weights(self, m): if isinstance(m, (nn.Conv2D, nn.Linear)): trunc_normal_(m.weight) zeros_(m.bias) def forward_features(self, x): output = [] for i in range(4): x = self.downsample_layers[i](x) x = self.stages[i](x) output.append(x) outputs = [output[i] for i in self.return_idx] if self.norm_output: outputs = [self.norms[i](out) for i, out in enumerate(outputs)] return outputs def forward(self, x): x = self.forward_features(x['image']) return x @property def out_shape(self): return [ShapeSpec(channels=c) for c in self.dims] ================================================ FILE: ppdet/modeling/backbones/csp_darknet.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register, serializable from ppdet.modeling.initializer import conv_init_ from ..shape_spec import ShapeSpec __all__ = [ 'CSPDarkNet', 'BaseConv', 'DWConv', 'BottleNeck', 'SPPLayer', 'SPPFLayer' ] class BaseConv(nn.Layer): def __init__(self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu"): super(BaseConv, self).__init__() self.conv = nn.Conv2D( in_channels, out_channels, kernel_size=ksize, stride=stride, padding=(ksize - 1) // 2, groups=groups, bias_attr=bias) self.bn = nn.BatchNorm2D( out_channels, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self._init_weights() def _init_weights(self): conv_init_(self.conv) def forward(self, x): # use 'x * F.sigmoid(x)' replace 'silu' x = self.bn(self.conv(x)) y = x * F.sigmoid(x) return y class DWConv(nn.Layer): """Depthwise Conv""" def __init__(self, in_channels, out_channels, ksize, stride=1, bias=False, act="silu"): super(DWConv, self).__init__() self.dw_conv = BaseConv( in_channels, in_channels, ksize=ksize, stride=stride, groups=in_channels, bias=bias, act=act) self.pw_conv = BaseConv( in_channels, out_channels, ksize=1, stride=1, groups=1, bias=bias, act=act) def forward(self, x): return self.pw_conv(self.dw_conv(x)) class Focus(nn.Layer): """Focus width and height information into channel space, used in YOLOX.""" def __init__(self, in_channels, out_channels, ksize=3, stride=1, bias=False, act="silu"): super(Focus, self).__init__() self.conv = BaseConv( in_channels * 4, out_channels, ksize=ksize, stride=stride, bias=bias, act=act) def forward(self, inputs): # inputs [bs, C, H, W] -> outputs [bs, 4C, W/2, H/2] top_left = inputs[:, :, 0::2, 0::2] top_right = inputs[:, :, 0::2, 1::2] bottom_left = inputs[:, :, 1::2, 0::2] bottom_right = inputs[:, :, 1::2, 1::2] outputs = paddle.concat( [top_left, bottom_left, top_right, bottom_right], 1) return self.conv(outputs) class BottleNeck(nn.Layer): def __init__(self, in_channels, out_channels, shortcut=True, expansion=0.5, depthwise=False, bias=False, act="silu"): super(BottleNeck, self).__init__() hidden_channels = int(out_channels * expansion) Conv = DWConv if depthwise else BaseConv self.conv1 = BaseConv( in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) self.conv2 = Conv( hidden_channels, out_channels, ksize=3, stride=1, bias=bias, act=act) self.add_shortcut = shortcut and in_channels == out_channels def forward(self, x): y = self.conv2(self.conv1(x)) if self.add_shortcut: y = y + x return y class SPPLayer(nn.Layer): """Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX""" def __init__(self, in_channels, out_channels, kernel_sizes=(5, 9, 13), bias=False, act="silu"): super(SPPLayer, self).__init__() hidden_channels = in_channels // 2 self.conv1 = BaseConv( in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) self.maxpoolings = nn.LayerList([ nn.MaxPool2D( kernel_size=ks, stride=1, padding=ks // 2) for ks in kernel_sizes ]) conv2_channels = hidden_channels * (len(kernel_sizes) + 1) self.conv2 = BaseConv( conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act) def forward(self, x): x = self.conv1(x) x = paddle.concat([x] + [mp(x) for mp in self.maxpoolings], axis=1) x = self.conv2(x) return x class SPPFLayer(nn.Layer): """ Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn Jocher, 
equivalent to SPP(k=(5, 9, 13)) """ def __init__(self, in_channels, out_channels, ksize=5, bias=False, act='silu'): super(SPPFLayer, self).__init__() hidden_channels = in_channels // 2 self.conv1 = BaseConv( in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) self.maxpooling = nn.MaxPool2D( kernel_size=ksize, stride=1, padding=ksize // 2) conv2_channels = hidden_channels * 4 self.conv2 = BaseConv( conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act) def forward(self, x): x = self.conv1(x) y1 = self.maxpooling(x) y2 = self.maxpooling(y1) y3 = self.maxpooling(y2) concats = paddle.concat([x, y1, y2, y3], axis=1) out = self.conv2(concats) return out class CSPLayer(nn.Layer): """CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5""" def __init__(self, in_channels, out_channels, num_blocks=1, shortcut=True, expansion=0.5, depthwise=False, bias=False, act="silu"): super(CSPLayer, self).__init__() hidden_channels = int(out_channels * expansion) self.conv1 = BaseConv( in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) self.conv2 = BaseConv( in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) self.bottlenecks = nn.Sequential(* [ BottleNeck( hidden_channels, hidden_channels, shortcut=shortcut, expansion=1.0, depthwise=depthwise, bias=bias, act=act) for _ in range(num_blocks) ]) self.conv3 = BaseConv( hidden_channels * 2, out_channels, ksize=1, stride=1, bias=bias, act=act) def forward(self, x): x_1 = self.conv1(x) x_1 = self.bottlenecks(x_1) x_2 = self.conv2(x) x = paddle.concat([x_1, x_2], axis=1) x = self.conv3(x) return x @register @serializable class CSPDarkNet(nn.Layer): """ CSPDarkNet backbone. Args: arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X, and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5. depth_mult (float): Depth multiplier, multiply number of blocks in CSPLayer, default as 1.0. width_mult (float): Width multiplier, multiply number of channels in each layer, default as 1.0. depthwise (bool): Whether to use depth-wise conv layer. act (str): Activation function type, default as 'silu'. return_idx (list): Index of stages whose feature maps are returned. """ __shared__ = ['depth_mult', 'width_mult', 'act', 'trt'] # in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf) # 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5. arch_settings = { 'X': [[64, 128, 3, True, False], [128, 256, 9, True, False], [256, 512, 9, True, False], [512, 1024, 3, False, True]], 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], [256, 512, 9, True, False], [512, 1024, 3, True, True]], 'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False], [256, 512, 9, True, False], [512, 768, 3, True, False], [768, 1024, 3, True, True]], } def __init__(self, arch='X', depth_mult=1.0, width_mult=1.0, depthwise=False, act='silu', trt=False, return_idx=[2, 3, 4]): super(CSPDarkNet, self).__init__() self.arch = arch self.return_idx = return_idx Conv = DWConv if depthwise else BaseConv arch_setting = self.arch_settings[arch] base_channels = int(arch_setting[0][0] * width_mult) # Note: differences between the latest YOLOv5 and the original YOLOX # 1. self.stem, use Conv(in YOLOv5) or Focus(in YOLOX) # 2. use SPPF(in YOLOv5) or SPP(in YOLOX) # 3. put SPPF before(YOLOv5) or SPP after(YOLOX) the last cspdark block's CSPLayer # 4.
whether SPPF(SPP)'CSPLayer add shortcut, True in YOLOv5, False in YOLOX if arch in ['P5', 'P6']: # in the latest YOLOv5, use Conv stem, and SPPF (fast, only single spp kernal size) self.stem = Conv( 3, base_channels, ksize=6, stride=2, bias=False, act=act) spp_kernal_sizes = 5 elif arch in ['X']: # in the original YOLOX, use Focus stem, and SPP (three spp kernal sizes) self.stem = Focus( 3, base_channels, ksize=3, stride=1, bias=False, act=act) spp_kernal_sizes = (5, 9, 13) else: raise AttributeError("Unsupported arch type: {}".format(arch)) _out_channels = [base_channels] layers_num = 1 self.csp_dark_blocks = [] for i, (in_channels, out_channels, num_blocks, shortcut, use_spp) in enumerate(arch_setting): in_channels = int(in_channels * width_mult) out_channels = int(out_channels * width_mult) _out_channels.append(out_channels) num_blocks = max(round(num_blocks * depth_mult), 1) stage = [] conv_layer = self.add_sublayer( 'layers{}.stage{}.conv_layer'.format(layers_num, i + 1), Conv( in_channels, out_channels, 3, 2, bias=False, act=act)) stage.append(conv_layer) layers_num += 1 if use_spp and arch in ['X']: # in YOLOX use SPPLayer spp_layer = self.add_sublayer( 'layers{}.stage{}.spp_layer'.format(layers_num, i + 1), SPPLayer( out_channels, out_channels, kernel_sizes=spp_kernal_sizes, bias=False, act=act)) stage.append(spp_layer) layers_num += 1 csp_layer = self.add_sublayer( 'layers{}.stage{}.csp_layer'.format(layers_num, i + 1), CSPLayer( out_channels, out_channels, num_blocks=num_blocks, shortcut=shortcut, depthwise=depthwise, bias=False, act=act)) stage.append(csp_layer) layers_num += 1 if use_spp and arch in ['P5', 'P6']: # in latest YOLOv5 use SPPFLayer instead of SPPLayer sppf_layer = self.add_sublayer( 'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1), SPPFLayer( out_channels, out_channels, ksize=5, bias=False, act=act)) stage.append(sppf_layer) layers_num += 1 self.csp_dark_blocks.append(nn.Sequential(*stage)) self._out_channels = [_out_channels[i] for i in self.return_idx] self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx] def forward(self, inputs): x = inputs['image'] outputs = [] x = self.stem(x) for i, layer in enumerate(self.csp_dark_blocks): x = layer(x) if i + 1 in self.return_idx: outputs.append(x) return outputs @property def out_shape(self): return [ ShapeSpec( channels=c, stride=s) for c, s in zip(self._out_channels, self.strides) ] ================================================ FILE: ppdet/modeling/backbones/cspresnet.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
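# Deployment sketch (illustrative): the RepVggBlock defined below trains with
# parallel 3x3 and 1x1 branches and can be fused into a single 3x3 conv for
# inference via convert_to_deploy(), e.g.
#
#   model = CSPResNet(layers=[3, 6, 6, 3])
#   for layer in model.sublayers():
#       if hasattr(layer, 'convert_to_deploy'):
#           layer.convert_to_deploy()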
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from paddle.nn.initializer import Constant from ppdet.modeling.ops import get_act_fn from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec __all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer'] class ConvBNLayer(nn.Layer): def __init__(self, ch_in, ch_out, filter_size=3, stride=1, groups=1, padding=0, act=None): super(ConvBNLayer, self).__init__() self.conv = nn.Conv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=padding, groups=groups, bias_attr=False) self.bn = nn.BatchNorm2D( ch_out, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self.act = get_act_fn(act) if act is None or isinstance(act, ( str, dict)) else act def forward(self, x): x = self.conv(x) x = self.bn(x) x = self.act(x) return x class RepVggBlock(nn.Layer): def __init__(self, ch_in, ch_out, act='relu', alpha=False): super(RepVggBlock, self).__init__() self.ch_in = ch_in self.ch_out = ch_out self.conv1 = ConvBNLayer( ch_in, ch_out, 3, stride=1, padding=1, act=None) self.conv2 = ConvBNLayer( ch_in, ch_out, 1, stride=1, padding=0, act=None) self.act = get_act_fn(act) if act is None or isinstance(act, ( str, dict)) else act if alpha: self.alpha = self.create_parameter( shape=[1], attr=ParamAttr(initializer=Constant(value=1.)), dtype="float32") else: self.alpha = None def forward(self, x): if hasattr(self, 'conv'): y = self.conv(x) else: if self.alpha: y = self.conv1(x) + self.alpha * self.conv2(x) else: y = self.conv1(x) + self.conv2(x) y = self.act(y) return y def convert_to_deploy(self): if not hasattr(self, 'conv'): self.conv = nn.Conv2D( in_channels=self.ch_in, out_channels=self.ch_out, kernel_size=3, stride=1, padding=1, groups=1) kernel, bias = self.get_equivalent_kernel_bias() self.conv.weight.set_value(kernel) self.conv.bias.set_value(bias) self.__delattr__('conv1') self.__delattr__('conv2') def get_equivalent_kernel_bias(self): kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1) kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2) if self.alpha: return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor( kernel1x1), bias3x3 + self.alpha * bias1x1 else: return kernel3x3 + self._pad_1x1_to_3x3_tensor( kernel1x1), bias3x3 + bias1x1 def _pad_1x1_to_3x3_tensor(self, kernel1x1): if kernel1x1 is None: return 0 else: return nn.functional.pad(kernel1x1, [1, 1, 1, 1]) def _fuse_bn_tensor(self, branch): if branch is None: return 0, 0 kernel = branch.conv.weight running_mean = branch.bn._mean running_var = branch.bn._variance gamma = branch.bn.weight beta = branch.bn.bias eps = branch.bn._epsilon std = (running_var + eps).sqrt() t = (gamma / std).reshape((-1, 1, 1, 1)) return kernel * t, beta - running_mean * gamma / std class BasicBlock(nn.Layer): def __init__(self, ch_in, ch_out, act='relu', shortcut=True, use_alpha=False): super(BasicBlock, self).__init__() assert ch_in == ch_out self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act) self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha) self.shortcut = shortcut def forward(self, x): y = self.conv1(x) y = self.conv2(y) if self.shortcut: return paddle.add(x, y) else: return y class EffectiveSELayer(nn.Layer): """ Effective Squeeze-Excitation From `CenterMask : 
Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 """ def __init__(self, channels, act='hardsigmoid'): super(EffectiveSELayer, self).__init__() self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0) self.act = get_act_fn(act) if act is None or isinstance(act, ( str, dict)) else act def forward(self, x): x_se = x.mean((2, 3), keepdim=True) x_se = self.fc(x_se) return x * self.act(x_se) class CSPResStage(nn.Layer): def __init__(self, block_fn, ch_in, ch_out, n, stride, act='relu', attn='eca', use_alpha=False): super(CSPResStage, self).__init__() ch_mid = (ch_in + ch_out) // 2 if stride == 2: self.conv_down = ConvBNLayer( ch_in, ch_mid, 3, stride=2, padding=1, act=act) else: self.conv_down = None self.conv1 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) self.conv2 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) self.blocks = nn.Sequential(*[ block_fn( ch_mid // 2, ch_mid // 2, act=act, shortcut=True, use_alpha=use_alpha) for i in range(n) ]) if attn: self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid') else: self.attn = None self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act) def forward(self, x): if self.conv_down is not None: x = self.conv_down(x) y1 = self.conv1(x) y2 = self.blocks(self.conv2(x)) y = paddle.concat([y1, y2], axis=1) if self.attn is not None: y = self.attn(y) y = self.conv3(y) return y @register @serializable class CSPResNet(nn.Layer): __shared__ = ['width_mult', 'depth_mult', 'trt'] def __init__(self, layers=[3, 6, 6, 3], channels=[64, 128, 256, 512, 1024], act='swish', return_idx=[1, 2, 3], depth_wise=False, use_large_stem=False, width_mult=1.0, depth_mult=1.0, trt=False, use_checkpoint=False, use_alpha=False, **args): super(CSPResNet, self).__init__() self.use_checkpoint = use_checkpoint channels = [max(round(c * width_mult), 1) for c in channels] layers = [max(round(l * depth_mult), 1) for l in layers] act = get_act_fn( act, trt=trt) if act is None or isinstance(act, (str, dict)) else act if use_large_stem: self.stem = nn.Sequential( ('conv1', ConvBNLayer( 3, channels[0] // 2, 3, stride=2, padding=1, act=act)), ('conv2', ConvBNLayer( channels[0] // 2, channels[0] // 2, 3, stride=1, padding=1, act=act)), ('conv3', ConvBNLayer( channels[0] // 2, channels[0], 3, stride=1, padding=1, act=act))) else: self.stem = nn.Sequential( ('conv1', ConvBNLayer( 3, channels[0] // 2, 3, stride=2, padding=1, act=act)), ('conv2', ConvBNLayer( channels[0] // 2, channels[0], 3, stride=1, padding=1, act=act))) n = len(channels) - 1 self.stages = nn.Sequential(*[(str(i), CSPResStage( BasicBlock, channels[i], channels[i + 1], layers[i], 2, act=act, use_alpha=use_alpha)) for i in range(n)]) self._out_channels = channels[1:] self._out_strides = [4 * 2**i for i in range(n)] self.return_idx = return_idx if use_checkpoint: paddle.seed(0) def forward(self, inputs): x = inputs['image'] x = self.stem(x) outs = [] for idx, stage in enumerate(self.stages): if self.use_checkpoint and self.training: x = paddle.distributed.fleet.utils.recompute( stage, x, **{"preserve_rng_state": True}) else: x = stage(x) if idx in self.return_idx: outs.append(x) return outs @property def out_shape(self): return [ ShapeSpec( channels=self._out_channels[i], stride=self._out_strides[i]) for i in self.return_idx ] ================================================ FILE: ppdet/modeling/backbones/darknet.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.modeling.ops import batch_norm, mish from ..shape_spec import ShapeSpec __all__ = ['DarkNet', 'ConvBNLayer'] class ConvBNLayer(nn.Layer): def __init__(self, ch_in, ch_out, filter_size=3, stride=1, groups=1, padding=0, norm_type='bn', norm_decay=0., act="leaky", freeze_norm=False, data_format='NCHW', name=''): """ conv + bn + activation layer Args: ch_in (int): input channel ch_out (int): output channel filter_size (int): filter size, default 3 stride (int): stride, default 1 groups (int): number of groups of conv layer, default 1 padding (int): padding size, default 0 norm_type (str): batch norm type, default bn norm_decay (str): decay for weight and bias of batch norm layer, default 0. act (str): activation function type, default 'leaky', which means leaky_relu freeze_norm (bool): whether to freeze norm, default False data_format (str): data format, NCHW or NHWC """ super(ConvBNLayer, self).__init__() self.conv = nn.Conv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=padding, groups=groups, data_format=data_format, bias_attr=False) self.batch_norm = batch_norm( ch_out, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format) self.act = act def forward(self, inputs): out = self.conv(inputs) out = self.batch_norm(out) if self.act == 'leaky': out = F.leaky_relu(out, 0.1) else: out = getattr(F, self.act)(out) return out class DownSample(nn.Layer): def __init__(self, ch_in, ch_out, filter_size=3, stride=2, padding=1, norm_type='bn', norm_decay=0., freeze_norm=False, data_format='NCHW'): """ downsample layer Args: ch_in (int): input channel ch_out (int): output channel filter_size (int): filter size, default 3 stride (int): stride, default 2 padding (int): padding size, default 1 norm_type (str): batch norm type, default bn norm_decay (str): decay for weight and bias of batch norm layer, default 0. freeze_norm (bool): whether to freeze norm, default False data_format (str): data format, NCHW or NHWC """ super(DownSample, self).__init__() self.conv_bn_layer = ConvBNLayer( ch_in=ch_in, ch_out=ch_out, filter_size=filter_size, stride=stride, padding=padding, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format) self.ch_out = ch_out def forward(self, inputs): out = self.conv_bn_layer(inputs) return out class BasicBlock(nn.Layer): def __init__(self, ch_in, ch_out, norm_type='bn', norm_decay=0., freeze_norm=False, data_format='NCHW'): """ BasicBlock layer of DarkNet Args: ch_in (int): input channel ch_out (int): output channel norm_type (str): batch norm type, default bn norm_decay (str): decay for weight and bias of batch norm layer, default 0. 
freeze_norm (bool): whether to freeze norm, default False data_format (str): data format, NCHW or NHWC """ super(BasicBlock, self).__init__() assert ch_in == ch_out and (ch_in % 2) == 0, \ f"ch_in and ch_out should be the same even integer, but got ch_in={ch_in}, ch_out={ch_out}" # example: # --------------{conv1} --> {conv2} # channel route: 10-->5 --> 5-->10 self.conv1 = ConvBNLayer( ch_in=ch_in, ch_out=int(ch_out / 2), filter_size=1, stride=1, padding=0, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format) self.conv2 = ConvBNLayer( ch_in=int(ch_out / 2), ch_out=ch_out, filter_size=3, stride=1, padding=1, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format) def forward(self, inputs): conv1 = self.conv1(inputs) conv2 = self.conv2(conv1) out = paddle.add(x=inputs, y=conv2) return out class Blocks(nn.Layer): def __init__(self, ch_in, ch_out, count, norm_type='bn', norm_decay=0., freeze_norm=False, name=None, data_format='NCHW'): """ Blocks layer, which consists of several BasicBlock layers Args: ch_in (int): input channel ch_out (int): output channel count (int): number of BasicBlock layers norm_type (str): batch norm type, default bn norm_decay (str): decay for weight and bias of batch norm layer, default 0. freeze_norm (bool): whether to freeze norm, default False name (str): layer name data_format (str): data format, NCHW or NHWC """ super(Blocks, self).__init__() self.basicblock0 = BasicBlock( ch_in, ch_out, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format) self.res_out_list = [] for i in range(1, count): block_name = '{}.{}'.format(name, i) res_out = self.add_sublayer( block_name, BasicBlock( ch_out, ch_out, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format)) self.res_out_list.append(res_out) self.ch_out = ch_out def forward(self, inputs): y = self.basicblock0(inputs) for basic_block_i in self.res_out_list: y = basic_block_i(y) return y DarkNet_cfg = {53: ([1, 2, 8, 8, 4])} @register @serializable class DarkNet(nn.Layer): __shared__ = ['norm_type', 'data_format'] def __init__(self, depth=53, freeze_at=-1, return_idx=[2, 3, 4], num_stages=5, norm_type='bn', norm_decay=0., freeze_norm=False, data_format='NCHW'): """ Darknet, see https://pjreddie.com/darknet/yolo/ Args: depth (int): depth of network freeze_at (int): freeze the backbone at which stage num_stages (int): number of stages to build, default 5 return_idx (list): index of stages whose feature maps are returned norm_type (str): batch norm type, default bn norm_decay (str): decay for weight and bias of batch norm layer, default 0.
data_format (str): data format, NCHW or NHWC """ super(DarkNet, self).__init__() self.depth = depth self.freeze_at = freeze_at self.return_idx = return_idx self.num_stages = num_stages self.stages = DarkNet_cfg[self.depth][0:num_stages] self.conv0 = ConvBNLayer( ch_in=3, ch_out=32, filter_size=3, stride=1, padding=1, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format) self.downsample0 = DownSample( ch_in=32, ch_out=32 * 2, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format) self._out_channels = [] self.darknet_conv_block_list = [] self.downsample_list = [] ch_in = [64, 128, 256, 512, 1024] for i, stage in enumerate(self.stages): name = 'stage.{}'.format(i) conv_block = self.add_sublayer( name, Blocks( int(ch_in[i]), int(ch_in[i]), stage, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format, name=name)) self.darknet_conv_block_list.append(conv_block) if i in return_idx: self._out_channels.append(int(ch_in[i])) for i in range(num_stages - 1): down_name = 'stage.{}.downsample'.format(i) downsample = self.add_sublayer( down_name, DownSample( ch_in=int(ch_in[i]), ch_out=int(ch_in[i + 1]), norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, data_format=data_format)) self.downsample_list.append(downsample) def forward(self, inputs): x = inputs['image'] out = self.conv0(x) out = self.downsample0(out) blocks = [] for i, conv_block_i in enumerate(self.darknet_conv_block_list): out = conv_block_i(out) if i == self.freeze_at: out.stop_gradient = True if i in self.return_idx: blocks.append(out) if i < self.num_stages - 1: out = self.downsample_list[i](out) return blocks @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/backbones/dla.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
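DarkNet-53 above stacks these Blocks with a DownSample between stages, halving resolution and doubling channels each time; with the default return_idx=[2, 3, 4] it emits C3/C4/C5 features at strides 8, 16 and 32 for a YOLOv3-style neck. A quick shape-check sketch, under the same PaddlePaddle/PYTHONPATH assumptions as above:

import paddle
from ppdet.modeling.backbones.darknet import DarkNet

model = DarkNet(depth=53, return_idx=[2, 3, 4])
feats = model({'image': paddle.randn([1, 3, 416, 416])})
print([f.shape for f in feats])
# [1, 256, 52, 52], [1, 512, 26, 26], [1, 1024, 13, 13]
print([spec.channels for spec in model.out_shape])  # [256, 512, 1024]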
import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.modeling.layers import ConvNormLayer from ..shape_spec import ShapeSpec DLA_cfg = {34: ([1, 1, 1, 2, 2, 1], [16, 32, 64, 128, 256, 512]), } class BasicBlock(nn.Layer): def __init__(self, ch_in, ch_out, stride=1): super(BasicBlock, self).__init__() self.conv1 = ConvNormLayer( ch_in, ch_out, filter_size=3, stride=stride, bias_on=False, norm_decay=None) self.conv2 = ConvNormLayer( ch_out, ch_out, filter_size=3, stride=1, bias_on=False, norm_decay=None) def forward(self, inputs, residual=None): if residual is None: residual = inputs out = self.conv1(inputs) out = F.relu(out) out = self.conv2(out) out = paddle.add(x=out, y=residual) out = F.relu(out) return out class Root(nn.Layer): def __init__(self, ch_in, ch_out, kernel_size, residual): super(Root, self).__init__() self.conv = ConvNormLayer( ch_in, ch_out, filter_size=1, stride=1, bias_on=False, norm_decay=None) self.residual = residual def forward(self, inputs): children = inputs out = self.conv(paddle.concat(inputs, axis=1)) if self.residual: out = paddle.add(x=out, y=children[0]) out = F.relu(out) return out class Tree(nn.Layer): def __init__(self, level, block, ch_in, ch_out, stride=1, level_root=False, root_dim=0, root_kernel_size=1, root_residual=False): super(Tree, self).__init__() if root_dim == 0: root_dim = 2 * ch_out if level_root: root_dim += ch_in if level == 1: self.tree1 = block(ch_in, ch_out, stride) self.tree2 = block(ch_out, ch_out, 1) else: self.tree1 = Tree( level - 1, block, ch_in, ch_out, stride, root_dim=0, root_kernel_size=root_kernel_size, root_residual=root_residual) self.tree2 = Tree( level - 1, block, ch_out, ch_out, 1, root_dim=root_dim + ch_out, root_kernel_size=root_kernel_size, root_residual=root_residual) if level == 1: self.root = Root(root_dim, ch_out, root_kernel_size, root_residual) self.level_root = level_root self.root_dim = root_dim self.downsample = None self.project = None self.level = level if stride > 1: self.downsample = nn.MaxPool2D(stride, stride=stride) if ch_in != ch_out: self.project = ConvNormLayer( ch_in, ch_out, filter_size=1, stride=1, bias_on=False, norm_decay=None) def forward(self, x, residual=None, children=None): children = [] if children is None else children bottom = self.downsample(x) if self.downsample else x residual = self.project(bottom) if self.project else bottom if self.level_root: children.append(bottom) x1 = self.tree1(x, residual) if self.level == 1: x2 = self.tree2(x1) x = self.root([x2, x1] + children) else: children.append(x1) x = self.tree2(x1, children=children) return x @register @serializable class DLA(nn.Layer): """ DLA, see https://arxiv.org/pdf/1707.06484.pdf Args: depth (int): DLA depth, only 34 is supported now. residual_root (bool): whether to use a residual layer in the root block pre_img (bool): add pre_img, only used in CenterTrack pre_hm (bool): add pre_hm, only used in CenterTrack """ def __init__(self, depth=34, residual_root=False, pre_img=False, pre_hm=False): super(DLA, self).__init__() assert depth == 34, 'Only DLA with depth 34 is supported now.'
if depth == 34: block = BasicBlock levels, channels = DLA_cfg[depth] self.channels = channels self.num_levels = len(levels) self.base_layer = nn.Sequential( ConvNormLayer( 3, channels[0], filter_size=7, stride=1, bias_on=False, norm_decay=None), nn.ReLU()) self.level0 = self._make_conv_level(channels[0], channels[0], levels[0]) self.level1 = self._make_conv_level( channels[0], channels[1], levels[1], stride=2) self.level2 = Tree( levels[2], block, channels[1], channels[2], 2, level_root=False, root_residual=residual_root) self.level3 = Tree( levels[3], block, channels[2], channels[3], 2, level_root=True, root_residual=residual_root) self.level4 = Tree( levels[4], block, channels[3], channels[4], 2, level_root=True, root_residual=residual_root) self.level5 = Tree( levels[5], block, channels[4], channels[5], 2, level_root=True, root_residual=residual_root) if pre_img: self.pre_img_layer = nn.Sequential( ConvNormLayer( 3, channels[0], filter_size=7, stride=1, bias_on=False, norm_decay=None), nn.ReLU()) if pre_hm: self.pre_hm_layer = nn.Sequential( ConvNormLayer( 1, channels[0], filter_size=7, stride=1, bias_on=False, norm_decay=None), nn.ReLU()) self.pre_img = pre_img self.pre_hm = pre_hm def _make_conv_level(self, ch_in, ch_out, conv_num, stride=1): modules = [] for i in range(conv_num): modules.extend([ ConvNormLayer( ch_in, ch_out, filter_size=3, stride=stride if i == 0 else 1, bias_on=False, norm_decay=None), nn.ReLU() ]) ch_in = ch_out return nn.Sequential(*modules) @property def out_shape(self): return [ ShapeSpec(channels=self.channels[i]) for i in range(self.num_levels) ] def forward(self, inputs): outs = [] feats = self.base_layer(inputs['image']) if self.pre_img and 'pre_image' in inputs and inputs[ 'pre_image'] is not None: feats = feats + self.pre_img_layer(inputs['pre_image']) if self.pre_hm and 'pre_hm' in inputs and inputs['pre_hm'] is not None: feats = feats + self.pre_hm_layer(inputs['pre_hm']) for i in range(self.num_levels): feats = getattr(self, 'level{}'.format(i))(feats) outs.append(feats) return outs ================================================ FILE: ppdet/modeling/backbones/esnet.py ================================================ # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
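The Tree modules above implement DLA-34's hierarchical aggregation: each stage is a small binary tree of BasicBlocks whose outputs are merged by a Root node (channel concat followed by a 1x1 ConvNormLayer), and forward() returns all six levels so CenterNet-style necks can aggregate them. A shape sketch under the same import assumptions as the earlier examples:

import paddle
from ppdet.modeling.backbones.dla import DLA

model = DLA(depth=34)
outs = model({'image': paddle.randn([1, 3, 512, 512])})
print([o.shape for o in outs])
# six levels: channels [16, 32, 64, 128, 256, 512] at strides [1, 2, 4, 8, 16, 32]
print([spec.channels for spec in model.out_shape])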
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm from paddle.nn.initializer import KaimingNormal from paddle.regularizer import L2Decay from ppdet.core.workspace import register, serializable from numbers import Integral from ..shape_spec import ShapeSpec from ppdet.modeling.ops import channel_shuffle from ppdet.modeling.backbones.shufflenet_v2 import ConvBNLayer __all__ = ['ESNet'] def make_divisible(v, divisor=16, min_value=None): if min_value is None: min_value = divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) if new_v < 0.9 * v: new_v += divisor return new_v class SEModule(nn.Layer): def __init__(self, channel, reduction=4): super(SEModule, self).__init__() self.avg_pool = AdaptiveAvgPool2D(1) self.conv1 = Conv2D( in_channels=channel, out_channels=channel // reduction, kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr(), bias_attr=ParamAttr()) self.conv2 = Conv2D( in_channels=channel // reduction, out_channels=channel, kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr(), bias_attr=ParamAttr()) def forward(self, inputs): outputs = self.avg_pool(inputs) outputs = self.conv1(outputs) outputs = F.relu(outputs) outputs = self.conv2(outputs) outputs = F.hardsigmoid(outputs) return paddle.multiply(x=inputs, y=outputs) class InvertedResidual(nn.Layer): def __init__(self, in_channels, mid_channels, out_channels, stride, act="relu"): super(InvertedResidual, self).__init__() self._conv_pw = ConvBNLayer( in_channels=in_channels // 2, out_channels=mid_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) self._conv_dw = ConvBNLayer( in_channels=mid_channels // 2, out_channels=mid_channels // 2, kernel_size=3, stride=stride, padding=1, groups=mid_channels // 2, act=None) self._se = SEModule(mid_channels) self._conv_linear = ConvBNLayer( in_channels=mid_channels, out_channels=out_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) def forward(self, inputs): x1, x2 = paddle.split( inputs, num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2], axis=1) x2 = self._conv_pw(x2) x3 = self._conv_dw(x2) x3 = paddle.concat([x2, x3], axis=1) x3 = self._se(x3) x3 = self._conv_linear(x3) out = paddle.concat([x1, x3], axis=1) return channel_shuffle(out, 2) class InvertedResidualDS(nn.Layer): def __init__(self, in_channels, mid_channels, out_channels, stride, act="relu"): super(InvertedResidualDS, self).__init__() # branch1 self._conv_dw_1 = ConvBNLayer( in_channels=in_channels, out_channels=in_channels, kernel_size=3, stride=stride, padding=1, groups=in_channels, act=None) self._conv_linear_1 = ConvBNLayer( in_channels=in_channels, out_channels=out_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) # branch2 self._conv_pw_2 = ConvBNLayer( in_channels=in_channels, out_channels=mid_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) self._conv_dw_2 = ConvBNLayer( in_channels=mid_channels // 2, out_channels=mid_channels // 2, kernel_size=3, stride=stride, padding=1, groups=mid_channels // 2, act=None) self._se = SEModule(mid_channels // 2) self._conv_linear_2 = ConvBNLayer( in_channels=mid_channels // 2, out_channels=out_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) self._conv_dw_mv1 = ConvBNLayer( in_channels=out_channels, out_channels=out_channels, 
kernel_size=3, stride=1, padding=1, groups=out_channels, act="hard_swish") self._conv_pw_mv1 = ConvBNLayer( in_channels=out_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0, groups=1, act="hard_swish") def forward(self, inputs): x1 = self._conv_dw_1(inputs) x1 = self._conv_linear_1(x1) x2 = self._conv_pw_2(inputs) x2 = self._conv_dw_2(x2) x2 = self._se(x2) x2 = self._conv_linear_2(x2) out = paddle.concat([x1, x2], axis=1) out = self._conv_dw_mv1(out) out = self._conv_pw_mv1(out) return out @register @serializable class ESNet(nn.Layer): def __init__(self, scale=1.0, act="hard_swish", feature_maps=[4, 11, 14], channel_ratio=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]): super(ESNet, self).__init__() self.scale = scale if isinstance(feature_maps, Integral): feature_maps = [feature_maps] self.feature_maps = feature_maps stage_repeats = [3, 7, 3] stage_out_channels = [ -1, 24, make_divisible(128 * scale), make_divisible(256 * scale), make_divisible(512 * scale), 1024 ] self._out_channels = [] self._feature_idx = 0 # 1. conv1 self._conv1 = ConvBNLayer( in_channels=3, out_channels=stage_out_channels[1], kernel_size=3, stride=2, padding=1, act=act) self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1) self._feature_idx += 1 # 2. bottleneck sequences self._block_list = [] arch_idx = 0 for stage_id, num_repeat in enumerate(stage_repeats): for i in range(num_repeat): channels_scales = channel_ratio[arch_idx] mid_c = make_divisible( int(stage_out_channels[stage_id + 2] * channels_scales), divisor=8) if i == 0: block = self.add_sublayer( name=str(stage_id + 2) + '_' + str(i + 1), sublayer=InvertedResidualDS( in_channels=stage_out_channels[stage_id + 1], mid_channels=mid_c, out_channels=stage_out_channels[stage_id + 2], stride=2, act=act)) else: block = self.add_sublayer( name=str(stage_id + 2) + '_' + str(i + 1), sublayer=InvertedResidual( in_channels=stage_out_channels[stage_id + 2], mid_channels=mid_c, out_channels=stage_out_channels[stage_id + 2], stride=1, act=act)) self._block_list.append(block) arch_idx += 1 self._feature_idx += 1 self._update_out_channels(stage_out_channels[stage_id + 2], self._feature_idx, self.feature_maps) def _update_out_channels(self, channel, feature_idx, feature_maps): if feature_idx in feature_maps: self._out_channels.append(channel) def forward(self, inputs): y = self._conv1(inputs['image']) y = self._max_pool(y) outs = [] for i, inv in enumerate(self._block_list): y = inv(y) if i + 2 in self.feature_maps: outs.append(y) return outs @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/backbones/focalnet.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
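ESNet above derives every stage width from make_divisible, which rounds a scaled channel count to the nearest multiple of divisor and bumps it up one step whenever plain rounding would lose more than 10% of the requested width. A quick illustration with hypothetical channel counts, assuming the repo is importable:

from ppdet.modeling.backbones.esnet import make_divisible

print(make_divisible(100))            # 96: nearest multiple of the default divisor 16
print(make_divisible(20))             # 32: rounding down to 16 would drop >10%, so bump up
print(make_divisible(50, divisor=8))  # 48: ESNet's bottleneck mid channels use divisor=8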
""" This code is based on https://github.com/microsoft/FocalNet/blob/main/classification/focalnet.py """ import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.modeling.shape_spec import ShapeSpec from ppdet.core.workspace import register, serializable from .transformer_utils import DropPath, Identity from .transformer_utils import add_parameter, to_2tuple from .transformer_utils import ones_, zeros_, trunc_normal_ from .swin_transformer import Mlp __all__ = ['FocalNet'] MODEL_cfg = { 'focalnet_T_224_1k_srf': dict( embed_dim=96, depths=[2, 2, 6, 2], focal_levels=[2, 2, 2, 2], focal_windows=[3, 3, 3, 3], drop_path_rate=0.2, use_conv_embed=False, use_postln=False, use_postln_in_modulation=False, use_layerscale=False, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_srf_pretrained.pdparams', ), 'focalnet_S_224_1k_srf': dict( embed_dim=96, depths=[2, 2, 18, 2], focal_levels=[2, 2, 2, 2], focal_windows=[3, 3, 3, 3], drop_path_rate=0.3, use_conv_embed=False, use_postln=False, use_postln_in_modulation=False, use_layerscale=False, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_srf_pretrained.pdparams', ), 'focalnet_B_224_1k_srf': dict( embed_dim=128, depths=[2, 2, 18, 2], focal_levels=[2, 2, 2, 2], focal_windows=[3, 3, 3, 3], drop_path_rate=0.5, use_conv_embed=False, use_postln=False, use_postln_in_modulation=False, use_layerscale=False, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_srf_pretrained.pdparams', ), 'focalnet_T_224_1k_lrf': dict( embed_dim=96, depths=[2, 2, 6, 2], focal_levels=[3, 3, 3, 3], focal_windows=[3, 3, 3, 3], drop_path_rate=0.2, use_conv_embed=False, use_postln=False, use_postln_in_modulation=False, use_layerscale=False, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_lrf_pretrained.pdparams', ), 'focalnet_S_224_1k_lrf': dict( embed_dim=96, depths=[2, 2, 18, 2], focal_levels=[3, 3, 3, 3], focal_windows=[3, 3, 3, 3], drop_path_rate=0.3, use_conv_embed=False, use_postln=False, use_postln_in_modulation=False, use_layerscale=False, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_lrf_pretrained.pdparams', ), 'focalnet_B_224_1k_lrf': dict( embed_dim=128, depths=[2, 2, 18, 2], focal_levels=[3, 3, 3, 3], focal_windows=[3, 3, 3, 3], drop_path_rate=0.5, use_conv_embed=False, use_postln=False, use_postln_in_modulation=False, use_layerscale=False, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_lrf_pretrained.pdparams', ), 'focalnet_L_384_22k_fl3': dict( embed_dim=192, depths=[2, 2, 18, 2], focal_levels=[3, 3, 3, 3], focal_windows=[5, 5, 5, 5], drop_path_rate=0.5, use_conv_embed=True, use_postln=True, use_postln_in_modulation=False, use_layerscale=True, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_pretrained.pdparams', ), 'focalnet_L_384_22k_fl4': dict( embed_dim=192, depths=[2, 2, 18, 2], focal_levels=[4, 4, 4, 4], focal_windows=[3, 3, 3, 3], drop_path_rate=0.5, use_conv_embed=True, use_postln=True, use_postln_in_modulation=False, use_layerscale=True, normalize_modulator=True, # pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_fl4_pretrained.pdparams', ), 'focalnet_XL_384_22k_fl3': dict( 
embed_dim=256, depths=[2, 2, 18, 2], focal_levels=[3, 3, 3, 3], focal_windows=[5, 5, 5, 5], drop_path_rate=0.5, use_conv_embed=True, use_postln=True, use_postln_in_modulation=False, use_layerscale=True, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_pretrained.pdparams', ), 'focalnet_XL_384_22k_fl4': dict( embed_dim=256, depths=[2, 2, 18, 2], focal_levels=[4, 4, 4, 4], focal_windows=[3, 3, 3, 3], drop_path_rate=0.5, use_conv_embed=True, use_postln=True, use_postln_in_modulation=False, use_layerscale=True, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_fl4_pretrained.pdparams', ), 'focalnet_H_224_22k_fl3': dict( embed_dim=352, depths=[2, 2, 18, 2], focal_levels=[3, 3, 3, 3], focal_windows=[3, 3, 3, 3], drop_path_rate=0.5, use_conv_embed=True, use_postln=True, use_postln_in_modulation=True, # use_layerscale=True, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_pretrained.pdparams', ), 'focalnet_H_224_22k_fl4': dict( embed_dim=352, depths=[2, 2, 18, 2], focal_levels=[4, 4, 4, 4], focal_windows=[3, 3, 3, 3], drop_path_rate=0.5, use_conv_embed=True, use_postln=True, use_postln_in_modulation=True, # use_layerscale=True, normalize_modulator=False, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_fl4_pretrained.pdparams', ), } class FocalModulation(nn.Layer): """ Args: dim (int): Number of input channels. proj_drop (float, optional): Dropout ratio of output. Default: 0.0 focal_level (int): Number of focal levels focal_window (int): Focal window size at focal level 1 focal_factor (int): Step to increase the focal window. Default: 2 use_postln_in_modulation (bool): Whether use post-modulation layernorm normalize_modulator (bool): Whether use normalize in modulator """ def __init__(self, dim, proj_drop=0., focal_level=2, focal_window=7, focal_factor=2, use_postln_in_modulation=False, normalize_modulator=False): super().__init__() self.dim = dim # specific args for focalv3 self.focal_level = focal_level self.focal_window = focal_window self.focal_factor = focal_factor self.use_postln_in_modulation = use_postln_in_modulation self.normalize_modulator = normalize_modulator self.f = nn.Linear( dim, 2 * dim + (self.focal_level + 1), bias_attr=True) self.h = nn.Conv2D( dim, dim, kernel_size=1, stride=1, padding=0, groups=1, bias_attr=True) self.act = nn.GELU() self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) self.focal_layers = nn.LayerList() if self.use_postln_in_modulation: self.ln = nn.LayerNorm(dim) for k in range(self.focal_level): kernel_size = self.focal_factor * k + self.focal_window self.focal_layers.append( nn.Sequential( nn.Conv2D( dim, dim, kernel_size=kernel_size, stride=1, groups=dim, padding=kernel_size // 2, bias_attr=False), nn.GELU())) def forward(self, x): """ Forward function. 
Args: x: input features with shape of (B, H, W, C) """ _, _, _, C = x.shape x = self.f(x) x = x.transpose([0, 3, 1, 2]) q, ctx, gates = paddle.split(x, (C, C, self.focal_level + 1), 1) ctx_all = 0 for l in range(self.focal_level): ctx = self.focal_layers[l](ctx) ctx_all = ctx_all + ctx * gates[:, l:l + 1] ctx_global = self.act(ctx.mean(2, keepdim=True).mean(3, keepdim=True)) ctx_all = ctx_all + ctx_global * gates[:, self.focal_level:] if self.normalize_modulator: ctx_all = ctx_all / (self.focal_level + 1) x_out = q * self.h(ctx_all) x_out = x_out.transpose([0, 2, 3, 1]) if self.use_postln_in_modulation: x_out = self.ln(x_out) x_out = self.proj(x_out) x_out = self.proj_drop(x_out) return x_out class FocalModulationBlock(nn.Layer): """ Focal Modulation Block. Args: dim (int): Number of input channels. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. drop (float, optional): Dropout rate. Default: 0.0 drop_path (float, optional): Stochastic depth rate. Default: 0.0 act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm focal_level (int): number of focal levels focal_window (int): focal kernel size at level 1 use_postln (bool): Whether use layernorm after modulation. Default: False. use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False. normalize_modulator (bool): Whether use normalize in modulator use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False layerscale_value (float): Value for layer scale. Default: 1e-4 """ def __init__(self, dim, mlp_ratio=4., drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, focal_level=2, focal_window=9, use_postln=False, use_postln_in_modulation=False, normalize_modulator=False, use_layerscale=False, layerscale_value=1e-4): super().__init__() self.dim = dim self.mlp_ratio = mlp_ratio self.focal_window = focal_window self.focal_level = focal_level self.use_postln = use_postln self.use_layerscale = use_layerscale self.norm1 = norm_layer(dim) self.modulation = FocalModulation( dim, proj_drop=drop, focal_level=self.focal_level, focal_window=self.focal_window, use_postln_in_modulation=use_postln_in_modulation, normalize_modulator=normalize_modulator) self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) self.H = None self.W = None self.gamma_1 = 1.0 self.gamma_2 = 1.0 if self.use_layerscale: self.gamma_1 = add_parameter(self, layerscale_value * paddle.ones([dim])) self.gamma_2 = add_parameter(self, layerscale_value * paddle.ones([dim])) def forward(self, x): """ Args: x: Input feature, tensor size (B, H*W, C). """ B, L, C = x.shape H, W = self.H, self.W assert L == H * W, "input feature has wrong size" shortcut = x if not self.use_postln: x = self.norm1(x) x = x.reshape([-1, H, W, C]) # FM x = self.modulation(x).reshape([-1, H * W, C]) if self.use_postln: x = self.norm1(x) # FFN x = shortcut + self.drop_path(self.gamma_1 * x) if self.use_postln: x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x))) else: x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) return x class BasicLayer(nn.Layer): """ A basic focal modulation layer for one stage. Args: dim (int): Number of feature channels depth (int): Depths of this stage. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. 
drop (float, optional): Dropout rate. Default: 0.0 drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None focal_level (int): Number of focal levels focal_window (int): Focal window size at focal level 1 use_conv_embed (bool): Whether use overlapped convolution for patch embedding use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False layerscale_value (float): Value of layerscale use_postln (bool): Whether use layernorm after modulation. Default: False. use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False. normalize_modulator (bool): Whether use normalize in modulator use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. """ def __init__(self, dim, depth, mlp_ratio=4., drop=0., drop_path=0., norm_layer=nn.LayerNorm, downsample=None, focal_level=2, focal_window=9, use_conv_embed=False, use_layerscale=False, layerscale_value=1e-4, use_postln=False, use_postln_in_modulation=False, normalize_modulator=False, use_checkpoint=False): super().__init__() self.depth = depth self.use_checkpoint = use_checkpoint # build blocks self.blocks = nn.LayerList([ FocalModulationBlock( dim=dim, mlp_ratio=mlp_ratio, drop=drop, drop_path=drop_path[i] if isinstance(drop_path, np.ndarray) else drop_path, act_layer=nn.GELU, norm_layer=norm_layer, focal_level=focal_level, focal_window=focal_window, use_postln=use_postln, use_postln_in_modulation=use_postln_in_modulation, normalize_modulator=normalize_modulator, use_layerscale=use_layerscale, layerscale_value=layerscale_value) for i in range(depth) ]) # patch merging layer if downsample is not None: self.downsample = downsample( patch_size=2, in_chans=dim, embed_dim=2 * dim, use_conv_embed=use_conv_embed, norm_layer=norm_layer, is_stem=False) else: self.downsample = None def forward(self, x, H, W): """ Args: x: Input feature, tensor size (B, H*W, C). """ for blk in self.blocks: blk.H, blk.W = H, W x = blk(x) if self.downsample is not None: x_reshaped = x.transpose([0, 2, 1]).reshape( [x.shape[0], x.shape[-1], H, W]) x_down = self.downsample(x_reshaped) x_down = x_down.flatten(2).transpose([0, 2, 1]) Wh, Ww = (H + 1) // 2, (W + 1) // 2 return x, H, W, x_down, Wh, Ww else: return x, H, W, x, H, W class PatchEmbed(nn.Layer): """ Image to Patch Embedding Args: patch_size (int): Patch token size. Default: 4. in_chans (int): Number of input image channels. Default: 3. embed_dim (int): Number of linear projection output channels. Default: 96. norm_layer (nn.Layer, optional): Normalization layer. Default: None use_conv_embed (bool): Whether use overlapped convolution for patch embedding. Default: False is_stem (bool): Is the stem block or not. 
""" def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None, use_conv_embed=False, is_stem=False): super().__init__() patch_size = to_2tuple(patch_size) self.patch_size = patch_size self.in_chans = in_chans self.embed_dim = embed_dim if use_conv_embed: # if we choose to use conv embedding, then we treat the stem and non-stem differently if is_stem: kernel_size = 7 padding = 2 stride = 4 else: kernel_size = 3 padding = 1 stride = 2 self.proj = nn.Conv2D( in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding) else: self.proj = nn.Conv2D( in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) if norm_layer is not None: self.norm = norm_layer(embed_dim) else: self.norm = None def forward(self, x): _, _, H, W = x.shape if W % self.patch_size[1] != 0: # for 3D tensor: [pad_left, pad_right] # for 4D tensor: [pad_left, pad_right, pad_top, pad_bottom] x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0]) W += W % self.patch_size[1] if H % self.patch_size[0] != 0: x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]]) H += H % self.patch_size[0] x = self.proj(x) if self.norm is not None: _, _, Wh, Ww = x.shape x = x.flatten(2).transpose([0, 2, 1]) x = self.norm(x) x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww]) return x @register @serializable class FocalNet(nn.Layer): """ FocalNet backbone Args: arch (str): Architecture of FocalNet out_indices (Sequence[int]): Output from which stages. frozen_stages (int): Stages to be frozen (stop grad and set eval mode). -1 means not freezing any parameters. patch_size (int | tuple(int)): Patch size. Default: 4. in_chans (int): Number of input image channels. Default: 3. embed_dim (int): Number of linear projection output channels. Default: 96. depths (tuple[int]): Depths of each FocalNet Transformer stage. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. drop_rate (float): Dropout rate. drop_path_rate (float): Stochastic depth rate. Default: 0.2. norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. patch_norm (bool): If True, add normalization after patch embedding. Default: True. focal_levels (Sequence[int]): Number of focal levels at four stages focal_windows (Sequence[int]): Focal window sizes at first focal level at four stages use_conv_embed (bool): Whether use overlapped convolution for patch embedding use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False layerscale_value (float): Value of layerscale use_postln (bool): Whether use layernorm after modulation. Default: False. use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False. normalize_modulator (bool): Whether use normalize in modulator use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
""" def __init__( self, arch='focalnet_T_224_1k_srf', out_indices=(0, 1, 2, 3), frozen_stages=-1, patch_size=4, in_chans=3, embed_dim=96, depths=[2, 2, 6, 2], mlp_ratio=4., drop_rate=0., drop_path_rate=0.2, # 0.5 better for large+ models norm_layer=nn.LayerNorm, patch_norm=True, focal_levels=[2, 2, 2, 2], focal_windows=[3, 3, 3, 3], use_conv_embed=False, use_layerscale=False, layerscale_value=1e-4, use_postln=False, use_postln_in_modulation=False, normalize_modulator=False, use_checkpoint=False, pretrained=None): super(FocalNet, self).__init__() assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch) embed_dim = MODEL_cfg[arch]['embed_dim'] depths = MODEL_cfg[arch]['depths'] drop_path_rate = MODEL_cfg[arch]['drop_path_rate'] focal_levels = MODEL_cfg[arch]['focal_levels'] focal_windows = MODEL_cfg[arch]['focal_windows'] use_conv_embed = MODEL_cfg[arch]['use_conv_embed'] use_layerscale = MODEL_cfg[arch]['use_layerscale'] use_postln = MODEL_cfg[arch]['use_postln'] use_postln_in_modulation = MODEL_cfg[arch]['use_postln_in_modulation'] normalize_modulator = MODEL_cfg[arch]['normalize_modulator'] if pretrained is None: pretrained = MODEL_cfg[arch]['pretrained'] self.out_indices = out_indices self.frozen_stages = frozen_stages self.num_layers = len(depths) self.patch_norm = patch_norm # split image into non-overlapping patches self.patch_embed = PatchEmbed( patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, norm_layer=norm_layer if self.patch_norm else None, use_conv_embed=use_conv_embed, is_stem=True) self.pos_drop = nn.Dropout(p=drop_rate) # stochastic depth decay rule dpr = np.linspace(0, drop_path_rate, sum(depths)) # build layers self.layers = nn.LayerList() for i_layer in range(self.num_layers): layer = BasicLayer( dim=int(embed_dim * 2**i_layer), depth=depths[i_layer], mlp_ratio=mlp_ratio, drop=drop_rate, drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], norm_layer=norm_layer, downsample=PatchEmbed if (i_layer < self.num_layers - 1) else None, focal_level=focal_levels[i_layer], focal_window=focal_windows[i_layer], use_conv_embed=use_conv_embed, use_layerscale=use_layerscale, layerscale_value=layerscale_value, use_postln=use_postln, use_postln_in_modulation=use_postln_in_modulation, normalize_modulator=normalize_modulator, use_checkpoint=use_checkpoint) self.layers.append(layer) num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] self.num_features = num_features # add a norm layer for each output for i_layer in out_indices: layer = norm_layer(num_features[i_layer]) layer_name = f'norm{i_layer}' self.add_sublayer(layer_name, layer) self.apply(self._init_weights) self._freeze_stages() if pretrained: if 'http' in pretrained: #URL path = paddle.utils.download.get_weights_path_from_url( pretrained) else: #model in local path path = pretrained self.set_state_dict(paddle.load(path)) def _freeze_stages(self): if self.frozen_stages >= 0: self.patch_embed.eval() for param in self.patch_embed.parameters(): param.stop_gradient = True if self.frozen_stages >= 2: self.pos_drop.eval() for i in range(0, self.frozen_stages - 1): m = self.layers[i] m.eval() for param in m.parameters(): param.stop_gradient = True def _init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight) if isinstance(m, nn.Linear) and m.bias is not None: zeros_(m.bias) elif isinstance(m, nn.LayerNorm): zeros_(m.bias) ones_(m.weight) def forward(self, x): x = self.patch_embed(x['image']) B, _, Wh, Ww = x.shape x = x.flatten(2).transpose([0, 2, 1]) x = 
self.pos_drop(x) outs = [] for i in range(self.num_layers): layer = self.layers[i] x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) if i in self.out_indices: norm_layer = getattr(self, f'norm{i}') x_out = norm_layer(x_out) out = x_out.reshape([-1, H, W, self.num_features[i]]).transpose( (0, 3, 1, 2)) outs.append(out) return outs @property def out_shape(self): out_strides = [4, 8, 16, 32] return [ ShapeSpec( channels=self.num_features[i], stride=out_strides[i]) for i in self.out_indices ] ================================================ FILE: ppdet/modeling/backbones/ghostnet.py ================================================ # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import math import paddle from paddle import ParamAttr import paddle.nn as nn import paddle.nn.functional as F from paddle.nn import AdaptiveAvgPool2D, Linear from paddle.nn.initializer import Uniform from ppdet.core.workspace import register, serializable from numbers import Integral from ..shape_spec import ShapeSpec from .mobilenet_v3 import make_divisible, ConvBNLayer __all__ = ['GhostNet'] class ExtraBlockDW(nn.Layer): def __init__(self, in_c, ch_1, ch_2, stride, lr_mult, conv_decay=0., norm_type='bn', norm_decay=0., freeze_norm=False, name=None): super(ExtraBlockDW, self).__init__() self.pointwise_conv = ConvBNLayer( in_c=in_c, out_c=ch_1, filter_size=1, stride=1, padding=0, act='relu6', lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_extra1") self.depthwise_conv = ConvBNLayer( in_c=ch_1, out_c=ch_2, filter_size=3, stride=stride, padding=1, # num_groups=int(ch_1), act='relu6', lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_extra2_dw") self.normal_conv = ConvBNLayer( in_c=ch_2, out_c=ch_2, filter_size=1, stride=1, padding=0, act='relu6', lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_extra2_sep") def forward(self, inputs): x = self.pointwise_conv(inputs) x = self.depthwise_conv(x) x = self.normal_conv(x) return x class SEBlock(nn.Layer): def __init__(self, num_channels, lr_mult, reduction_ratio=4, name=None): super(SEBlock, self).__init__() self.pool2d_gap = AdaptiveAvgPool2D(1) self._num_channels = num_channels stdv = 1.0 / math.sqrt(num_channels * 1.0) med_ch = num_channels // reduction_ratio self.squeeze = Linear( num_channels, med_ch, weight_attr=ParamAttr( learning_rate=lr_mult, initializer=Uniform(-stdv, stdv)), bias_attr=ParamAttr(learning_rate=lr_mult)) stdv = 1.0 / math.sqrt(med_ch * 1.0) self.excitation = Linear( med_ch, num_channels, weight_attr=ParamAttr( learning_rate=lr_mult, initializer=Uniform(-stdv, stdv)), bias_attr=ParamAttr(learning_rate=lr_mult)) def forward(self, inputs): pool = self.pool2d_gap(inputs) pool = paddle.squeeze(pool, axis=[2, 3]) squeeze = self.squeeze(pool) squeeze = F.relu(squeeze) excitation = 
self.excitation(squeeze) excitation = paddle.clip(x=excitation, min=0, max=1) excitation = paddle.unsqueeze(excitation, axis=[2, 3]) out = paddle.multiply(inputs, excitation) return out class GhostModule(nn.Layer): def __init__(self, in_channels, output_channels, kernel_size=1, ratio=2, dw_size=3, stride=1, relu=True, lr_mult=1., conv_decay=0., norm_type='bn', norm_decay=0., freeze_norm=False, name=None): super(GhostModule, self).__init__() init_channels = int(math.ceil(output_channels / ratio)) new_channels = int(init_channels * (ratio - 1)) self.primary_conv = ConvBNLayer( in_c=in_channels, out_c=init_channels, filter_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2), num_groups=1, act="relu" if relu else None, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_primary_conv") self.cheap_operation = ConvBNLayer( in_c=init_channels, out_c=new_channels, filter_size=dw_size, stride=1, padding=int((dw_size - 1) // 2), num_groups=init_channels, act="relu" if relu else None, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_cheap_operation") def forward(self, inputs): x = self.primary_conv(inputs) y = self.cheap_operation(x) out = paddle.concat([x, y], axis=1) return out class GhostBottleneck(nn.Layer): def __init__(self, in_channels, hidden_dim, output_channels, kernel_size, stride, use_se, lr_mult, conv_decay=0., norm_type='bn', norm_decay=0., freeze_norm=False, return_list=False, name=None): super(GhostBottleneck, self).__init__() self._stride = stride self._use_se = use_se self._num_channels = in_channels self._output_channels = output_channels self.return_list = return_list self.ghost_module_1 = GhostModule( in_channels=in_channels, output_channels=hidden_dim, kernel_size=1, stride=1, relu=True, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_ghost_module_1") if stride == 2: self.depthwise_conv = ConvBNLayer( in_c=hidden_dim, out_c=hidden_dim, filter_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2), num_groups=hidden_dim, act=None, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. ) if use_se: self.se_block = SEBlock(hidden_dim, lr_mult, name=name + "_se") self.ghost_module_2 = GhostModule( in_channels=hidden_dim, output_channels=output_channels, kernel_size=1, relu=False, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_ghost_module_2") if stride != 1 or in_channels != output_channels: self.shortcut_depthwise = ConvBNLayer( in_c=in_channels, out_c=in_channels, filter_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2), num_groups=in_channels, act=None, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_shortcut_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. 
) self.shortcut_conv = ConvBNLayer( in_c=in_channels, out_c=output_channels, filter_size=1, stride=1, padding=0, num_groups=1, act=None, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_shortcut_conv") def forward(self, inputs): y = self.ghost_module_1(inputs) x = y if self._stride == 2: x = self.depthwise_conv(x) if self._use_se: x = self.se_block(x) x = self.ghost_module_2(x) if self._stride == 1 and self._num_channels == self._output_channels: shortcut = inputs else: shortcut = self.shortcut_depthwise(inputs) shortcut = self.shortcut_conv(shortcut) x = paddle.add(x=x, y=shortcut) if self.return_list: return [y, x] else: return x @register @serializable class GhostNet(nn.Layer): __shared__ = ['norm_type'] def __init__( self, scale=1.3, feature_maps=[6, 12, 15], with_extra_blocks=False, extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]], lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], conv_decay=0., norm_type='bn', norm_decay=0.0, freeze_norm=False): super(GhostNet, self).__init__() if isinstance(feature_maps, Integral): feature_maps = [feature_maps] if norm_type == 'sync_bn' and freeze_norm: raise ValueError( "The norm_type should not be sync_bn when freeze_norm is True") self.feature_maps = feature_maps self.with_extra_blocks = with_extra_blocks self.extra_block_filters = extra_block_filters inplanes = 16 self.cfgs = [ # k, t, c, SE, s [3, 16, 16, 0, 1], [3, 48, 24, 0, 2], [3, 72, 24, 0, 1], [5, 72, 40, 1, 2], [5, 120, 40, 1, 1], [3, 240, 80, 0, 2], [3, 200, 80, 0, 1], [3, 184, 80, 0, 1], [3, 184, 80, 0, 1], [3, 480, 112, 1, 1], [3, 672, 112, 1, 1], [5, 672, 160, 1, 2], # SSDLite output [5, 960, 160, 0, 1], [5, 960, 160, 1, 1], [5, 960, 160, 0, 1], [5, 960, 160, 1, 1] ] self.scale = scale conv1_out_ch = int(make_divisible(inplanes * self.scale, 4)) self.conv1 = ConvBNLayer( in_c=3, out_c=conv1_out_ch, filter_size=3, stride=2, padding=1, num_groups=1, act="relu", lr_mult=1., conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name="conv1") # build inverted residual blocks self._out_channels = [] self.ghost_bottleneck_list = [] idx = 0 inplanes = conv1_out_ch for k, exp_size, c, use_se, s in self.cfgs: lr_idx = min(idx // 3, len(lr_mult_list) - 1) lr_mult = lr_mult_list[lr_idx] # for SSD/SSDLite, first head input is after ResidualUnit expand_conv return_list = self.with_extra_blocks and idx + 2 in self.feature_maps ghost_bottleneck = self.add_sublayer( "_ghostbottleneck_" + str(idx), sublayer=GhostBottleneck( in_channels=inplanes, hidden_dim=int(make_divisible(exp_size * self.scale, 4)), output_channels=int(make_divisible(c * self.scale, 4)), kernel_size=k, stride=s, use_se=use_se, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, return_list=return_list, name="_ghostbottleneck_" + str(idx))) self.ghost_bottleneck_list.append(ghost_bottleneck) inplanes = int(make_divisible(c * self.scale, 4)) idx += 1 self._update_out_channels( int(make_divisible(exp_size * self.scale, 4)) if return_list else inplanes, idx + 1, feature_maps) if self.with_extra_blocks: self.extra_block_list = [] extra_out_c = int(make_divisible(self.scale * self.cfgs[-1][1], 4)) lr_idx = min(idx // 3, len(lr_mult_list) - 1) lr_mult = lr_mult_list[lr_idx] conv_extra = self.add_sublayer( "conv" + str(idx + 2), sublayer=ConvBNLayer( in_c=inplanes, out_c=extra_out_c, filter_size=1, stride=1, padding=0, num_groups=1, act="relu6", lr_mult=lr_mult, 
conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name="conv" + str(idx + 2))) self.extra_block_list.append(conv_extra) idx += 1 self._update_out_channels(extra_out_c, idx + 1, feature_maps) for j, block_filter in enumerate(self.extra_block_filters): in_c = extra_out_c if j == 0 else self.extra_block_filters[j - 1][1] conv_extra = self.add_sublayer( "conv" + str(idx + 2), sublayer=ExtraBlockDW( in_c, block_filter[0], block_filter[1], stride=2, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name='conv' + str(idx + 2))) self.extra_block_list.append(conv_extra) idx += 1 self._update_out_channels(block_filter[1], idx + 1, feature_maps) def _update_out_channels(self, channel, feature_idx, feature_maps): if feature_idx in feature_maps: self._out_channels.append(channel) def forward(self, inputs): x = self.conv1(inputs['image']) outs = [] for idx, ghost_bottleneck in enumerate(self.ghost_bottleneck_list): x = ghost_bottleneck(x) if idx + 2 in self.feature_maps: if isinstance(x, list): outs.append(x[0]) x = x[1] else: outs.append(x) if not self.with_extra_blocks: return outs for i, block in enumerate(self.extra_block_list): idx = i + len(self.ghost_bottleneck_list) x = block(x) if idx + 2 in self.feature_maps: outs.append(x) return outs @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/backbones/hardnet.py ================================================ # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
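The GhostModule in ghostnet.py above implements the core GhostNet idea: spend a full convolution on only about half of the output channels (primary_conv) and synthesize the remaining "ghost" channels with a cheap depthwise convolution (cheap_operation), then concatenate both halves. A minimal shape sketch — the sizes and the 'demo' name are illustrative, assuming the repo is importable:

import paddle
from ppdet.modeling.backbones.ghostnet import GhostModule

gm = GhostModule(in_channels=16, output_channels=32, name='demo')
y = gm(paddle.randn([1, 16, 56, 56]))
print(y.shape)  # [1, 32, 56, 56]: 16 primary channels + 16 depthwise "ghost" channels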
import paddle import paddle.nn as nn from ppdet.core.workspace import register from ..shape_spec import ShapeSpec __all__ = ['HarDNet'] def ConvLayer(in_channels, out_channels, kernel_size=3, stride=1, bias_attr=False): layer = nn.Sequential( ('conv', nn.Conv2D( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, groups=1, bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels)), ('relu', nn.ReLU6())) return layer def DWConvLayer(in_channels, out_channels, kernel_size=3, stride=1, bias_attr=False): layer = nn.Sequential( ('dwconv', nn.Conv2D( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=1, groups=out_channels, bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels))) return layer def CombConvLayer(in_channels, out_channels, kernel_size=1, stride=1): layer = nn.Sequential( ('layer1', ConvLayer( in_channels, out_channels, kernel_size=kernel_size)), ('layer2', DWConvLayer( out_channels, out_channels, stride=stride))) return layer class HarDBlock(nn.Layer): def __init__(self, in_channels, growth_rate, grmul, n_layers, keepBase=False, residual_out=False, dwconv=False): super().__init__() self.keepBase = keepBase self.links = [] layers_ = [] self.out_channels = 0 for i in range(n_layers): outch, inch, link = self.get_link(i + 1, in_channels, growth_rate, grmul) self.links.append(link) if dwconv: layers_.append(CombConvLayer(inch, outch)) else: layers_.append(ConvLayer(inch, outch)) if (i % 2 == 0) or (i == n_layers - 1): self.out_channels += outch self.layers = nn.LayerList(layers_) def get_out_ch(self): return self.out_channels def get_link(self, layer, base_ch, growth_rate, grmul): if layer == 0: return base_ch, 0, [] out_channels = growth_rate link = [] for i in range(10): dv = 2**i if layer % dv == 0: k = layer - dv link.append(k) if i > 0: out_channels *= grmul out_channels = int(int(out_channels + 1) / 2) * 2 in_channels = 0 for i in link: ch, _, _ = self.get_link(i, base_ch, growth_rate, grmul) in_channels += ch return out_channels, in_channels, link def forward(self, x): layers_ = [x] for layer in range(len(self.layers)): link = self.links[layer] tin = [] for i in link: tin.append(layers_[i]) if len(tin) > 1: x = paddle.concat(tin, 1) else: x = tin[0] out = self.layers[layer](x) layers_.append(out) t = len(layers_) out_ = [] for i in range(t): if (i == 0 and self.keepBase) or (i == t - 1) or (i % 2 == 1): out_.append(layers_[i]) out = paddle.concat(out_, 1) return out @register class HarDNet(nn.Layer): def __init__(self, depth_wise=False, return_idx=[1, 3, 8, 13], arch=85): super(HarDNet, self).__init__() assert arch in [68, 85], "HarDNet-{} is not supported.".format(arch) if arch == 85: first_ch = [48, 96] second_kernel = 3 ch_list = [192, 256, 320, 480, 720] grmul = 1.7 gr = [24, 24, 28, 36, 48] n_layers = [8, 16, 16, 16, 16] elif arch == 68: first_ch = [32, 64] second_kernel = 3 ch_list = [128, 256, 320, 640] grmul = 1.7 gr = [14, 16, 20, 40] n_layers = [8, 16, 16, 16] else: raise ValueError("HarDNet-{} is not supported.".format(arch)) self.return_idx = return_idx self._out_channels = [96, 214, 458, 784] avg_pool = True if depth_wise: second_kernel = 1 avg_pool = False blks = len(n_layers) self.base = nn.LayerList([]) # First Layer: Standard Conv3x3, Stride=2 self.base.append( ConvLayer( in_channels=3, out_channels=first_ch[0], kernel_size=3, stride=2, bias_attr=False)) # Second Layer self.base.append( ConvLayer( first_ch[0], first_ch[1], kernel_size=second_kernel)) # Avgpooling or DWConv3x3 downsampling 
if avg_pool: self.base.append(nn.AvgPool2D(kernel_size=3, stride=2, padding=1)) else: self.base.append(DWConvLayer(first_ch[1], first_ch[1], stride=2)) # Build all HarDNet blocks ch = first_ch[1] for i in range(blks): blk = HarDBlock(ch, gr[i], grmul, n_layers[i], dwconv=depth_wise) ch = blk.out_channels self.base.append(blk) if i != blks - 1: self.base.append(ConvLayer(ch, ch_list[i], kernel_size=1)) ch = ch_list[i] if i == 0: self.base.append( nn.AvgPool2D( kernel_size=2, stride=2, ceil_mode=True)) elif i != blks - 1 and i != 1 and i != 3: self.base.append(nn.AvgPool2D(kernel_size=2, stride=2)) def forward(self, inputs): x = inputs['image'] outs = [] for i, layer in enumerate(self.base): x = layer(x) if i in self.return_idx: outs.append(x) return outs @property def out_shape(self): return [ShapeSpec(channels=self._out_channels[i]) for i in range(4)] ================================================ FILE: ppdet/modeling/backbones/hgnet_v2.py ================================================ # copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import KaimingNormal, Constant from paddle.nn import Conv2D, BatchNorm2D, ReLU, AdaptiveAvgPool2D, MaxPool2D from paddle.regularizer import L2Decay from paddle import ParamAttr import copy from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec __all__ = ['PPHGNetV2'] kaiming_normal_ = KaimingNormal() zeros_ = Constant(value=0.) ones_ = Constant(value=1.) 
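# LearnableAffineBlock (below) learns a single scalar scale and a single
# scalar bias, applied as y = scale * x + bias after the activation in
# ConvBNAct when use_lab=True. A minimal sketch of the op it performs
# (illustrative plain paddle, not the class itself):
#
#     x = paddle.rand([1, 8, 4, 4])
#     y = paddle.to_tensor(1.0) * x + paddle.to_tensor(0.0)  # identity at init
#
# Both parameters train at lr_mult * lab_lr, i.e. 1% of the layer's learning
# rate by default, so the learned correction stays close to identity.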
class LearnableAffineBlock(nn.Layer): def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, lab_lr=0.01): super().__init__() self.scale = self.create_parameter( shape=[1, ], default_initializer=Constant(value=scale_value), attr=ParamAttr(learning_rate=lr_mult * lab_lr)) self.add_parameter("scale", self.scale) self.bias = self.create_parameter( shape=[1, ], default_initializer=Constant(value=bias_value), attr=ParamAttr(learning_rate=lr_mult * lab_lr)) self.add_parameter("bias", self.bias) def forward(self, x): return self.scale * x + self.bias class ConvBNAct(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, groups=1, use_act=True, use_lab=False, lr_mult=1.0): super().__init__() self.use_act = use_act self.use_lab = use_lab self.conv = Conv2D( in_channels, out_channels, kernel_size, stride, padding=padding if isinstance(padding, str) else (kernel_size - 1) // 2, groups=groups, weight_attr=ParamAttr(learning_rate=lr_mult), bias_attr=False) self.bn = BatchNorm2D( out_channels, weight_attr=ParamAttr( regularizer=L2Decay(0.0), learning_rate=lr_mult), bias_attr=ParamAttr( regularizer=L2Decay(0.0), learning_rate=lr_mult)) if self.use_act: self.act = ReLU() if self.use_lab: self.lab = LearnableAffineBlock(lr_mult=lr_mult) def forward(self, x): x = self.conv(x) x = self.bn(x) if self.use_act: x = self.act(x) if self.use_lab: x = self.lab(x) return x class LightConvBNAct(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size, stride, groups=1, use_lab=False, lr_mult=1.0): super().__init__() self.conv1 = ConvBNAct( in_channels=in_channels, out_channels=out_channels, kernel_size=1, use_act=False, use_lab=use_lab, lr_mult=lr_mult) self.conv2 = ConvBNAct( in_channels=out_channels, out_channels=out_channels, kernel_size=kernel_size, groups=out_channels, use_act=True, use_lab=use_lab, lr_mult=lr_mult) def forward(self, x): x = self.conv1(x) x = self.conv2(x) return x class StemBlock(nn.Layer): def __init__(self, in_channels, mid_channels, out_channels, use_lab=False, lr_mult=1.0): super().__init__() self.stem1 = ConvBNAct( in_channels=in_channels, out_channels=mid_channels, kernel_size=3, stride=2, use_lab=use_lab, lr_mult=lr_mult) self.stem2a = ConvBNAct( in_channels=mid_channels, out_channels=mid_channels // 2, kernel_size=2, stride=1, padding="SAME", use_lab=use_lab, lr_mult=lr_mult) self.stem2b = ConvBNAct( in_channels=mid_channels // 2, out_channels=mid_channels, kernel_size=2, stride=1, padding="SAME", use_lab=use_lab, lr_mult=lr_mult) self.stem3 = ConvBNAct( in_channels=mid_channels * 2, out_channels=mid_channels, kernel_size=3, stride=2, use_lab=use_lab, lr_mult=lr_mult) self.stem4 = ConvBNAct( in_channels=mid_channels, out_channels=out_channels, kernel_size=1, stride=1, use_lab=use_lab, lr_mult=lr_mult) self.pool = nn.MaxPool2D( kernel_size=2, stride=1, ceil_mode=True, padding="SAME") def forward(self, x): x = self.stem1(x) x2 = self.stem2a(x) x2 = self.stem2b(x2) x1 = self.pool(x) x = paddle.concat([x1, x2], 1) x = self.stem3(x) x = self.stem4(x) return x class HG_Block(nn.Layer): def __init__(self, in_channels, mid_channels, out_channels, kernel_size=3, layer_num=6, identity=False, light_block=True, use_lab=False, lr_mult=1.0): super().__init__() self.identity = identity self.layers = nn.LayerList() block_type = "LightConvBNAct" if light_block else "ConvBNAct" for i in range(layer_num): self.layers.append( eval(block_type)(in_channels=in_channels if i == 0 else mid_channels, out_channels=mid_channels, stride=1, 
kernel_size=kernel_size, use_lab=use_lab, lr_mult=lr_mult)) # feature aggregation total_channels = in_channels + layer_num * mid_channels self.aggregation_squeeze_conv = ConvBNAct( in_channels=total_channels, out_channels=out_channels // 2, kernel_size=1, stride=1, use_lab=use_lab, lr_mult=lr_mult) self.aggregation_excitation_conv = ConvBNAct( in_channels=out_channels // 2, out_channels=out_channels, kernel_size=1, stride=1, use_lab=use_lab, lr_mult=lr_mult) def forward(self, x): identity = x output = [] output.append(x) for layer in self.layers: x = layer(x) output.append(x) x = paddle.concat(output, axis=1) x = self.aggregation_squeeze_conv(x) x = self.aggregation_excitation_conv(x) if self.identity: x += identity return x class HG_Stage(nn.Layer): def __init__(self, in_channels, mid_channels, out_channels, block_num, layer_num=6, downsample=True, light_block=True, kernel_size=3, use_lab=False, lr_mult=1.0): super().__init__() self.downsample = downsample if downsample: self.downsample = ConvBNAct( in_channels=in_channels, out_channels=in_channels, kernel_size=3, stride=2, groups=in_channels, use_act=False, use_lab=use_lab, lr_mult=lr_mult) blocks_list = [] for i in range(block_num): blocks_list.append( HG_Block( in_channels=in_channels if i == 0 else out_channels, mid_channels=mid_channels, out_channels=out_channels, kernel_size=kernel_size, layer_num=layer_num, identity=False if i == 0 else True, light_block=light_block, use_lab=use_lab, lr_mult=lr_mult)) self.blocks = nn.Sequential(*blocks_list) def forward(self, x): if self.downsample: x = self.downsample(x) x = self.blocks(x) return x def _freeze_norm(m: nn.BatchNorm2D): param_attr = ParamAttr( learning_rate=0., regularizer=L2Decay(0.), trainable=False) bias_attr = ParamAttr( learning_rate=0., regularizer=L2Decay(0.), trainable=False) global_stats = True norm = nn.BatchNorm2D( m._num_features, weight_attr=param_attr, bias_attr=bias_attr, use_global_stats=global_stats) for param in norm.parameters(): param.stop_gradient = True return norm def reset_bn(model: nn.Layer, reset_func=_freeze_norm): if isinstance(model, nn.BatchNorm2D): model = reset_func(model) else: for name, child in model.named_children(): _child = reset_bn(child, reset_func) if _child is not child: setattr(model, name, _child) return model @register @serializable class PPHGNetV2(nn.Layer): """ PPHGNetV2 Args: stem_channels: list. Number of channels for the stem block. stage_type: str. The stage configuration of PPHGNet. such as the number of channels, stride, etc. use_lab: boolean. Whether to use LearnableAffineBlock in network. lr_mult_list: list. Control the learning rate of different stages. Returns: model: nn.Layer. Specific PPHGNetV2 model depends on args. 
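    Examples:
        A minimal usage sketch (illustrative; assumes the COCO-style
        input dict that ppdet backbones consume):

            import paddle
            from ppdet.modeling.backbones.hgnet_v2 import PPHGNetV2

            model = PPHGNetV2(arch='L', return_idx=[1, 2, 3])
            feats = model({'image': paddle.rand([1, 3, 640, 640])})
            # three feature maps at strides 8/16/32:
            # [1, 512, 80, 80], [1, 1024, 40, 40], [1, 2048, 20, 20]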
""" arch_configs = { 'S': { 'stem_channels': [3, 24, 32], 'stage_config': { # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num "stage1": [32, 32, 64, 1, False, False, 3, 3], "stage2": [64, 48, 256, 1, True, False, 3, 3], "stage3": [256, 96, 512, 2, True, True, 5, 3], "stage4": [512, 192, 1024, 1, True, True, 5, 3], } }, 'M': { 'stem_channels': [3, 24, 32], 'stage_config': { # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num "stage1": [32, 32, 96, 1, False, False, 3, 4], "stage2": [96, 64, 384, 1, True, False, 3, 4], "stage3": [384, 128, 768, 3, True, True, 5, 4], "stage4": [768, 256, 1536, 1, True, True, 5, 4], } }, 'L': { 'stem_channels': [3, 32, 48], 'stage_config': { # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num "stage1": [48, 48, 128, 1, False, False, 3, 6], "stage2": [128, 96, 512, 1, True, False, 3, 6], "stage3": [512, 192, 1024, 3, True, True, 5, 6], "stage4": [1024, 384, 2048, 1, True, True, 5, 6], } }, 'X': { 'stem_channels': [3, 32, 64], 'stage_config': { # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num "stage1": [64, 64, 128, 1, False, False, 3, 6], "stage2": [128, 128, 512, 2, True, False, 3, 6], "stage3": [512, 256, 1024, 5, True, True, 5, 6], "stage4": [1024, 512, 2048, 2, True, True, 5, 6], } }, 'H': { 'stem_channels': [3, 48, 96], 'stage_config': { # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num "stage1": [96, 96, 192, 2, False, False, 3, 6], "stage2": [192, 192, 512, 3, True, False, 3, 6], "stage3": [512, 384, 1024, 6, True, True, 5, 6], "stage4": [1024, 768, 2048, 3, True, True, 5, 6], } } } def __init__(self, arch, use_lab=False, lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], return_idx=[1, 2, 3], freeze_stem_only=True, freeze_at=0, freeze_norm=True): super().__init__() self.use_lab = use_lab self.return_idx = return_idx stem_channels = self.arch_configs[arch]['stem_channels'] stage_config = self.arch_configs[arch]['stage_config'] self._out_strides = [4, 8, 16, 32] self._out_channels = [stage_config[k][2] for k in stage_config] # stem self.stem = StemBlock( in_channels=stem_channels[0], mid_channels=stem_channels[1], out_channels=stem_channels[2], use_lab=use_lab, lr_mult=lr_mult_list[0]) # stages self.stages = nn.LayerList() for i, k in enumerate(stage_config): in_channels, mid_channels, out_channels, block_num, downsample, light_block, kernel_size, layer_num = stage_config[ k] self.stages.append( HG_Stage( in_channels, mid_channels, out_channels, block_num, layer_num, downsample, light_block, kernel_size, use_lab, lr_mult=lr_mult_list[i + 1])) if freeze_at >= 0: self._freeze_parameters(self.stem) if not freeze_stem_only: for i in range(min(freeze_at + 1, len(self.stages))): self._freeze_parameters(self.stages[i]) if freeze_norm: reset_bn(self, reset_func=_freeze_norm) self._init_weights() def _freeze_parameters(self, m): for p in m.parameters(): p.stop_gradient = True def _init_weights(self): for m in self.sublayers(): if isinstance(m, nn.Conv2D): kaiming_normal_(m.weight) elif isinstance(m, (nn.BatchNorm2D)): ones_(m.weight) zeros_(m.bias) elif isinstance(m, nn.Linear): zeros_(m.bias) @property def out_shape(self): return [ ShapeSpec( channels=self._out_channels[i], stride=self._out_strides[i]) for i in self.return_idx ] def forward(self, inputs): x = inputs['image'] x = self.stem(x) outs = [] for idx, stage 
in enumerate(self.stages): x = stage(x) if idx in self.return_idx: outs.append(x) return outs ================================================ FILE: ppdet/modeling/backbones/hrnet.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn import AdaptiveAvgPool2D, Linear from paddle.regularizer import L2Decay from paddle import ParamAttr from paddle.nn.initializer import Normal, Uniform from numbers import Integral import math from ppdet.core.workspace import register from ..shape_spec import ShapeSpec __all__ = ['HRNet'] class ConvNormLayer(nn.Layer): def __init__(self, ch_in, ch_out, filter_size, stride=1, norm_type='bn', norm_groups=32, use_dcn=False, norm_momentum=0.9, norm_decay=0., freeze_norm=False, act=None, name=None): super(ConvNormLayer, self).__init__() assert norm_type in ['bn', 'sync_bn', 'gn'] self.act = act self.conv = nn.Conv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=False) norm_lr = 0. if freeze_norm else 1. 
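# freeze_norm zeroes the learning rate on the norm layer's scale and bias;
# below it also pins BatchNorm to its running statistics
# (use_global_stats=True) and sets stop_gradient on the norm parameters, so
# the pretrained normalization statistics and affine terms stay fixed.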
param_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay)) bias_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay)) global_stats = True if freeze_norm else None if norm_type in ['bn', 'sync_bn']: self.norm = nn.BatchNorm2D( ch_out, momentum=norm_momentum, weight_attr=param_attr, bias_attr=bias_attr, use_global_stats=global_stats) elif norm_type == 'gn': self.norm = nn.GroupNorm( num_groups=norm_groups, num_channels=ch_out, weight_attr=param_attr, bias_attr=bias_attr) norm_params = self.norm.parameters() if freeze_norm: for param in norm_params: param.stop_gradient = True def forward(self, inputs): out = self.conv(inputs) out = self.norm(out) if self.act == 'relu': out = F.relu(out) return out class Layer1(nn.Layer): def __init__(self, num_channels, has_se=False, norm_momentum=0.9, norm_decay=0., freeze_norm=True, name=None): super(Layer1, self).__init__() self.bottleneck_block_list = [] for i in range(4): bottleneck_block = self.add_sublayer( "block_{}_{}".format(name, i + 1), BottleneckBlock( num_channels=num_channels if i == 0 else 256, num_filters=64, has_se=has_se, stride=1, downsample=True if i == 0 else False, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + '_' + str(i + 1))) self.bottleneck_block_list.append(bottleneck_block) def forward(self, input): conv = input for block_func in self.bottleneck_block_list: conv = block_func(conv) return conv class TransitionLayer(nn.Layer): def __init__(self, in_channels, out_channels, norm_momentum=0.9, norm_decay=0., freeze_norm=True, name=None): super(TransitionLayer, self).__init__() num_in = len(in_channels) num_out = len(out_channels) out = [] self.conv_bn_func_list = [] for i in range(num_out): residual = None if i < num_in: if in_channels[i] != out_channels[i]: residual = self.add_sublayer( "transition_{}_layer_{}".format(name, i + 1), ConvNormLayer( ch_in=in_channels[i], ch_out=out_channels[i], filter_size=3, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act='relu', name=name + '_layer_' + str(i + 1))) else: residual = self.add_sublayer( "transition_{}_layer_{}".format(name, i + 1), ConvNormLayer( ch_in=in_channels[-1], ch_out=out_channels[i], filter_size=3, stride=2, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act='relu', name=name + '_layer_' + str(i + 1))) self.conv_bn_func_list.append(residual) def forward(self, input): outs = [] for idx, conv_bn_func in enumerate(self.conv_bn_func_list): if conv_bn_func is None: outs.append(input[idx]) else: if idx < len(input): outs.append(conv_bn_func(input[idx])) else: outs.append(conv_bn_func(input[-1])) return outs class Branches(nn.Layer): def __init__(self, block_num, in_channels, out_channels, has_se=False, norm_momentum=0.9, norm_decay=0., freeze_norm=True, name=None): super(Branches, self).__init__() self.basic_block_list = [] for i in range(len(out_channels)): self.basic_block_list.append([]) for j in range(block_num): in_ch = in_channels[i] if j == 0 else out_channels[i] basic_block_func = self.add_sublayer( "bb_{}_branch_layer_{}_{}".format(name, i + 1, j + 1), BasicBlock( num_channels=in_ch, num_filters=out_channels[i], has_se=has_se, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + '_branch_layer_' + str(i + 1) + '_' + str(j + 1))) self.basic_block_list[i].append(basic_block_func) def forward(self, inputs): outs = [] for idx, input in enumerate(inputs): conv = input basic_block_list = 
self.basic_block_list[idx] for basic_block_func in basic_block_list: conv = basic_block_func(conv) outs.append(conv) return outs class BottleneckBlock(nn.Layer): def __init__(self, num_channels, num_filters, has_se, stride=1, downsample=False, norm_momentum=0.9, norm_decay=0., freeze_norm=True, name=None): super(BottleneckBlock, self).__init__() self.has_se = has_se self.downsample = downsample self.conv1 = ConvNormLayer( ch_in=num_channels, ch_out=num_filters, filter_size=1, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act="relu", name=name + "_conv1") self.conv2 = ConvNormLayer( ch_in=num_filters, ch_out=num_filters, filter_size=3, stride=stride, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act="relu", name=name + "_conv2") self.conv3 = ConvNormLayer( ch_in=num_filters, ch_out=num_filters * 4, filter_size=1, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act=None, name=name + "_conv3") if self.downsample: self.conv_down = ConvNormLayer( ch_in=num_channels, ch_out=num_filters * 4, filter_size=1, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act=None, name=name + "_downsample") if self.has_se: self.se = SELayer( num_channels=num_filters * 4, num_filters=num_filters * 4, reduction_ratio=16, name='fc' + name) def forward(self, input): residual = input conv1 = self.conv1(input) conv2 = self.conv2(conv1) conv3 = self.conv3(conv2) if self.downsample: residual = self.conv_down(input) if self.has_se: conv3 = self.se(conv3) y = paddle.add(x=residual, y=conv3) y = F.relu(y) return y class BasicBlock(nn.Layer): def __init__(self, num_channels, num_filters, stride=1, has_se=False, downsample=False, norm_momentum=0.9, norm_decay=0., freeze_norm=True, name=None): super(BasicBlock, self).__init__() self.has_se = has_se self.downsample = downsample self.conv1 = ConvNormLayer( ch_in=num_channels, ch_out=num_filters, filter_size=3, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, stride=stride, act="relu", name=name + "_conv1") self.conv2 = ConvNormLayer( ch_in=num_filters, ch_out=num_filters, filter_size=3, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, stride=1, act=None, name=name + "_conv2") if self.downsample: self.conv_down = ConvNormLayer( ch_in=num_channels, ch_out=num_filters * 4, filter_size=1, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act=None, name=name + "_downsample") if self.has_se: self.se = SELayer( num_channels=num_filters, num_filters=num_filters, reduction_ratio=16, name='fc' + name) def forward(self, input): residual = input conv1 = self.conv1(input) conv2 = self.conv2(conv1) if self.downsample: residual = self.conv_down(input) if self.has_se: conv2 = self.se(conv2) y = paddle.add(x=residual, y=conv2) y = F.relu(y) return y class SELayer(nn.Layer): def __init__(self, num_channels, num_filters, reduction_ratio, name=None): super(SELayer, self).__init__() self.pool2d_gap = AdaptiveAvgPool2D(1) self._num_channels = num_channels med_ch = int(num_channels / reduction_ratio) stdv = 1.0 / math.sqrt(num_channels * 1.0) self.squeeze = Linear( num_channels, med_ch, weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) stdv = 1.0 / math.sqrt(med_ch * 1.0) self.excitation = Linear( med_ch, num_filters, weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) def forward(self, input): pool = self.pool2d_gap(input) pool = paddle.squeeze(pool, axis=[2, 3]) squeeze = 
self.squeeze(pool) squeeze = F.relu(squeeze) excitation = self.excitation(squeeze) excitation = F.sigmoid(excitation) excitation = paddle.unsqueeze(excitation, axis=[2, 3]) out = input * excitation return out class Stage(nn.Layer): def __init__(self, num_channels, num_modules, num_filters, has_se=False, norm_momentum=0.9, norm_decay=0., freeze_norm=True, multi_scale_output=True, name=None): super(Stage, self).__init__() self._num_modules = num_modules self.stage_func_list = [] for i in range(num_modules): if i == num_modules - 1 and not multi_scale_output: stage_func = self.add_sublayer( "stage_{}_{}".format(name, i + 1), HighResolutionModule( num_channels=num_channels, num_filters=num_filters, has_se=has_se, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, multi_scale_output=False, name=name + '_' + str(i + 1))) else: stage_func = self.add_sublayer( "stage_{}_{}".format(name, i + 1), HighResolutionModule( num_channels=num_channels, num_filters=num_filters, has_se=has_se, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + '_' + str(i + 1))) self.stage_func_list.append(stage_func) def forward(self, input): out = input for idx in range(self._num_modules): out = self.stage_func_list[idx](out) return out class HighResolutionModule(nn.Layer): def __init__(self, num_channels, num_filters, has_se=False, multi_scale_output=True, norm_momentum=0.9, norm_decay=0., freeze_norm=True, name=None): super(HighResolutionModule, self).__init__() self.branches_func = Branches( block_num=4, in_channels=num_channels, out_channels=num_filters, has_se=has_se, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name) self.fuse_func = FuseLayers( in_channels=num_filters, out_channels=num_filters, multi_scale_output=multi_scale_output, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name) def forward(self, input): out = self.branches_func(input) out = self.fuse_func(out) return out class FuseLayers(nn.Layer): def __init__(self, in_channels, out_channels, multi_scale_output=True, norm_momentum=0.9, norm_decay=0., freeze_norm=True, name=None): super(FuseLayers, self).__init__() self._actual_ch = len(in_channels) if multi_scale_output else 1 self._in_channels = in_channels self.residual_func_list = [] for i in range(self._actual_ch): for j in range(len(in_channels)): residual_func = None if j > i: residual_func = self.add_sublayer( "residual_{}_layer_{}_{}".format(name, i + 1, j + 1), ConvNormLayer( ch_in=in_channels[j], ch_out=out_channels[i], filter_size=1, stride=1, act=None, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + '_layer_' + str(i + 1) + '_' + str(j + 1))) self.residual_func_list.append(residual_func) elif j < i: pre_num_filters = in_channels[j] for k in range(i - j): if k == i - j - 1: residual_func = self.add_sublayer( "residual_{}_layer_{}_{}_{}".format( name, i + 1, j + 1, k + 1), ConvNormLayer( ch_in=pre_num_filters, ch_out=out_channels[i], filter_size=3, stride=2, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act=None, name=name + '_layer_' + str(i + 1) + '_' + str(j + 1) + '_' + str(k + 1))) pre_num_filters = out_channels[i] else: residual_func = self.add_sublayer( "residual_{}_layer_{}_{}_{}".format( name, i + 1, j + 1, k + 1), ConvNormLayer( ch_in=pre_num_filters, ch_out=out_channels[j], filter_size=3, stride=2, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, 
act="relu", name=name + '_layer_' + str(i + 1) + '_' + str(j + 1) + '_' + str(k + 1))) pre_num_filters = out_channels[j] self.residual_func_list.append(residual_func) def forward(self, input): outs = [] residual_func_idx = 0 for i in range(self._actual_ch): residual = input[i] for j in range(len(self._in_channels)): if j > i: y = self.residual_func_list[residual_func_idx](input[j]) residual_func_idx += 1 y = F.interpolate(y, scale_factor=2**(j - i)) residual = paddle.add(x=residual, y=y) elif j < i: y = input[j] for k in range(i - j): y = self.residual_func_list[residual_func_idx](y) residual_func_idx += 1 residual = paddle.add(x=residual, y=y) residual = F.relu(residual) outs.append(residual) return outs @register class HRNet(nn.Layer): """ HRNet, see https://arxiv.org/abs/1908.07919 Args: width (int): the width of HRNet has_se (bool): whether to add SE block for each stage freeze_at (int): the stage to freeze freeze_norm (bool): whether to freeze norm in HRNet norm_momentum (float): momentum of BatchNorm norm_decay (float): weight decay for normalization layer weights return_idx (List): the stage to return upsample (bool): whether to upsample and concat the backbone feats """ def __init__(self, width=18, has_se=False, freeze_at=0, freeze_norm=True, norm_momentum=0.9, norm_decay=0., return_idx=[0, 1, 2, 3], upsample=False, downsample=False): super(HRNet, self).__init__() self.width = width self.has_se = has_se if isinstance(return_idx, Integral): return_idx = [return_idx] assert len(return_idx) > 0, "need one or more return index" self.freeze_at = freeze_at self.return_idx = return_idx self.upsample = upsample self.downsample = downsample self.channels = { 18: [[18, 36], [18, 36, 72], [18, 36, 72, 144]], 30: [[30, 60], [30, 60, 120], [30, 60, 120, 240]], 32: [[32, 64], [32, 64, 128], [32, 64, 128, 256]], 40: [[40, 80], [40, 80, 160], [40, 80, 160, 320]], 44: [[44, 88], [44, 88, 176], [44, 88, 176, 352]], 48: [[48, 96], [48, 96, 192], [48, 96, 192, 384]], 60: [[60, 120], [60, 120, 240], [60, 120, 240, 480]], 64: [[64, 128], [64, 128, 256], [64, 128, 256, 512]] } channels_2, channels_3, channels_4 = self.channels[width] num_modules_2, num_modules_3, num_modules_4 = 1, 4, 3 self._out_channels = [sum(channels_4)] if self.upsample else channels_4 self._out_strides = [4] if self.upsample else [4, 8, 16, 32] self.conv_layer1_1 = ConvNormLayer( ch_in=3, ch_out=64, filter_size=3, stride=2, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act='relu', name="layer1_1") self.conv_layer1_2 = ConvNormLayer( ch_in=64, ch_out=64, filter_size=3, stride=2, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, act='relu', name="layer1_2") self.la1 = Layer1( num_channels=64, has_se=has_se, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name="layer2") self.tr1 = TransitionLayer( in_channels=[256], out_channels=channels_2, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name="tr1") self.st2 = Stage( num_channels=channels_2, num_modules=num_modules_2, num_filters=channels_2, has_se=self.has_se, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name="st2") self.tr2 = TransitionLayer( in_channels=channels_2, out_channels=channels_3, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name="tr2") self.st3 = Stage( num_channels=channels_3, num_modules=num_modules_3, num_filters=channels_3, has_se=self.has_se, norm_momentum=norm_momentum, 
norm_decay=norm_decay, freeze_norm=freeze_norm, name="st3") self.tr3 = TransitionLayer( in_channels=channels_3, out_channels=channels_4, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, name="tr3") self.st4 = Stage( num_channels=channels_4, num_modules=num_modules_4, num_filters=channels_4, has_se=self.has_se, norm_momentum=norm_momentum, norm_decay=norm_decay, freeze_norm=freeze_norm, multi_scale_output=len(return_idx) > 1, name="st4") if self.downsample: self.incre_modules, self.downsamp_modules, \ self.final_layer = self._make_head(channels_4, norm_momentum=norm_momentum, has_se=self.has_se) def _make_layer(self, block, inplanes, planes, blocks, stride=1, norm_momentum=0.9, has_se=False, name=None): downsample = None if stride != 1 or inplanes != planes * 4: downsample = True layers = [] layers.append( block( inplanes, planes, has_se, stride, downsample, norm_momentum=norm_momentum, freeze_norm=False, name=name + "_s0")) inplanes = planes * 4 for i in range(1, blocks): layers.append( block( inplanes, planes, has_se, norm_momentum=norm_momentum, freeze_norm=False, name=name + "_s" + str(i))) return nn.Sequential(*layers) def _make_head(self, pre_stage_channels, norm_momentum=0.9, has_se=False): head_block = BottleneckBlock head_channels = [32, 64, 128, 256] # Increasing the #channels on each resolution # from C, 2C, 4C, 8C to 128, 256, 512, 1024 incre_modules = [] for i, channels in enumerate(pre_stage_channels): incre_module = self._make_layer( head_block, channels, head_channels[i], 1, stride=1, norm_momentum=norm_momentum, has_se=has_se, name='incre' + str(i)) incre_modules.append(incre_module) incre_modules = nn.LayerList(incre_modules) # downsampling modules downsamp_modules = [] for i in range(len(pre_stage_channels) - 1): in_channels = head_channels[i] * 4 out_channels = head_channels[i + 1] * 4 downsamp_module = nn.Sequential( nn.Conv2D( in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=2, padding=1), nn.BatchNorm2D( out_channels, momentum=norm_momentum), nn.ReLU()) downsamp_modules.append(downsamp_module) downsamp_modules = nn.LayerList(downsamp_modules) final_layer = nn.Sequential( nn.Conv2D( in_channels=head_channels[3] * 4, out_channels=2048, kernel_size=1, stride=1, padding=0), nn.BatchNorm2D( 2048, momentum=norm_momentum), nn.ReLU()) return incre_modules, downsamp_modules, final_layer def forward(self, inputs): x = inputs['image'] conv1 = self.conv_layer1_1(x) conv2 = self.conv_layer1_2(conv1) la1 = self.la1(conv2) tr1 = self.tr1([la1]) st2 = self.st2(tr1) tr2 = self.tr2(st2) st3 = self.st3(tr2) tr3 = self.tr3(st3) st4 = self.st4(tr3) if self.upsample: # Upsampling x0_h, x0_w = st4[0].shape[2:4] x1 = F.upsample(st4[1], size=(x0_h, x0_w), mode='bilinear') x2 = F.upsample(st4[2], size=(x0_h, x0_w), mode='bilinear') x3 = F.upsample(st4[3], size=(x0_h, x0_w), mode='bilinear') x = paddle.concat([st4[0], x1, x2, x3], 1) return x if self.downsample: y = self.incre_modules[0](st4[0]) for i in range(len(self.downsamp_modules)): y = self.incre_modules[i+1](st4[i+1]) + \ self.downsamp_modules[i](y) y = self.final_layer(y) return y res = [] for i, layer in enumerate(st4): if i == self.freeze_at: layer.stop_gradient = True if i in self.return_idx: res.append(layer) return res @property def out_shape(self): if self.upsample: self.return_idx = [0] return [ ShapeSpec( channels=self._out_channels[i], stride=self._out_strides[i]) for i in self.return_idx ] ================================================ FILE: 
ppdet/modeling/backbones/lcnet.py ================================================ # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn from paddle import ParamAttr from paddle.nn import Conv2D from paddle.regularizer import L2Decay from paddle.nn.initializer import KaimingNormal from ppdet.core.workspace import register, serializable from numbers import Integral from ..shape_spec import ShapeSpec __all__ = ['LCNet'] NET_CONFIG = { "blocks2": #k, in_c, out_c, s, use_se [[3, 16, 32, 1, False], ], "blocks3": [ [3, 32, 64, 2, False], [3, 64, 64, 1, False], ], "blocks4": [ [3, 64, 128, 2, False], [3, 128, 128, 1, False], ], "blocks5": [ [3, 128, 256, 2, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], ], "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]] } def make_divisible(v, divisor=8, min_value=None): if min_value is None: min_value = divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) if new_v < 0.9 * v: new_v += divisor return new_v class ConvBNLayer(nn.Layer): def __init__(self, num_channels, filter_size, num_filters, stride, num_groups=1, act='hard_swish'): super().__init__() self.conv = Conv2D( in_channels=num_channels, out_channels=num_filters, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=num_groups, weight_attr=ParamAttr(initializer=KaimingNormal()), bias_attr=False) self.bn = nn.BatchNorm2D( num_filters, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) if act == 'hard_swish': self.act = nn.Hardswish() elif act == 'relu6': self.act = nn.ReLU6() def forward(self, x): x = self.conv(x) x = self.bn(x) x = self.act(x) return x class DepthwiseSeparable(nn.Layer): def __init__(self, num_channels, num_filters, stride, dw_size=3, use_se=False, act='hard_swish'): super().__init__() self.use_se = use_se self.dw_conv = ConvBNLayer( num_channels=num_channels, num_filters=num_channels, filter_size=dw_size, stride=stride, num_groups=num_channels, act=act) if use_se: self.se = SEModule(num_channels) self.pw_conv = ConvBNLayer( num_channels=num_channels, filter_size=1, num_filters=num_filters, stride=1, act=act) def forward(self, x): x = self.dw_conv(x) if self.use_se: x = self.se(x) x = self.pw_conv(x) return x class AdaptiveAvgPool2D(nn.AdaptiveAvgPool2D): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) if paddle.device.get_device().startswith("npu"): self.device = "npu" else: self.device = None if isinstance(self._output_size, int) and self._output_size == 1: self._gap = True elif isinstance(self._output_size, tuple) and self._output_size[ 0] == 1 and self._output_size[1] == 1: self._gap = True else: self._gap = False def forward(self, x): if self.device == "npu" and self._gap: # Global Average 
Pooling N, C, _, _ = x.shape x_mean = paddle.mean(x, axis=[2, 3]) x_mean = paddle.reshape(x_mean, [N, C, 1, 1]) return x_mean else: return super(AdaptiveAvgPool2D, self).forward(x) class SEModule(nn.Layer): def __init__(self, channel, reduction=4): super().__init__() self.avg_pool = AdaptiveAvgPool2D(1) self.conv1 = Conv2D( in_channels=channel, out_channels=channel // reduction, kernel_size=1, stride=1, padding=0) self.relu = nn.ReLU() self.conv2 = Conv2D( in_channels=channel // reduction, out_channels=channel, kernel_size=1, stride=1, padding=0) self.hardsigmoid = nn.Hardsigmoid() def forward(self, x): identity = x x = self.avg_pool(x) x = self.conv1(x) x = self.relu(x) x = self.conv2(x) x = self.hardsigmoid(x) x = paddle.multiply(x=identity, y=x) return x @register @serializable class LCNet(nn.Layer): def __init__(self, scale=1.0, feature_maps=[3, 4, 5], act='hard_swish'): super().__init__() self.scale = scale self.feature_maps = feature_maps out_channels = [] self.conv1 = ConvBNLayer( num_channels=3, filter_size=3, num_filters=make_divisible(16 * scale), stride=2, act=act) self.blocks2 = nn.Sequential(* [ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se, act=act) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"]) ]) self.blocks3 = nn.Sequential(* [ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se, act=act) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks3"]) ]) out_channels.append( make_divisible(NET_CONFIG["blocks3"][-1][2] * scale)) self.blocks4 = nn.Sequential(* [ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se, act=act) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks4"]) ]) out_channels.append( make_divisible(NET_CONFIG["blocks4"][-1][2] * scale)) self.blocks5 = nn.Sequential(* [ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se, act=act) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks5"]) ]) out_channels.append( make_divisible(NET_CONFIG["blocks5"][-1][2] * scale)) self.blocks6 = nn.Sequential(* [ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se, act=act) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks6"]) ]) out_channels.append( make_divisible(NET_CONFIG["blocks6"][-1][2] * scale)) self._out_channels = [ ch for idx, ch in enumerate(out_channels) if idx + 2 in feature_maps ] def forward(self, inputs): x = inputs['image'] outs = [] x = self.conv1(x) x = self.blocks2(x) x = self.blocks3(x) outs.append(x) x = self.blocks4(x) outs.append(x) x = self.blocks5(x) outs.append(x) x = self.blocks6(x) outs.append(x) outs = [o for i, o in enumerate(outs) if i + 2 in self.feature_maps] return outs @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/backbones/lite_hrnet.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/HRNet/Lite-HRNet/blob/hrnet/models/backbones/litehrnet.py """ import paddle import paddle.nn as nn import paddle.nn.functional as F from numbers import Integral from paddle import ParamAttr from paddle.regularizer import L2Decay from paddle.nn.initializer import Normal, Constant from ppdet.core.workspace import register from ppdet.modeling.shape_spec import ShapeSpec from ppdet.modeling.ops import channel_shuffle from .. import layers as L __all__ = ['LiteHRNet'] class ConvNormLayer(nn.Layer): def __init__(self, ch_in, ch_out, filter_size, stride=1, groups=1, norm_type=None, norm_groups=32, norm_decay=0., freeze_norm=False, act=None): super(ConvNormLayer, self).__init__() self.act = act norm_lr = 0. if freeze_norm else 1. if norm_type is not None: assert norm_type in ['bn', 'sync_bn', 'gn'], \ "norm_type should be one of ['bn', 'sync_bn', 'gn'], but got {}".format(norm_type) param_attr = ParamAttr( initializer=Constant(1.0), learning_rate=norm_lr, regularizer=L2Decay(norm_decay), ) bias_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay)) global_stats = True if freeze_norm else None if norm_type in ['bn', 'sync_bn']: self.norm = nn.BatchNorm2D( ch_out, weight_attr=param_attr, bias_attr=bias_attr, use_global_stats=global_stats, ) elif norm_type == 'gn': self.norm = nn.GroupNorm( num_groups=norm_groups, num_channels=ch_out, weight_attr=param_attr, bias_attr=bias_attr) norm_params = self.norm.parameters() if freeze_norm: for param in norm_params: param.stop_gradient = True conv_bias_attr = False else: conv_bias_attr = True self.norm = None self.conv = nn.Conv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=groups, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.001)), bias_attr=conv_bias_attr) def forward(self, inputs): out = self.conv(inputs) if self.norm is not None: out = self.norm(out) if self.act == 'relu': out = F.relu(out) elif self.act == 'sigmoid': out = F.sigmoid(out) return out class DepthWiseSeparableConvNormLayer(nn.Layer): def __init__(self, ch_in, ch_out, filter_size, stride=1, dw_norm_type=None, pw_norm_type=None, norm_decay=0., freeze_norm=False, dw_act=None, pw_act=None): super(DepthWiseSeparableConvNormLayer, self).__init__() self.depthwise_conv = ConvNormLayer( ch_in=ch_in, ch_out=ch_in, filter_size=filter_size, stride=stride, groups=ch_in, norm_type=dw_norm_type, act=dw_act, norm_decay=norm_decay, freeze_norm=freeze_norm, ) self.pointwise_conv = ConvNormLayer( ch_in=ch_in, ch_out=ch_out, filter_size=1, stride=1, norm_type=pw_norm_type, act=pw_act, norm_decay=norm_decay, freeze_norm=freeze_norm, ) def forward(self, x): x = self.depthwise_conv(x) x = self.pointwise_conv(x) return x class CrossResolutionWeightingModule(nn.Layer): def __init__(self, channels, ratio=16, norm_type='bn', freeze_norm=False, norm_decay=0.): super(CrossResolutionWeightingModule, self).__init__() self.channels = channels total_channel = sum(channels) self.conv1 = ConvNormLayer( ch_in=total_channel, ch_out=total_channel // ratio, filter_size=1, stride=1, 
norm_type=norm_type, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay) self.conv2 = ConvNormLayer( ch_in=total_channel // ratio, ch_out=total_channel, filter_size=1, stride=1, norm_type=norm_type, act='sigmoid', freeze_norm=freeze_norm, norm_decay=norm_decay) def forward(self, x): mini_size = x[-1].shape[-2:] out = [F.adaptive_avg_pool2d(s, mini_size) for s in x[:-1]] + [x[-1]] out = paddle.concat(out, 1) out = self.conv1(out) out = self.conv2(out) out = paddle.split(out, self.channels, 1) out = [ s * F.interpolate( a, s.shape[-2:], mode='nearest') for s, a in zip(x, out) ] return out class SpatialWeightingModule(nn.Layer): def __init__(self, in_channel, ratio=16, freeze_norm=False, norm_decay=0.): super(SpatialWeightingModule, self).__init__() self.global_avgpooling = nn.AdaptiveAvgPool2D(1) self.conv1 = ConvNormLayer( ch_in=in_channel, ch_out=in_channel // ratio, filter_size=1, stride=1, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay) self.conv2 = ConvNormLayer( ch_in=in_channel // ratio, ch_out=in_channel, filter_size=1, stride=1, act='sigmoid', freeze_norm=freeze_norm, norm_decay=norm_decay) def forward(self, x): out = self.global_avgpooling(x) out = self.conv1(out) out = self.conv2(out) return x * out class ConditionalChannelWeightingBlock(nn.Layer): def __init__(self, in_channels, stride, reduce_ratio, norm_type='bn', freeze_norm=False, norm_decay=0.): super(ConditionalChannelWeightingBlock, self).__init__() assert stride in [1, 2] branch_channels = [channel // 2 for channel in in_channels] self.cross_resolution_weighting = CrossResolutionWeightingModule( branch_channels, ratio=reduce_ratio, norm_type=norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay) self.depthwise_convs = nn.LayerList([ ConvNormLayer( channel, channel, filter_size=3, stride=stride, groups=channel, norm_type=norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay) for channel in branch_channels ]) self.spatial_weighting = nn.LayerList([ SpatialWeightingModule( channel, ratio=4, freeze_norm=freeze_norm, norm_decay=norm_decay) for channel in branch_channels ]) def forward(self, x): x = [s.chunk(2, axis=1) for s in x] x1 = [s[0] for s in x] x2 = [s[1] for s in x] x2 = self.cross_resolution_weighting(x2) x2 = [dw(s) for s, dw in zip(x2, self.depthwise_convs)] x2 = [sw(s) for s, sw in zip(x2, self.spatial_weighting)] out = [paddle.concat([s1, s2], axis=1) for s1, s2 in zip(x1, x2)] out = [channel_shuffle(s, groups=2) for s in out] return out class ShuffleUnit(nn.Layer): def __init__(self, in_channel, out_channel, stride, norm_type='bn', freeze_norm=False, norm_decay=0.): super(ShuffleUnit, self).__init__() branch_channel = out_channel // 2 self.stride = stride if self.stride == 1: assert in_channel == branch_channel * 2, \ "when stride=1, in_channel {} should equal to branch_channel*2 {}".format(in_channel, branch_channel * 2) if stride > 1: self.branch1 = nn.Sequential( ConvNormLayer( ch_in=in_channel, ch_out=in_channel, filter_size=3, stride=self.stride, groups=in_channel, norm_type=norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay), ConvNormLayer( ch_in=in_channel, ch_out=branch_channel, filter_size=1, stride=1, norm_type=norm_type, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay), ) self.branch2 = nn.Sequential( ConvNormLayer( ch_in=branch_channel if stride == 1 else in_channel, ch_out=branch_channel, filter_size=1, stride=1, norm_type=norm_type, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay), ConvNormLayer( ch_in=branch_channel, ch_out=branch_channel, 
filter_size=3, stride=self.stride, groups=branch_channel, norm_type=norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay), ConvNormLayer( ch_in=branch_channel, ch_out=branch_channel, filter_size=1, stride=1, norm_type=norm_type, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay), ) def forward(self, x): if self.stride > 1: x1 = self.branch1(x) x2 = self.branch2(x) else: x1, x2 = x.chunk(2, axis=1) x2 = self.branch2(x2) out = paddle.concat([x1, x2], axis=1) out = channel_shuffle(out, groups=2) return out class IterativeHead(nn.Layer): def __init__(self, in_channels, norm_type='bn', freeze_norm=False, norm_decay=0.): super(IterativeHead, self).__init__() num_branches = len(in_channels) self.in_channels = in_channels[::-1] projects = [] for i in range(num_branches): if i != num_branches - 1: projects.append( DepthWiseSeparableConvNormLayer( ch_in=self.in_channels[i], ch_out=self.in_channels[i + 1], filter_size=3, stride=1, dw_act=None, pw_act='relu', dw_norm_type=norm_type, pw_norm_type=norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay)) else: projects.append( DepthWiseSeparableConvNormLayer( ch_in=self.in_channels[i], ch_out=self.in_channels[i], filter_size=3, stride=1, dw_act=None, pw_act='relu', dw_norm_type=norm_type, pw_norm_type=norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay)) self.projects = nn.LayerList(projects) def forward(self, x): x = x[::-1] y = [] last_x = None for i, s in enumerate(x): if last_x is not None: last_x = F.interpolate( last_x, size=s.shape[-2:], mode='bilinear', align_corners=True) s = s + last_x s = self.projects[i](s) y.append(s) last_x = s return y[::-1] class Stem(nn.Layer): def __init__(self, in_channel, stem_channel, out_channel, expand_ratio, norm_type='bn', freeze_norm=False, norm_decay=0.): super(Stem, self).__init__() self.conv1 = ConvNormLayer( in_channel, stem_channel, filter_size=3, stride=2, norm_type=norm_type, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay) mid_channel = int(round(stem_channel * expand_ratio)) branch_channel = stem_channel // 2 if stem_channel == out_channel: inc_channel = out_channel - branch_channel else: inc_channel = out_channel - stem_channel self.branch1 = nn.Sequential( ConvNormLayer( ch_in=branch_channel, ch_out=branch_channel, filter_size=3, stride=2, groups=branch_channel, norm_type=norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay), ConvNormLayer( ch_in=branch_channel, ch_out=inc_channel, filter_size=1, stride=1, norm_type=norm_type, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay), ) self.expand_conv = ConvNormLayer( ch_in=branch_channel, ch_out=mid_channel, filter_size=1, stride=1, norm_type=norm_type, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay) self.depthwise_conv = ConvNormLayer( ch_in=mid_channel, ch_out=mid_channel, filter_size=3, stride=2, groups=mid_channel, norm_type=norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay) self.linear_conv = ConvNormLayer( ch_in=mid_channel, ch_out=branch_channel if stem_channel == out_channel else stem_channel, filter_size=1, stride=1, norm_type=norm_type, act='relu', freeze_norm=freeze_norm, norm_decay=norm_decay) def forward(self, x): x = self.conv1(x) x1, x2 = x.chunk(2, axis=1) x1 = self.branch1(x1) x2 = self.expand_conv(x2) x2 = self.depthwise_conv(x2) x2 = self.linear_conv(x2) out = paddle.concat([x1, x2], axis=1) out = channel_shuffle(out, groups=2) return out class LiteHRNetModule(nn.Layer): def __init__(self, num_branches, num_blocks, in_channels, reduce_ratio, module_type, 
multiscale_output=False, with_fuse=True, norm_type='bn', freeze_norm=False, norm_decay=0.): super(LiteHRNetModule, self).__init__() assert num_branches == len(in_channels),\ "num_branches {} should equal to num_in_channels {}".format(num_branches, len(in_channels)) assert module_type in [ 'LITE', 'NAIVE' ], "module_type should be one of ['LITE', 'NAIVE']" self.num_branches = num_branches self.in_channels = in_channels self.multiscale_output = multiscale_output self.with_fuse = with_fuse self.norm_type = 'bn' self.module_type = module_type if self.module_type == 'LITE': self.layers = self._make_weighting_blocks( num_blocks, reduce_ratio, freeze_norm=freeze_norm, norm_decay=norm_decay) elif self.module_type == 'NAIVE': self.layers = self._make_naive_branches( num_branches, num_blocks, freeze_norm=freeze_norm, norm_decay=norm_decay) if self.with_fuse: self.fuse_layers = self._make_fuse_layers( freeze_norm=freeze_norm, norm_decay=norm_decay) self.relu = nn.ReLU() def _make_weighting_blocks(self, num_blocks, reduce_ratio, stride=1, freeze_norm=False, norm_decay=0.): layers = [] for i in range(num_blocks): layers.append( ConditionalChannelWeightingBlock( self.in_channels, stride=stride, reduce_ratio=reduce_ratio, norm_type=self.norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay)) return nn.Sequential(*layers) def _make_naive_branches(self, num_branches, num_blocks, freeze_norm=False, norm_decay=0.): branches = [] for branch_idx in range(num_branches): layers = [] for i in range(num_blocks): layers.append( ShuffleUnit( self.in_channels[branch_idx], self.in_channels[branch_idx], stride=1, norm_type=self.norm_type, freeze_norm=freeze_norm, norm_decay=norm_decay)) branches.append(nn.Sequential(*layers)) return nn.LayerList(branches) def _make_fuse_layers(self, freeze_norm=False, norm_decay=0.): if self.num_branches == 1: return None fuse_layers = [] num_out_branches = self.num_branches if self.multiscale_output else 1 for i in range(num_out_branches): fuse_layer = [] for j in range(self.num_branches): if j > i: fuse_layer.append( nn.Sequential( L.Conv2d( self.in_channels[j], self.in_channels[i], kernel_size=1, stride=1, padding=0, bias=False, ), nn.BatchNorm2D(self.in_channels[i]), nn.Upsample( scale_factor=2**(j - i), mode='nearest'))) elif j == i: fuse_layer.append(None) else: conv_downsamples = [] for k in range(i - j): if k == i - j - 1: conv_downsamples.append( nn.Sequential( L.Conv2d( self.in_channels[j], self.in_channels[j], kernel_size=3, stride=2, padding=1, groups=self.in_channels[j], bias=False, ), nn.BatchNorm2D(self.in_channels[j]), L.Conv2d( self.in_channels[j], self.in_channels[i], kernel_size=1, stride=1, padding=0, bias=False, ), nn.BatchNorm2D(self.in_channels[i]))) else: conv_downsamples.append( nn.Sequential( L.Conv2d( self.in_channels[j], self.in_channels[j], kernel_size=3, stride=2, padding=1, groups=self.in_channels[j], bias=False, ), nn.BatchNorm2D(self.in_channels[j]), L.Conv2d( self.in_channels[j], self.in_channels[j], kernel_size=1, stride=1, padding=0, bias=False, ), nn.BatchNorm2D(self.in_channels[j]), nn.ReLU())) fuse_layer.append(nn.Sequential(*conv_downsamples)) fuse_layers.append(nn.LayerList(fuse_layer)) return nn.LayerList(fuse_layers) def forward(self, x): if self.num_branches == 1: return [self.layers[0](x[0])] if self.module_type == 'LITE': out = self.layers(x) elif self.module_type == 'NAIVE': for i in range(self.num_branches): x[i] = self.layers[i](x[i]) out = x if self.with_fuse: out_fuse = [] for i in range(len(self.fuse_layers)): y = out[0] if i == 0 
else self.fuse_layers[i][0](out[0]) for j in range(self.num_branches): if j == 0: y += y elif i == j: y += out[j] else: y += self.fuse_layers[i][j](out[j]) if i == 0: out[i] = y out_fuse.append(self.relu(y)) out = out_fuse elif not self.multiscale_output: out = [out[0]] return out @register class LiteHRNet(nn.Layer): """ @inproceedings{Yulitehrnet21, title={Lite-HRNet: A Lightweight High-Resolution Network}, author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong}, booktitle={CVPR},year={2021} } Args: network_type (str): the network_type should be one of ["lite_18", "lite_30", "naive", "wider_naive"], "naive": Simply combining the shuffle block in ShuffleNet and the highresolution design pattern in HRNet. "wider_naive": Naive network with wider channels in each block. "lite_18": Lite-HRNet-18, which replaces the pointwise convolution in a shuffle block by conditional channel weighting. "lite_30": Lite-HRNet-30, with more blocks compared with Lite-HRNet-18. freeze_at (int): the stage to freeze freeze_norm (bool): whether to freeze norm in HRNet norm_decay (float): weight decay for normalization layer weights return_idx (List): the stage to return """ def __init__(self, network_type, freeze_at=0, freeze_norm=True, norm_decay=0., return_idx=[0, 1, 2, 3]): super(LiteHRNet, self).__init__() if isinstance(return_idx, Integral): return_idx = [return_idx] assert network_type in ["lite_18", "lite_30", "naive", "wider_naive"], \ "the network_type should be one of [lite_18, lite_30, naive, wider_naive]" assert len(return_idx) > 0, "need one or more return index" self.freeze_at = freeze_at self.freeze_norm = freeze_norm self.norm_decay = norm_decay self.return_idx = return_idx self.norm_type = 'bn' self.module_configs = { "lite_18": { "num_modules": [2, 4, 2], "num_branches": [2, 3, 4], "num_blocks": [2, 2, 2], "module_type": ["LITE", "LITE", "LITE"], "reduce_ratios": [8, 8, 8], "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], }, "lite_30": { "num_modules": [3, 8, 3], "num_branches": [2, 3, 4], "num_blocks": [2, 2, 2], "module_type": ["LITE", "LITE", "LITE"], "reduce_ratios": [8, 8, 8], "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], }, "naive": { "num_modules": [2, 4, 2], "num_branches": [2, 3, 4], "num_blocks": [2, 2, 2], "module_type": ["NAIVE", "NAIVE", "NAIVE"], "reduce_ratios": [1, 1, 1], "num_channels": [[30, 60], [30, 60, 120], [30, 60, 120, 240]], }, "wider_naive": { "num_modules": [2, 4, 2], "num_branches": [2, 3, 4], "num_blocks": [2, 2, 2], "module_type": ["NAIVE", "NAIVE", "NAIVE"], "reduce_ratios": [1, 1, 1], "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], }, } self.stages_config = self.module_configs[network_type] self.stem = Stem(3, 32, 32, 1) num_channels_pre_layer = [32] for stage_idx in range(3): num_channels = self.stages_config["num_channels"][stage_idx] setattr(self, 'transition{}'.format(stage_idx), self._make_transition_layer(num_channels_pre_layer, num_channels, self.freeze_norm, self.norm_decay)) stage, num_channels_pre_layer = self._make_stage( self.stages_config, stage_idx, num_channels, True, self.freeze_norm, self.norm_decay) setattr(self, 'stage{}'.format(stage_idx), stage) self.head_layer = IterativeHead(num_channels_pre_layer, 'bn', self.freeze_norm, self.norm_decay) def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer, freeze_norm=False, norm_decay=0.): num_branches_pre = len(num_channels_pre_layer) num_branches_cur = 
len(num_channels_cur_layer) transition_layers = [] for i in range(num_branches_cur): if i < num_branches_pre: if num_channels_cur_layer[i] != num_channels_pre_layer[i]: transition_layers.append( nn.Sequential( L.Conv2d( num_channels_pre_layer[i], num_channels_pre_layer[i], kernel_size=3, stride=1, padding=1, groups=num_channels_pre_layer[i], bias=False), nn.BatchNorm2D(num_channels_pre_layer[i]), L.Conv2d( num_channels_pre_layer[i], num_channels_cur_layer[i], kernel_size=1, stride=1, padding=0, bias=False, ), nn.BatchNorm2D(num_channels_cur_layer[i]), nn.ReLU())) else: transition_layers.append(None) else: conv_downsamples = [] for j in range(i + 1 - num_branches_pre): conv_downsamples.append( nn.Sequential( L.Conv2d( num_channels_pre_layer[-1], num_channels_pre_layer[-1], groups=num_channels_pre_layer[-1], kernel_size=3, stride=2, padding=1, bias=False, ), nn.BatchNorm2D(num_channels_pre_layer[-1]), L.Conv2d( num_channels_pre_layer[-1], num_channels_cur_layer[i] if j == i - num_branches_pre else num_channels_pre_layer[-1], kernel_size=1, stride=1, padding=0, bias=False, ), nn.BatchNorm2D(num_channels_cur_layer[i] if j == i - num_branches_pre else num_channels_pre_layer[-1]), nn.ReLU())) transition_layers.append(nn.Sequential(*conv_downsamples)) return nn.LayerList(transition_layers) def _make_stage(self, stages_config, stage_idx, in_channels, multiscale_output, freeze_norm=False, norm_decay=0.): num_modules = stages_config["num_modules"][stage_idx] num_branches = stages_config["num_branches"][stage_idx] num_blocks = stages_config["num_blocks"][stage_idx] reduce_ratio = stages_config['reduce_ratios'][stage_idx] module_type = stages_config['module_type'][stage_idx] modules = [] for i in range(num_modules): if not multiscale_output and i == num_modules - 1: reset_multiscale_output = False else: reset_multiscale_output = True modules.append( LiteHRNetModule( num_branches, num_blocks, in_channels, reduce_ratio, module_type, multiscale_output=reset_multiscale_output, with_fuse=True, freeze_norm=freeze_norm, norm_decay=norm_decay)) in_channels = modules[-1].in_channels return nn.Sequential(*modules), in_channels def forward(self, inputs): x = inputs['image'] dims = x.shape if len(dims) == 5: x = paddle.reshape(x, (dims[0] * dims[1], dims[2], dims[3], dims[4])) # [6, 3, 128, 96] x = self.stem(x) y_list = [x] for stage_idx in range(3): x_list = [] transition = getattr(self, 'transition{}'.format(stage_idx)) for j in range(self.stages_config["num_branches"][stage_idx]): if transition[j] is not None: if j >= len(y_list): x_list.append(transition[j](y_list[-1])) else: x_list.append(transition[j](y_list[j])) else: x_list.append(y_list[j]) y_list = getattr(self, 'stage{}'.format(stage_idx))(x_list) x = self.head_layer(y_list) res = [] for i, layer in enumerate(x): if i == self.freeze_at: layer.stop_gradient = True if i in self.return_idx: res.append(layer) return res @property def out_shape(self): return [ ShapeSpec( channels=self._out_channels[i], stride=self._out_strides[i]) for i in self.return_idx ] ================================================ FILE: ppdet/modeling/backbones/mobilenet_v1.py ================================================ # copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from paddle.nn.initializer import KaimingNormal from ppdet.core.workspace import register, serializable from numbers import Integral from ..shape_spec import ShapeSpec __all__ = ['MobileNet'] class ConvBNLayer(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size, stride, padding, num_groups=1, act='relu', conv_lr=1., conv_decay=0., norm_decay=0., norm_type='bn', name=None): super(ConvBNLayer, self).__init__() self.act = act self._conv = nn.Conv2D( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=num_groups, weight_attr=ParamAttr( learning_rate=conv_lr, initializer=KaimingNormal(), regularizer=L2Decay(conv_decay)), bias_attr=False) param_attr = ParamAttr(regularizer=L2Decay(norm_decay)) bias_attr = ParamAttr(regularizer=L2Decay(norm_decay)) if norm_type in ['sync_bn', 'bn']: self._batch_norm = nn.BatchNorm2D( out_channels, weight_attr=param_attr, bias_attr=bias_attr) def forward(self, x): x = self._conv(x) x = self._batch_norm(x) if self.act == "relu": x = F.relu(x) elif self.act == "relu6": x = F.relu6(x) return x class DepthwiseSeparable(nn.Layer): def __init__(self, in_channels, out_channels1, out_channels2, num_groups, stride, scale, conv_lr=1., conv_decay=0., norm_decay=0., norm_type='bn', name=None): super(DepthwiseSeparable, self).__init__() self._depthwise_conv = ConvBNLayer( in_channels, int(out_channels1 * scale), kernel_size=3, stride=stride, padding=1, num_groups=int(num_groups * scale), conv_lr=conv_lr, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name=name + "_dw") self._pointwise_conv = ConvBNLayer( int(out_channels1 * scale), int(out_channels2 * scale), kernel_size=1, stride=1, padding=0, conv_lr=conv_lr, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name=name + "_sep") def forward(self, x): x = self._depthwise_conv(x) x = self._pointwise_conv(x) return x class ExtraBlock(nn.Layer): def __init__(self, in_channels, out_channels1, out_channels2, num_groups=1, stride=2, conv_lr=1., conv_decay=0., norm_decay=0., norm_type='bn', name=None): super(ExtraBlock, self).__init__() self.pointwise_conv = ConvBNLayer( in_channels, int(out_channels1), kernel_size=1, stride=1, padding=0, num_groups=int(num_groups), act='relu6', conv_lr=conv_lr, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name=name + "_extra1") self.normal_conv = ConvBNLayer( int(out_channels1), int(out_channels2), kernel_size=3, stride=stride, padding=1, num_groups=int(num_groups), act='relu6', conv_lr=conv_lr, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name=name + "_extra2") def forward(self, x): x = self.pointwise_conv(x) x = self.normal_conv(x) return x @register @serializable class MobileNet(nn.Layer): __shared__ = ['norm_type'] def __init__(self, norm_type='bn', norm_decay=0., conv_decay=0., scale=1, conv_learning_rate=1.0, feature_maps=[4, 6, 
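# Note: feature_maps indexes the depthwise-separable blocks 1-based (forward
# checks `i + 1 in self.feature_maps`), so the default [4, 6, 13] returns
# conv3_2 (stride 8), conv4_2 (stride 16) and conv6 (stride 32).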
13], with_extra_blocks=False, extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]]): super(MobileNet, self).__init__() if isinstance(feature_maps, Integral): feature_maps = [feature_maps] self.feature_maps = feature_maps self.with_extra_blocks = with_extra_blocks self.extra_block_filters = extra_block_filters self._out_channels = [] self.conv1 = ConvBNLayer( in_channels=3, out_channels=int(32 * scale), kernel_size=3, stride=2, padding=1, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv1") self.dwsl = [] dws21 = self.add_sublayer( "conv2_1", sublayer=DepthwiseSeparable( in_channels=int(32 * scale), out_channels1=32, out_channels2=64, num_groups=32, stride=1, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv2_1")) self.dwsl.append(dws21) self._update_out_channels(int(64 * scale), len(self.dwsl), feature_maps) dws22 = self.add_sublayer( "conv2_2", sublayer=DepthwiseSeparable( in_channels=int(64 * scale), out_channels1=64, out_channels2=128, num_groups=64, stride=2, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv2_2")) self.dwsl.append(dws22) self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps) # 1/4 dws31 = self.add_sublayer( "conv3_1", sublayer=DepthwiseSeparable( in_channels=int(128 * scale), out_channels1=128, out_channels2=128, num_groups=128, stride=1, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv3_1")) self.dwsl.append(dws31) self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps) dws32 = self.add_sublayer( "conv3_2", sublayer=DepthwiseSeparable( in_channels=int(128 * scale), out_channels1=128, out_channels2=256, num_groups=128, stride=2, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv3_2")) self.dwsl.append(dws32) self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps) # 1/8 dws41 = self.add_sublayer( "conv4_1", sublayer=DepthwiseSeparable( in_channels=int(256 * scale), out_channels1=256, out_channels2=256, num_groups=256, stride=1, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv4_1")) self.dwsl.append(dws41) self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps) dws42 = self.add_sublayer( "conv4_2", sublayer=DepthwiseSeparable( in_channels=int(256 * scale), out_channels1=256, out_channels2=512, num_groups=256, stride=2, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv4_2")) self.dwsl.append(dws42) self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps) # 1/16 for i in range(5): tmp = self.add_sublayer( "conv5_" + str(i + 1), sublayer=DepthwiseSeparable( in_channels=int(512 * scale), out_channels1=512, out_channels2=512, num_groups=512, stride=1, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv5_" + str(i + 1))) self.dwsl.append(tmp) self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps) dws56 = self.add_sublayer( "conv5_6", sublayer=DepthwiseSeparable( in_channels=int(512 * scale), out_channels1=512, out_channels2=1024, num_groups=512, stride=2, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, 
norm_decay=norm_decay, norm_type=norm_type, name="conv5_6")) self.dwsl.append(dws56) self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps) # 1/32 dws6 = self.add_sublayer( "conv6", sublayer=DepthwiseSeparable( in_channels=int(1024 * scale), out_channels1=1024, out_channels2=1024, num_groups=1024, stride=1, scale=scale, conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv6")) self.dwsl.append(dws6) self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps) if self.with_extra_blocks: self.extra_blocks = [] for i, block_filter in enumerate(self.extra_block_filters): in_c = 1024 if i == 0 else self.extra_block_filters[i - 1][1] conv_extra = self.add_sublayer( "conv7_" + str(i + 1), sublayer=ExtraBlock( in_c, block_filter[0], block_filter[1], conv_lr=conv_learning_rate, conv_decay=conv_decay, norm_decay=norm_decay, norm_type=norm_type, name="conv7_" + str(i + 1))) self.extra_blocks.append(conv_extra) self._update_out_channels( block_filter[1], len(self.dwsl) + len(self.extra_blocks), feature_maps) def _update_out_channels(self, channel, feature_idx, feature_maps): if feature_idx in feature_maps: self._out_channels.append(channel) def forward(self, inputs): outs = [] y = self.conv1(inputs['image']) for i, block in enumerate(self.dwsl): y = block(y) if i + 1 in self.feature_maps: outs.append(y) if not self.with_extra_blocks: return outs y = outs[-1] for i, block in enumerate(self.extra_blocks): idx = i + len(self.dwsl) y = block(y) if idx + 1 in self.feature_maps: outs.append(y) return outs @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/backbones/mobilenet_v3.py ================================================ # copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register, serializable from numbers import Integral from ..shape_spec import ShapeSpec __all__ = ['MobileNetV3'] def make_divisible(v, divisor=8, min_value=None): if min_value is None: min_value = divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) if new_v < 0.9 * v: new_v += divisor return new_v class ConvBNLayer(nn.Layer): def __init__(self, in_c, out_c, filter_size, stride, padding, num_groups=1, act=None, lr_mult=1., conv_decay=0., norm_type='bn', norm_decay=0., freeze_norm=False, name=""): super(ConvBNLayer, self).__init__() self.act = act self.conv = nn.Conv2D( in_channels=in_c, out_channels=out_c, kernel_size=filter_size, stride=stride, padding=padding, groups=num_groups, weight_attr=ParamAttr( learning_rate=lr_mult, regularizer=L2Decay(conv_decay)), bias_attr=False) norm_lr = 0. 
if freeze_norm else lr_mult param_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay), trainable=False if freeze_norm else True) bias_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay), trainable=False if freeze_norm else True) global_stats = True if freeze_norm else None if norm_type in ['sync_bn', 'bn']: self.bn = nn.BatchNorm2D( out_c, weight_attr=param_attr, bias_attr=bias_attr, use_global_stats=global_stats) norm_params = self.bn.parameters() if freeze_norm: for param in norm_params: param.stop_gradient = True def forward(self, x): x = self.conv(x) x = self.bn(x) if self.act is not None: if self.act == "relu": x = F.relu(x) elif self.act == "relu6": x = F.relu6(x) elif self.act == "hard_swish": x = F.hardswish(x) else: raise NotImplementedError( "The activation function is selected incorrectly.") return x class ResidualUnit(nn.Layer): def __init__(self, in_c, mid_c, out_c, filter_size, stride, use_se, lr_mult, conv_decay=0., norm_type='bn', norm_decay=0., freeze_norm=False, act=None, return_list=False, name=''): super(ResidualUnit, self).__init__() self.if_shortcut = stride == 1 and in_c == out_c self.use_se = use_se self.return_list = return_list self.expand_conv = ConvBNLayer( in_c=in_c, out_c=mid_c, filter_size=1, stride=1, padding=0, act=act, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_expand") self.bottleneck_conv = ConvBNLayer( in_c=mid_c, out_c=mid_c, filter_size=filter_size, stride=stride, padding=int((filter_size - 1) // 2), num_groups=mid_c, act=act, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_depthwise") if self.use_se: self.mid_se = SEModule( mid_c, lr_mult, conv_decay, name=name + "_se") self.linear_conv = ConvBNLayer( in_c=mid_c, out_c=out_c, filter_size=1, stride=1, padding=0, act=None, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_linear") def forward(self, inputs): y = self.expand_conv(inputs) x = self.bottleneck_conv(y) if self.use_se: x = self.mid_se(x) x = self.linear_conv(x) if self.if_shortcut: x = paddle.add(inputs, x) if self.return_list: return [y, x] else: return x class SEModule(nn.Layer): def __init__(self, channel, lr_mult, conv_decay, reduction=4, name=""): super(SEModule, self).__init__() self.avg_pool = nn.AdaptiveAvgPool2D(1) mid_channels = int(channel // reduction) self.conv1 = nn.Conv2D( in_channels=channel, out_channels=mid_channels, kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr( learning_rate=lr_mult, regularizer=L2Decay(conv_decay)), bias_attr=ParamAttr( learning_rate=lr_mult, regularizer=L2Decay(conv_decay))) self.conv2 = nn.Conv2D( in_channels=mid_channels, out_channels=channel, kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr( learning_rate=lr_mult, regularizer=L2Decay(conv_decay)), bias_attr=ParamAttr( learning_rate=lr_mult, regularizer=L2Decay(conv_decay))) def forward(self, inputs): outputs = self.avg_pool(inputs) outputs = self.conv1(outputs) outputs = F.relu(outputs) outputs = self.conv2(outputs) outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5) return paddle.multiply(x=inputs, y=outputs) class ExtraBlockDW(nn.Layer): def __init__(self, in_c, ch_1, ch_2, stride, lr_mult, conv_decay=0., norm_type='bn', norm_decay=0., freeze_norm=False, name=None): super(ExtraBlockDW, self).__init__() self.pointwise_conv = ConvBNLayer( in_c=in_c, out_c=ch_1, 
filter_size=1, stride=1, padding='SAME', act='relu6', lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_extra1") self.depthwise_conv = ConvBNLayer( in_c=ch_1, out_c=ch_2, filter_size=3, stride=stride, padding='SAME', num_groups=int(ch_1), act='relu6', lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_extra2_dw") self.normal_conv = ConvBNLayer( in_c=ch_2, out_c=ch_2, filter_size=1, stride=1, padding='SAME', act='relu6', lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name=name + "_extra2_sep") def forward(self, inputs): x = self.pointwise_conv(inputs) x = self.depthwise_conv(x) x = self.normal_conv(x) return x @register @serializable class MobileNetV3(nn.Layer): __shared__ = ['norm_type'] def __init__( self, scale=1.0, model_name="large", feature_maps=[6, 12, 15], with_extra_blocks=False, extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]], lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], conv_decay=0.0, multiplier=1.0, norm_type='bn', norm_decay=0.0, freeze_norm=False): super(MobileNetV3, self).__init__() if isinstance(feature_maps, Integral): feature_maps = [feature_maps] if norm_type == 'sync_bn' and freeze_norm: raise ValueError( "The norm_type should not be sync_bn when freeze_norm is True") self.feature_maps = feature_maps self.with_extra_blocks = with_extra_blocks self.extra_block_filters = extra_block_filters inplanes = 16 if model_name == "large": self.cfg = [ # k, exp, c, se, nl, s, [3, 16, 16, False, "relu", 1], [3, 64, 24, False, "relu", 2], [3, 72, 24, False, "relu", 1], [5, 72, 40, True, "relu", 2], # RCNN output [5, 120, 40, True, "relu", 1], [5, 120, 40, True, "relu", 1], # YOLOv3 output [3, 240, 80, False, "hard_swish", 2], # RCNN output [3, 200, 80, False, "hard_swish", 1], [3, 184, 80, False, "hard_swish", 1], [3, 184, 80, False, "hard_swish", 1], [3, 480, 112, True, "hard_swish", 1], [3, 672, 112, True, "hard_swish", 1], # YOLOv3 output [5, 672, 160, True, "hard_swish", 2], # SSD/SSDLite/RCNN output [5, 960, 160, True, "hard_swish", 1], [5, 960, 160, True, "hard_swish", 1], # YOLOv3 output ] elif model_name == "small": self.cfg = [ # k, exp, c, se, nl, s, [3, 16, 16, True, "relu", 2], [3, 72, 24, False, "relu", 2], # RCNN output [3, 88, 24, False, "relu", 1], # YOLOv3 output [5, 96, 40, True, "hard_swish", 2], # RCNN output [5, 240, 40, True, "hard_swish", 1], [5, 240, 40, True, "hard_swish", 1], [5, 120, 48, True, "hard_swish", 1], [5, 144, 48, True, "hard_swish", 1], # YOLOv3 output [5, 288, 96, True, "hard_swish", 2], # SSD/SSDLite/RCNN output [5, 576, 96, True, "hard_swish", 1], [5, 576, 96, True, "hard_swish", 1], # YOLOv3 output ] else: raise NotImplementedError( "mode[{}_model] is not implemented!".format(model_name)) if multiplier != 1.0: self.cfg[-3][2] = int(self.cfg[-3][2] * multiplier) self.cfg[-2][1] = int(self.cfg[-2][1] * multiplier) self.cfg[-2][2] = int(self.cfg[-2][2] * multiplier) self.cfg[-1][1] = int(self.cfg[-1][1] * multiplier) self.cfg[-1][2] = int(self.cfg[-1][2] * multiplier) self.conv1 = ConvBNLayer( in_c=3, out_c=make_divisible(inplanes * scale), filter_size=3, stride=2, padding=1, num_groups=1, act="hard_swish", lr_mult=lr_mult_list[0], conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name="conv1") self._out_channels = [] self.block_list = [] i = 0 inplanes = make_divisible(inplanes * 
scale) for (k, exp, c, se, nl, s) in self.cfg: lr_idx = min(i // 3, len(lr_mult_list) - 1) lr_mult = lr_mult_list[lr_idx] # for SSD/SSDLite, first head input is after ResidualUnit expand_conv return_list = self.with_extra_blocks and i + 2 in self.feature_maps block = self.add_sublayer( "conv" + str(i + 2), sublayer=ResidualUnit( in_c=inplanes, mid_c=make_divisible(scale * exp), out_c=make_divisible(scale * c), filter_size=k, stride=s, use_se=se, act=nl, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, return_list=return_list, name="conv" + str(i + 2))) self.block_list.append(block) inplanes = make_divisible(scale * c) i += 1 self._update_out_channels( make_divisible(scale * exp) if return_list else inplanes, i + 1, feature_maps) if self.with_extra_blocks: self.extra_block_list = [] extra_out_c = make_divisible(scale * self.cfg[-1][1]) lr_idx = min(i // 3, len(lr_mult_list) - 1) lr_mult = lr_mult_list[lr_idx] conv_extra = self.add_sublayer( "conv" + str(i + 2), sublayer=ConvBNLayer( in_c=inplanes, out_c=extra_out_c, filter_size=1, stride=1, padding=0, num_groups=1, act="hard_swish", lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name="conv" + str(i + 2))) self.extra_block_list.append(conv_extra) i += 1 self._update_out_channels(extra_out_c, i + 1, feature_maps) for j, block_filter in enumerate(self.extra_block_filters): in_c = extra_out_c if j == 0 else self.extra_block_filters[j - 1][1] conv_extra = self.add_sublayer( "conv" + str(i + 2), sublayer=ExtraBlockDW( in_c, block_filter[0], block_filter[1], stride=2, lr_mult=lr_mult, conv_decay=conv_decay, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, name='conv' + str(i + 2))) self.extra_block_list.append(conv_extra) i += 1 self._update_out_channels(block_filter[1], i + 1, feature_maps) def _update_out_channels(self, channel, feature_idx, feature_maps): if feature_idx in feature_maps: self._out_channels.append(channel) def forward(self, inputs): x = self.conv1(inputs['image']) outs = [] for idx, block in enumerate(self.block_list): x = block(x) if idx + 2 in self.feature_maps: if isinstance(x, list): outs.append(x[0]) x = x[1] else: outs.append(x) if not self.with_extra_blocks: return outs for i, block in enumerate(self.extra_block_list): idx = i + len(self.block_list) x = block(x) if idx + 2 in self.feature_maps: outs.append(x) return outs @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/backbones/mobileone.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is the paddle implementation of MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf. 
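At training time each MobileOneBlock runs conv_num depthwise and pointwise branches in parallel, plus a 1x1 branch and, when ch_in == ch_out and stride == 1, identity BatchNorm branches. convert_to_deploy() folds every Conv+BN pair into one biased convolution, W' = W * gamma / sqrt(var + eps) and b' = beta - mu * gamma / sqrt(var + eps) (see _fuse_bn_tensor below), then sums the branches, so inference runs a single depthwise conv followed by a single pointwise conv. A minimal usage sketch, assuming `x` is an NCHW paddle tensor with 32 channels (illustrative only):
    block = MobileOneBlock(ch_in=32, ch_out=64, stride=1, kernel_size=3, conv_num=4)
    block.eval()
    y_train = block(x)
    block.convert_to_deploy()
    y_deploy = block(x)  # matches y_train up to numerical precision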
Some codes are based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py The copyright of DingXiaoH/RepVGG is as follows: MIT License [see LICENSE for details] """ import paddle import paddle.nn as nn from paddle import ParamAttr from paddle.regularizer import L2Decay from paddle.nn.initializer import Normal, Constant from ppdet.modeling.ops import get_act_fn from ppdet.modeling.layers import ConvNormLayer class MobileOneBlock(nn.Layer): def __init__( self, ch_in, ch_out, stride, kernel_size, conv_num=1, norm_type='bn', norm_decay=0., norm_groups=32, bias_on=False, lr_scale=1., freeze_norm=False, initializer=Normal( mean=0., std=0.01), skip_quant=False, act='relu', ): super(MobileOneBlock, self).__init__() self.ch_in = ch_in self.ch_out = ch_out self.kernel_size = kernel_size self.stride = stride self.padding = (kernel_size - 1) // 2 self.k = conv_num self.depth_conv = nn.LayerList() self.point_conv = nn.LayerList() for _ in range(self.k): self.depth_conv.append( ConvNormLayer( ch_in, ch_in, kernel_size, stride=stride, groups=ch_in, norm_type=norm_type, norm_decay=norm_decay, norm_groups=norm_groups, bias_on=bias_on, lr_scale=lr_scale, freeze_norm=freeze_norm, initializer=initializer, skip_quant=skip_quant)) self.point_conv.append( ConvNormLayer( ch_in, ch_out, 1, stride=1, groups=1, norm_type=norm_type, norm_decay=norm_decay, norm_groups=norm_groups, bias_on=bias_on, lr_scale=lr_scale, freeze_norm=freeze_norm, initializer=initializer, skip_quant=skip_quant)) self.rbr_1x1 = ConvNormLayer( ch_in, ch_in, 1, stride=self.stride, groups=ch_in, norm_type=norm_type, norm_decay=norm_decay, norm_groups=norm_groups, bias_on=bias_on, lr_scale=lr_scale, freeze_norm=freeze_norm, initializer=initializer, skip_quant=skip_quant) self.rbr_identity_st1 = nn.BatchNorm2D( num_features=ch_in, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay( 0.0))) if ch_in == ch_out and self.stride == 1 else None self.rbr_identity_st2 = nn.BatchNorm2D( num_features=ch_out, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay( 0.0))) if ch_in == ch_out and self.stride == 1 else None self.act = get_act_fn(act) if act is None or isinstance(act, ( str, dict)) else act def forward(self, x): if hasattr(self, "conv1") and hasattr(self, "conv2"): y = self.act(self.conv2(self.act(self.conv1(x)))) else: if self.rbr_identity_st1 is None: id_out_st1 = 0 else: id_out_st1 = self.rbr_identity_st1(x) x1_1 = 0 for i in range(self.k): x1_1 += self.depth_conv[i](x) x1_2 = self.rbr_1x1(x) x1 = self.act(x1_1 + x1_2 + id_out_st1) if self.rbr_identity_st2 is None: id_out_st2 = 0 else: id_out_st2 = self.rbr_identity_st2(x1) x2_1 = 0 for i in range(self.k): x2_1 += self.point_conv[i](x1) y = self.act(x2_1 + id_out_st2) return y def convert_to_deploy(self): if not hasattr(self, 'conv1'): self.conv1 = nn.Conv2D( in_channels=self.ch_in, out_channels=self.ch_in, kernel_size=self.kernel_size, stride=self.stride, padding=self.padding, groups=self.ch_in, bias_attr=ParamAttr( initializer=Constant(value=0.), learning_rate=1.)) if not hasattr(self, 'conv2'): self.conv2 = nn.Conv2D( in_channels=self.ch_in, out_channels=self.ch_out, kernel_size=1, stride=1, padding='SAME', groups=1, bias_attr=ParamAttr( initializer=Constant(value=0.), learning_rate=1.)) conv1_kernel, conv1_bias, conv2_kernel, conv2_bias = self.get_equivalent_kernel_bias( ) self.conv1.weight.set_value(conv1_kernel) self.conv1.bias.set_value(conv1_bias) self.conv2.weight.set_value(conv2_kernel)
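# With the fused kernels and biases copied into conv1/conv2, the training-time
# branches (depth_conv, point_conv, rbr_1x1 and the identity BNs) are deleted
# below; forward() then takes the plain conv1 -> act -> conv2 -> act path.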
self.conv2.bias.set_value(conv2_bias) self.__delattr__('depth_conv') self.__delattr__('point_conv') self.__delattr__('rbr_1x1') if hasattr(self, 'rbr_identity_st1'): self.__delattr__('rbr_identity_st1') if hasattr(self, 'rbr_identity_st2'): self.__delattr__('rbr_identity_st2') def get_equivalent_kernel_bias(self): st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv) st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) st1_kernelid, st1_biasid = self._fuse_bn_tensor( self.rbr_identity_st1, kernel_size=self.kernel_size) st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv) st2_kernelid, st2_biasid = self._fuse_bn_tensor( self.rbr_identity_st2, kernel_size=1) conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor( st1_kernel1x1) + st1_kernelid conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid conv2_kernel = st2_kernel1x1 + st2_kernelid conv2_bias = st2_bias1x1 + st2_biasid return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias def _pad_1x1_to_3x3_tensor(self, kernel1x1): if kernel1x1 is None: return 0 else: padding_size = (self.kernel_size - 1) // 2 return nn.functional.pad( kernel1x1, [padding_size, padding_size, padding_size, padding_size]) def _fuse_bn_tensor(self, branch, kernel_size=3): if branch is None: return 0, 0 if isinstance(branch, nn.LayerList): fused_kernels = [] fused_bias = [] for block in branch: kernel = block.conv.weight running_mean = block.norm._mean running_var = block.norm._variance gamma = block.norm.weight beta = block.norm.bias eps = block.norm._epsilon std = (running_var + eps).sqrt() t = (gamma / std).reshape((-1, 1, 1, 1)) fused_kernels.append(kernel * t) fused_bias.append(beta - running_mean * gamma / std) return sum(fused_kernels), sum(fused_bias) elif isinstance(branch, ConvNormLayer): kernel = branch.conv.weight running_mean = branch.norm._mean running_var = branch.norm._variance gamma = branch.norm.weight beta = branch.norm.bias eps = branch.norm._epsilon else: assert isinstance(branch, nn.BatchNorm2D) input_dim = self.ch_in if kernel_size == 1 else 1 kernel_value = paddle.zeros( shape=[self.ch_in, input_dim, kernel_size, kernel_size], dtype='float32') if kernel_size > 1: for i in range(self.ch_in): kernel_value[i, i % input_dim, (kernel_size - 1) // 2, ( kernel_size - 1) // 2] = 1 elif kernel_size == 1: for i in range(self.ch_in): kernel_value[i, i % input_dim, 0, 0] = 1 else: raise ValueError("Invalid kernel size received!") kernel = paddle.to_tensor(kernel_value, place=branch.weight.place) running_mean = branch._mean running_var = branch._variance gamma = branch.weight beta = branch.bias eps = branch._epsilon std = (running_var + eps).sqrt() t = (gamma / std).reshape((-1, 1, 1, 1)) return kernel * t, beta - running_mean * gamma / std ================================================ FILE: ppdet/modeling/backbones/name_adapter.py ================================================ class NameAdapter(object): """Fix the backbone's variable names for pretrained weights""" def __init__(self, model): super(NameAdapter, self).__init__() self.model = model @property def model_type(self): return getattr(self.model, '_model_type', '') @property def variant(self): return getattr(self.model, 'variant', '') def fix_conv_norm_name(self, name): if name == "conv1": bn_name = "bn_" + name else: bn_name = "bn" + name[3:] # the naming rule is the same as the pretrained weight if self.model_type == 'SEResNeXt': bn_name = name + "_bn" return bn_name def fix_shortcut_name(self, name): if self.model_type == 'SEResNeXt': name = 'conv' + name +
'_prj' return name def fix_bottleneck_name(self, name): if self.model_type == 'SEResNeXt': conv_name1 = 'conv' + name + '_x1' conv_name2 = 'conv' + name + '_x2' conv_name3 = 'conv' + name + '_x3' shortcut_name = name else: conv_name1 = name + "_branch2a" conv_name2 = name + "_branch2b" conv_name3 = name + "_branch2c" shortcut_name = name + "_branch1" return conv_name1, conv_name2, conv_name3, shortcut_name def fix_basicblock_name(self, name): if self.model_type == 'SEResNeXt': conv_name1 = 'conv' + name + '_x1' conv_name2 = 'conv' + name + '_x2' shortcut_name = name else: conv_name1 = name + "_branch2a" conv_name2 = name + "_branch2b" shortcut_name = name + "_branch1" return conv_name1, conv_name2, shortcut_name def fix_layer_warp_name(self, stage_num, count, i): name = 'res' + str(stage_num) if count > 10 and stage_num == 4: if i == 0: conv_name = name + "a" else: conv_name = name + "b" + str(i) else: conv_name = name + chr(ord("a") + i) if self.model_type == 'SEResNeXt': conv_name = str(stage_num + 2) + '_' + str(i + 1) return conv_name def fix_c1_stage_name(self): return "res_conv1" if self.model_type == 'ResNeXt' else "conv1" ================================================ FILE: ppdet/modeling/backbones/res2net.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
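# Res2Net swaps the single 3x3 conv of a ResNet bottleneck for a hierarchical
# group of 3x3 convs: the branch2a output is split into `scales` channel
# groups; group 0 is convolved directly, each middle group is added to the
# previous group's output before its own conv, and the last group is passed
# through unchanged (stride 1) or average-pooled (stride 2). A rough sketch of
# the pattern in BottleNeck.forward below, assuming scales=4 and stride=1
# (illustrative only):
#
#     xs = paddle.split(out, 4, axis=1)
#     ys = [branch2b[0](xs[0])]
#     ys.append(branch2b[1](paddle.add(xs[1], ys[-1])))
#     ys.append(branch2b[2](paddle.add(xs[2], ys[-1])))
#     ys.append(xs[3])
#     out = branch2c(paddle.concat(ys, axis=1))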
from numbers import Integral import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec from .resnet import ConvNormLayer __all__ = ['Res2Net', 'Res2NetC5'] Res2Net_cfg = { 50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3], 200: [3, 12, 48, 3] } class BottleNeck(nn.Layer): def __init__(self, ch_in, ch_out, stride, shortcut, width, scales=4, variant='b', groups=1, lr=1.0, norm_type='bn', norm_decay=0., freeze_norm=True, dcn_v2=False): super(BottleNeck, self).__init__() self.shortcut = shortcut self.scales = scales self.stride = stride if not shortcut: if variant == 'd' and stride == 2: self.branch1 = nn.Sequential() self.branch1.add_sublayer( 'pool', nn.AvgPool2D( kernel_size=2, stride=2, padding=0, ceil_mode=True)) self.branch1.add_sublayer( 'conv', ConvNormLayer( ch_in=ch_in, ch_out=ch_out, filter_size=1, stride=1, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr)) else: self.branch1 = ConvNormLayer( ch_in=ch_in, ch_out=ch_out, filter_size=1, stride=stride, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr) self.branch2a = ConvNormLayer( ch_in=ch_in, ch_out=width * scales, filter_size=1, stride=stride if variant == 'a' else 1, groups=1, act='relu', norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr) self.branch2b = nn.LayerList([ ConvNormLayer( ch_in=width, ch_out=width, filter_size=3, stride=1 if variant == 'a' else stride, groups=groups, act='relu', norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr, dcn_v2=dcn_v2) for _ in range(self.scales - 1) ]) self.branch2c = ConvNormLayer( ch_in=width * scales, ch_out=ch_out, filter_size=1, stride=1, groups=1, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr) def forward(self, inputs): out = self.branch2a(inputs) feature_split = paddle.split(out, self.scales, 1) out_split = [] for i in range(self.scales - 1): if i == 0 or self.stride == 2: out_split.append(self.branch2b[i](feature_split[i])) else: out_split.append(self.branch2b[i](paddle.add(feature_split[i], out_split[-1]))) if self.stride == 1: out_split.append(feature_split[-1]) else: out_split.append(F.avg_pool2d(feature_split[-1], 3, self.stride, 1)) out = self.branch2c(paddle.concat(out_split, 1)) if self.shortcut: short = inputs else: short = self.branch1(inputs) out = paddle.add(out, short) out = F.relu(out) return out class Blocks(nn.Layer): def __init__(self, ch_in, ch_out, count, stage_num, width, scales=4, variant='b', groups=1, lr=1.0, norm_type='bn', norm_decay=0., freeze_norm=True, dcn_v2=False): super(Blocks, self).__init__() self.blocks = nn.Sequential() for i in range(count): self.blocks.add_sublayer( str(i), BottleNeck( ch_in=ch_in if i == 0 else ch_out, ch_out=ch_out, stride=2 if i == 0 and stage_num != 2 else 1, shortcut=False if i == 0 else True, width=width * (2**(stage_num - 2)), scales=scales, variant=variant, groups=groups, lr=lr, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, dcn_v2=dcn_v2)) def forward(self, inputs): return self.blocks(inputs) @register @serializable class Res2Net(nn.Layer): """ Res2Net, see https://arxiv.org/abs/1904.01169 Args: depth (int): Res2Net depth, should be 50, 101, 152, 200. 
width (int): Res2Net width scales (int): Res2Net scale variant (str): Res2Net variant, supports 'a', 'b', 'c', 'd' currently lr_mult_list (list): learning rate ratio of different resnet stages (2,3,4,5); a lower learning rate ratio is needed for pretrained models obtained via distillation (default [1.0, 1.0, 1.0, 1.0]). groups (int): The group number of the Conv Layer. norm_type (str): normalization type, 'bn' or 'sync_bn' norm_decay (float): weight decay for normalization layer weights freeze_norm (bool): freeze normalization layers freeze_at (int): freeze the backbone at which stage return_idx (list): index of stages whose feature maps are returned, index 0 stands for res2 dcn_v2_stages (list): index of stages that use deformable conv v2 num_stages (int): number of stages created """ __shared__ = ['norm_type'] def __init__(self, depth=50, width=26, scales=4, variant='b', lr_mult_list=[1.0, 1.0, 1.0, 1.0], groups=1, norm_type='bn', norm_decay=0., freeze_norm=True, freeze_at=0, return_idx=[0, 1, 2, 3], dcn_v2_stages=[-1], num_stages=4): super(Res2Net, self).__init__() self._model_type = 'Res2Net' if groups == 1 else 'Res2NeXt' assert depth in [50, 101, 152, 200], \ "depth {} not in [50, 101, 152, 200]".format(depth) assert variant in ['a', 'b', 'c', 'd'], "invalid Res2Net variant" assert num_stages >= 1 and num_stages <= 4 self.depth = depth self.variant = variant self.norm_type = norm_type self.norm_decay = norm_decay self.freeze_norm = freeze_norm self.freeze_at = freeze_at if isinstance(return_idx, Integral): return_idx = [return_idx] assert max(return_idx) < num_stages, \ 'the maximum return index must be smaller than num_stages, ' \ 'but received maximum return index is {} and num_stages ' \ 'is {}'.format(max(return_idx), num_stages) self.return_idx = return_idx self.num_stages = num_stages assert len(lr_mult_list) == 4, \ "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list)) if isinstance(dcn_v2_stages, Integral): dcn_v2_stages = [dcn_v2_stages] assert max(dcn_v2_stages) < num_stages self.dcn_v2_stages = dcn_v2_stages block_nums = Res2Net_cfg[depth] # C1 stage if self.variant in ['c', 'd']: conv_def = [ [3, 32, 3, 2, "conv1_1"], [32, 32, 3, 1, "conv1_2"], [32, 64, 3, 1, "conv1_3"], ] else: conv_def = [[3, 64, 7, 2, "conv1"]] self.res1 = nn.Sequential() for (c_in, c_out, k, s, _name) in conv_def: self.res1.add_sublayer( _name, ConvNormLayer( ch_in=c_in, ch_out=c_out, filter_size=k, stride=s, groups=1, act='relu', norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=1.0)) self._in_channels = [64, 256, 512, 1024] self._out_channels = [256, 512, 1024, 2048] self._out_strides = [4, 8, 16, 32] # C2-C5 stages self.res_layers = [] for i in range(num_stages): lr_mult = lr_mult_list[i] stage_num = i + 2 self.res_layers.append( self.add_sublayer( "res{}".format(stage_num), Blocks( self._in_channels[i], self._out_channels[i], count=block_nums[i], stage_num=stage_num, width=width, scales=scales, groups=groups, lr=lr_mult, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, dcn_v2=(i in self.dcn_v2_stages)))) @property def out_shape(self): return [ ShapeSpec( channels=self._out_channels[i], stride=self._out_strides[i]) for i in self.return_idx ] def forward(self, inputs): x = inputs['image'] res1 = self.res1(x) x = F.max_pool2d(res1, kernel_size=3, stride=2, padding=1) outs = [] for idx, stage in enumerate(self.res_layers): x = stage(x) if idx == self.freeze_at: x.stop_gradient = True if idx in self.return_idx: outs.append(x) return outs @register class
Res2NetC5(nn.Layer): def __init__(self, depth=50, width=26, scales=4, variant='b'): super(Res2NetC5, self).__init__() feat_in, feat_out = [1024, 2048] self.res5 = Blocks( feat_in, feat_out, count=3, stage_num=5, width=width, scales=scales, variant=variant) self.feat_out = feat_out @property def out_shape(self): return [ShapeSpec( channels=self.feat_out, stride=32, )] def forward(self, roi_feat, stage=0): y = self.res5(roi_feat) return y ================================================ FILE: ppdet/modeling/backbones/resnet.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import math from numbers import Integral import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from paddle.regularizer import L2Decay from paddle.nn.initializer import Uniform from paddle import ParamAttr from paddle.nn.initializer import Constant from paddle.vision.ops import DeformConv2D from .name_adapter import NameAdapter from ..shape_spec import ShapeSpec __all__ = ['ResNet', 'Res5Head', 'Blocks', 'BasicBlock', 'BottleNeck'] ResNet_cfg = { 18: [2, 2, 2, 2], 34: [3, 4, 6, 3], 50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3], } class ConvNormLayer(nn.Layer): def __init__(self, ch_in, ch_out, filter_size, stride, groups=1, act=None, norm_type='bn', norm_decay=0., freeze_norm=True, lr=1.0, dcn_v2=False): super(ConvNormLayer, self).__init__() assert norm_type in ['bn', 'sync_bn'] self.norm_type = norm_type self.act = act self.dcn_v2 = dcn_v2 if not self.dcn_v2: self.conv = nn.Conv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=groups, weight_attr=ParamAttr(learning_rate=lr), bias_attr=False) else: self.offset_channel = 2 * filter_size**2 self.mask_channel = filter_size**2 self.conv_offset = nn.Conv2D( in_channels=ch_in, out_channels=3 * filter_size**2, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, weight_attr=ParamAttr(initializer=Constant(0.)), bias_attr=ParamAttr(initializer=Constant(0.))) self.conv = DeformConv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, dilation=1, groups=groups, weight_attr=ParamAttr(learning_rate=lr), bias_attr=False) norm_lr = 0. 
if freeze_norm else lr param_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay), trainable=False if freeze_norm else True) bias_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay), trainable=False if freeze_norm else True) global_stats = True if freeze_norm else None if norm_type in ['sync_bn', 'bn']: self.norm = nn.BatchNorm2D( ch_out, weight_attr=param_attr, bias_attr=bias_attr, use_global_stats=global_stats) norm_params = self.norm.parameters() if freeze_norm: for param in norm_params: param.stop_gradient = True def forward(self, inputs): if not self.dcn_v2: out = self.conv(inputs) else: offset_mask = self.conv_offset(inputs) offset, mask = paddle.split( offset_mask, num_or_sections=[self.offset_channel, self.mask_channel], axis=1) mask = F.sigmoid(mask) out = self.conv(inputs, offset, mask=mask) if self.norm_type in ['bn', 'sync_bn']: out = self.norm(out) if self.act: out = getattr(F, self.act)(out) return out class SELayer(nn.Layer): def __init__(self, ch, reduction_ratio=16): super(SELayer, self).__init__() self.pool = nn.AdaptiveAvgPool2D(1) stdv = 1.0 / math.sqrt(ch) c_ = ch // reduction_ratio self.squeeze = nn.Linear( ch, c_, weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)), bias_attr=True) stdv = 1.0 / math.sqrt(c_) self.extract = nn.Linear( c_, ch, weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)), bias_attr=True) def forward(self, inputs): out = self.pool(inputs) out = paddle.squeeze(out, axis=[2, 3]) out = self.squeeze(out) out = F.relu(out) out = self.extract(out) out = F.sigmoid(out) out = paddle.unsqueeze(out, axis=[2, 3]) scale = out * inputs return scale class BasicBlock(nn.Layer): expansion = 1 def __init__(self, ch_in, ch_out, stride, shortcut, variant='b', groups=1, base_width=64, lr=1.0, norm_type='bn', norm_decay=0., freeze_norm=True, dcn_v2=False, std_senet=False): super(BasicBlock, self).__init__() assert groups == 1 and base_width == 64, 'BasicBlock only supports groups=1 and base_width=64' self.shortcut = shortcut if not shortcut: if variant == 'd' and stride == 2: self.short = nn.Sequential() self.short.add_sublayer( 'pool', nn.AvgPool2D( kernel_size=2, stride=2, padding=0, ceil_mode=True)) self.short.add_sublayer( 'conv', ConvNormLayer( ch_in=ch_in, ch_out=ch_out, filter_size=1, stride=1, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr)) else: self.short = ConvNormLayer( ch_in=ch_in, ch_out=ch_out, filter_size=1, stride=stride, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr) self.branch2a = ConvNormLayer( ch_in=ch_in, ch_out=ch_out, filter_size=3, stride=stride, act='relu', norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr) self.branch2b = ConvNormLayer( ch_in=ch_out, ch_out=ch_out, filter_size=3, stride=1, act=None, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr, dcn_v2=dcn_v2) self.std_senet = std_senet if self.std_senet: self.se = SELayer(ch_out) def forward(self, inputs): out = self.branch2a(inputs) out = self.branch2b(out) if self.std_senet: out = self.se(out) if self.shortcut: short = inputs else: short = self.short(inputs) out = paddle.add(x=out, y=short) out = F.relu(out) return out class BottleNeck(nn.Layer): expansion = 4 def __init__(self, ch_in, ch_out, stride, shortcut, variant='b', groups=1, base_width=4, lr=1.0, norm_type='bn', norm_decay=0., freeze_norm=True, dcn_v2=False, std_senet=False): super(BottleNeck, self).__init__() if variant == 'a': stride1, stride2 = stride, 
1 else: stride1, stride2 = 1, stride # ResNeXt width = int(ch_out * (base_width / 64.)) * groups self.branch2a = ConvNormLayer( ch_in=ch_in, ch_out=width, filter_size=1, stride=stride1, groups=1, act='relu', norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr) self.branch2b = ConvNormLayer( ch_in=width, ch_out=width, filter_size=3, stride=stride2, groups=groups, act='relu', norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr, dcn_v2=dcn_v2) self.branch2c = ConvNormLayer( ch_in=width, ch_out=ch_out * self.expansion, filter_size=1, stride=1, groups=1, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr) self.shortcut = shortcut if not shortcut: if variant == 'd' and stride == 2: self.short = nn.Sequential() self.short.add_sublayer( 'pool', nn.AvgPool2D( kernel_size=2, stride=2, padding=0, ceil_mode=True)) self.short.add_sublayer( 'conv', ConvNormLayer( ch_in=ch_in, ch_out=ch_out * self.expansion, filter_size=1, stride=1, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr)) else: self.short = ConvNormLayer( ch_in=ch_in, ch_out=ch_out * self.expansion, filter_size=1, stride=stride, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=lr) self.std_senet = std_senet if self.std_senet: self.se = SELayer(ch_out * self.expansion) def forward(self, inputs): out = self.branch2a(inputs) out = self.branch2b(out) out = self.branch2c(out) if self.std_senet: out = self.se(out) if self.shortcut: short = inputs else: short = self.short(inputs) out = paddle.add(x=out, y=short) out = F.relu(out) return out class Blocks(nn.Layer): def __init__(self, block, ch_in, ch_out, count, name_adapter, stage_num, variant='b', groups=1, base_width=64, lr=1.0, norm_type='bn', norm_decay=0., freeze_norm=True, dcn_v2=False, std_senet=False): super(Blocks, self).__init__() self.blocks = [] for i in range(count): conv_name = name_adapter.fix_layer_warp_name(stage_num, count, i) layer = self.add_sublayer( conv_name, block( ch_in=ch_in, ch_out=ch_out, stride=2 if i == 0 and stage_num != 2 else 1, shortcut=False if i == 0 else True, variant=variant, groups=groups, base_width=base_width, lr=lr, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, dcn_v2=dcn_v2, std_senet=std_senet)) self.blocks.append(layer) if i == 0: ch_in = ch_out * block.expansion def forward(self, inputs): block_out = inputs for block in self.blocks: block_out = block(block_out) return block_out @register @serializable class ResNet(nn.Layer): __shared__ = ['norm_type'] def __init__(self, depth=50, ch_in=64, variant='b', lr_mult_list=[1.0, 1.0, 1.0, 1.0], groups=1, base_width=64, norm_type='bn', norm_decay=0, freeze_norm=True, freeze_at=0, return_idx=[0, 1, 2, 3], dcn_v2_stages=[-1], num_stages=4, std_senet=False, freeze_stem_only=False): """ Residual Network, see https://arxiv.org/abs/1512.03385 Args: depth (int): ResNet depth, should be 18, 34, 50, 101, 152. ch_in (int): output channel of first stage, default 64 variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5), lower learning rate ratio is need for pretrained model got using distillation(default as [1.0, 1.0, 1.0, 1.0]). 
groups (int): group convolution cardinality base_width (int): base width of each group convolution norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel' norm_decay (float): weight decay for normalization layer weights freeze_norm (bool): freeze normalization layers freeze_at (int): freeze the backbone at which stage return_idx (list): index of the stages whose feature maps are returned dcn_v2_stages (list): index of stages that use deformable conv v2 num_stages (int): total num of stages std_senet (bool): whether to use an SE block in each residual block, default False """ super(ResNet, self).__init__() self._model_type = 'ResNet' if groups == 1 else 'ResNeXt' assert num_stages >= 1 and num_stages <= 4 self.depth = depth self.variant = variant self.groups = groups self.base_width = base_width self.norm_type = norm_type self.norm_decay = norm_decay self.freeze_norm = freeze_norm self.freeze_at = freeze_at if isinstance(return_idx, Integral): return_idx = [return_idx] assert max(return_idx) < num_stages, \ 'the maximum return index must be smaller than num_stages, ' \ 'but received maximum return index is {} and num_stages ' \ 'is {}'.format(max(return_idx), num_stages) self.return_idx = return_idx self.num_stages = num_stages assert len(lr_mult_list) == 4, \ "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list)) if isinstance(dcn_v2_stages, Integral): dcn_v2_stages = [dcn_v2_stages] assert max(dcn_v2_stages) < num_stages self.dcn_v2_stages = dcn_v2_stages block_nums = ResNet_cfg[depth] na = NameAdapter(self) conv1_name = na.fix_c1_stage_name() if variant in ['c', 'd']: conv_def = [ [3, ch_in // 2, 3, 2, "conv1_1"], [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"], [ch_in // 2, ch_in, 3, 1, "conv1_3"], ] else: conv_def = [[3, ch_in, 7, 2, conv1_name]] self.conv1 = nn.Sequential() for (c_in, c_out, k, s, _name) in conv_def: self.conv1.add_sublayer( _name, ConvNormLayer( ch_in=c_in, ch_out=c_out, filter_size=k, stride=s, groups=1, act='relu', norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, lr=1.0)) self.ch_in = ch_in ch_out_list = [64, 128, 256, 512] block = BottleNeck if depth >= 50 else BasicBlock self._out_channels = [block.expansion * v for v in ch_out_list] self._out_strides = [4, 8, 16, 32] self.res_layers = [] for i in range(num_stages): lr_mult = lr_mult_list[i] stage_num = i + 2 res_name = "res{}".format(stage_num) res_layer = self.add_sublayer( res_name, Blocks( block, self.ch_in, ch_out_list[i], count=block_nums[i], name_adapter=na, stage_num=stage_num, variant=variant, groups=groups, base_width=base_width, lr=lr_mult, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, dcn_v2=(i in self.dcn_v2_stages), std_senet=std_senet)) self.res_layers.append(res_layer) self.ch_in = self._out_channels[i] if freeze_at >= 0: self._freeze_parameters(self.conv1) if not freeze_stem_only: for i in range(min(freeze_at + 1, num_stages)): self._freeze_parameters(self.res_layers[i]) def _freeze_parameters(self, m): for p in m.parameters(): p.stop_gradient = True @property def out_shape(self): return [ ShapeSpec( channels=self._out_channels[i], stride=self._out_strides[i]) for i in self.return_idx ] def forward(self, inputs): x = inputs['image'] conv1 = self.conv1(x) x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1) outs = [] for idx, stage in enumerate(self.res_layers): x = stage(x) if idx in self.return_idx: outs.append(x) return outs @register class
Res5Head(nn.Layer): def __init__(self, depth=50): super(Res5Head, self).__init__() feat_in, feat_out = [1024, 512] if depth < 50: feat_in = 256 na = NameAdapter(self) block = BottleNeck if depth >= 50 else BasicBlock self.res5 = Blocks( block, feat_in, feat_out, count=3, name_adapter=na, stage_num=5) self.feat_out = feat_out if depth < 50 else feat_out * 4 @property def out_shape(self): return [ShapeSpec( channels=self.feat_out, stride=16, )] def forward(self, roi_feat, stage=0): y = self.res5(roi_feat) return y ================================================ FILE: ppdet/modeling/backbones/senet.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle.nn as nn from ppdet.core.workspace import register, serializable from .resnet import ResNet, Blocks, BasicBlock, BottleNeck from ..shape_spec import ShapeSpec from .name_adapter import NameAdapter __all__ = ['SENet', 'SERes5Head'] @register @serializable class SENet(ResNet): __shared__ = ['norm_type'] def __init__(self, depth=50, variant='b', lr_mult_list=[1.0, 1.0, 1.0, 1.0], groups=1, base_width=64, norm_type='bn', norm_decay=0, freeze_norm=True, freeze_at=0, return_idx=[0, 1, 2, 3], dcn_v2_stages=[-1], std_senet=True, num_stages=4): """ Squeeze-and-Excitation Networks, see https://arxiv.org/abs/1709.01507 Args: depth (int): SENet depth, should be 50, 101, 152 variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently lr_mult_list (list): learning rate ratio of different resnet stages (2,3,4,5); a lower learning rate ratio is needed for pretrained models obtained via distillation (default [1.0, 1.0, 1.0, 1.0]). groups (int): group convolution cardinality base_width (int): base width of each group convolution norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel' norm_decay (float): weight decay for normalization layer weights freeze_norm (bool): freeze normalization layers freeze_at (int): freeze the backbone at which stage return_idx (list): index of the stages whose feature maps are returned dcn_v2_stages (list): index of stages that use deformable conv v2 std_senet (bool): whether to use an SE block in each residual block, default True num_stages (int): total num of stages """ super(SENet, self).__init__( depth=depth, variant=variant, lr_mult_list=lr_mult_list, ch_in=128, groups=groups, base_width=base_width, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, freeze_at=freeze_at, return_idx=return_idx, dcn_v2_stages=dcn_v2_stages, std_senet=std_senet, num_stages=num_stages) @register class SERes5Head(nn.Layer): def __init__(self, depth=50, variant='b', lr_mult=1.0, groups=1, base_width=64, norm_type='bn', norm_decay=0, dcn_v2=False, freeze_norm=False, std_senet=True): """ SERes5Head layer Args: depth (int): SENet depth, should be 50, 101, 152 variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently lr_mult (float): learning rate ratio of SERes5Head, default 1.0.
groups (int): group convolution cardinality base_width (int): base width of each group convolution norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel' norm_decay (float): weight decay for normalization layer weights dcn_v2_stages (list): index of stages who select deformable conv v2 std_senet (bool): whether use senet, default True """ super(SERes5Head, self).__init__() ch_out = 512 ch_in = 256 if depth < 50 else 1024 na = NameAdapter(self) block = BottleNeck if depth >= 50 else BasicBlock self.res5 = Blocks( block, ch_in, ch_out, count=3, name_adapter=na, stage_num=5, variant=variant, groups=groups, base_width=base_width, lr=lr_mult, norm_type=norm_type, norm_decay=norm_decay, freeze_norm=freeze_norm, dcn_v2=dcn_v2, std_senet=std_senet) self.ch_out = ch_out * block.expansion @property def out_shape(self): return [ShapeSpec( channels=self.ch_out, stride=16, )] def forward(self, roi_feat): y = self.res5(roi_feat) return y ================================================ FILE: ppdet/modeling/backbones/shufflenet_v2.py ================================================ # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn from paddle import ParamAttr import paddle.nn.functional as F from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm2D from paddle.nn.initializer import KaimingNormal from paddle.regularizer import L2Decay from ppdet.core.workspace import register, serializable from numbers import Integral from ..shape_spec import ShapeSpec from ppdet.modeling.ops import channel_shuffle __all__ = ['ShuffleNetV2'] class ConvBNLayer(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size, stride, padding, groups=1, act=None): super(ConvBNLayer, self).__init__() self._conv = Conv2D( in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, weight_attr=ParamAttr(initializer=KaimingNormal()), bias_attr=False) self._batch_norm = BatchNorm2D( out_channels, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) if act == "hard_swish": act = 'hardswish' self.act = act def forward(self, inputs): y = self._conv(inputs) y = self._batch_norm(y) if self.act: y = getattr(F, self.act)(y) return y class InvertedResidual(nn.Layer): def __init__(self, in_channels, out_channels, stride, act="relu"): super(InvertedResidual, self).__init__() self._conv_pw = ConvBNLayer( in_channels=in_channels // 2, out_channels=out_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) self._conv_dw = ConvBNLayer( in_channels=out_channels // 2, out_channels=out_channels // 2, kernel_size=3, stride=stride, padding=1, groups=out_channels // 2, act=None) self._conv_linear = ConvBNLayer( in_channels=out_channels // 2, out_channels=out_channels // 2, kernel_size=1, stride=1, 
padding=0, groups=1, act=act) def forward(self, inputs): x1, x2 = paddle.split( inputs, num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2], axis=1) x2 = self._conv_pw(x2) x2 = self._conv_dw(x2) x2 = self._conv_linear(x2) out = paddle.concat([x1, x2], axis=1) return channel_shuffle(out, 2) class InvertedResidualDS(nn.Layer): def __init__(self, in_channels, out_channels, stride, act="relu"): super(InvertedResidualDS, self).__init__() # branch1 self._conv_dw_1 = ConvBNLayer( in_channels=in_channels, out_channels=in_channels, kernel_size=3, stride=stride, padding=1, groups=in_channels, act=None) self._conv_linear_1 = ConvBNLayer( in_channels=in_channels, out_channels=out_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) # branch2 self._conv_pw_2 = ConvBNLayer( in_channels=in_channels, out_channels=out_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) self._conv_dw_2 = ConvBNLayer( in_channels=out_channels // 2, out_channels=out_channels // 2, kernel_size=3, stride=stride, padding=1, groups=out_channels // 2, act=None) self._conv_linear_2 = ConvBNLayer( in_channels=out_channels // 2, out_channels=out_channels // 2, kernel_size=1, stride=1, padding=0, groups=1, act=act) def forward(self, inputs): x1 = self._conv_dw_1(inputs) x1 = self._conv_linear_1(x1) x2 = self._conv_pw_2(inputs) x2 = self._conv_dw_2(x2) x2 = self._conv_linear_2(x2) out = paddle.concat([x1, x2], axis=1) return channel_shuffle(out, 2) @register @serializable class ShuffleNetV2(nn.Layer): def __init__(self, scale=1.0, act="relu", feature_maps=[5, 13, 17]): super(ShuffleNetV2, self).__init__() self.scale = scale if isinstance(feature_maps, Integral): feature_maps = [feature_maps] self.feature_maps = feature_maps stage_repeats = [4, 8, 4] if scale == 0.25: stage_out_channels = [-1, 24, 24, 48, 96, 512] elif scale == 0.33: stage_out_channels = [-1, 24, 32, 64, 128, 512] elif scale == 0.5: stage_out_channels = [-1, 24, 48, 96, 192, 1024] elif scale == 1.0: stage_out_channels = [-1, 24, 116, 232, 464, 1024] elif scale == 1.5: stage_out_channels = [-1, 24, 176, 352, 704, 1024] elif scale == 2.0: stage_out_channels = [-1, 24, 244, 488, 976, 2048] else: raise NotImplementedError("This scale size:[" + str(scale) + "] is not implemented!") self._out_channels = [] self._feature_idx = 0 # 1. conv1 self._conv1 = ConvBNLayer( in_channels=3, out_channels=stage_out_channels[1], kernel_size=3, stride=2, padding=1, act=act) self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1) self._feature_idx += 1 # 2. 
bottleneck sequences self._block_list = [] for stage_id, num_repeat in enumerate(stage_repeats): for i in range(num_repeat): if i == 0: block = self.add_sublayer( name=str(stage_id + 2) + '_' + str(i + 1), sublayer=InvertedResidualDS( in_channels=stage_out_channels[stage_id + 1], out_channels=stage_out_channels[stage_id + 2], stride=2, act=act)) else: block = self.add_sublayer( name=str(stage_id + 2) + '_' + str(i + 1), sublayer=InvertedResidual( in_channels=stage_out_channels[stage_id + 2], out_channels=stage_out_channels[stage_id + 2], stride=1, act=act)) self._block_list.append(block) self._feature_idx += 1 self._update_out_channels(stage_out_channels[stage_id + 2], self._feature_idx, self.feature_maps) def _update_out_channels(self, channel, feature_idx, feature_maps): if feature_idx in feature_maps: self._out_channels.append(channel) def forward(self, inputs): y = self._conv1(inputs['image']) y = self._max_pool(y) outs = [] for i, inv in enumerate(self._block_list): y = inv(y) if i + 2 in self.feature_maps: outs.append(y) return outs @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/backbones/swin_transformer.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
""" This code is based on https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py Ths copyright of microsoft/Swin-Transformer is as follows: MIT License [see LICENSE for details] """ import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.modeling.shape_spec import ShapeSpec from ppdet.core.workspace import register, serializable from .transformer_utils import DropPath, Identity from .transformer_utils import add_parameter, to_2tuple from .transformer_utils import ones_, zeros_, trunc_normal_ __all__ = ['SwinTransformer'] MODEL_cfg = { # use 22kto1k finetune weights as default pretrained, can set by SwinTransformer.pretrained in config 'swin_T_224': dict( pretrain_img_size=224, embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams', ), 'swin_S_224': dict( pretrain_img_size=224, embed_dim=96, depths=[2, 2, 18, 2], num_heads=[3, 6, 12, 24], window_size=7, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_small_patch4_window7_224_22kto1k_pretrained.pdparams', ), 'swin_B_224': dict( pretrain_img_size=224, embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=7, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams', ), 'swin_L_224': dict( pretrain_img_size=224, embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=7, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams', ), 'swin_B_384': dict( pretrain_img_size=384, embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=12, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams', ), 'swin_L_384': dict( pretrain_img_size=384, embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=12, pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams', ), } class Mlp(nn.Layer): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x def window_partition(x, window_size): """ Args: x: (B, H, W, C) window_size (int): window size Returns: windows: (num_windows*B, window_size, window_size, C) """ B, H, W, C = x.shape x = x.reshape( [-1, H // window_size, window_size, W // window_size, window_size, C]) windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape( [-1, window_size, window_size, C]) return windows def window_reverse(windows, window_size, H, W): """ Args: windows: (num_windows*B, window_size, window_size, C) window_size (int): Window size H (int): Height of image W (int): Width of image Returns: x: (B, H, W, C) """ _, _, _, C = windows.shape B = int(windows.shape[0] / (H * W / window_size / window_size)) x = windows.reshape( [-1, H // window_size, W // window_size, window_size, window_size, C]) x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C]) return x 
class WindowAttention(nn.Layer): """ Window based multi-head self attention (W-MSA) module with relative position bias. It supports both of shifted and non-shifted window. Args: dim (int): Number of input channels. window_size (tuple[int]): The height and width of the window. num_heads (int): Number of attention heads. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 proj_drop (float, optional): Dropout ratio of output. Default: 0.0 """ def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): super().__init__() self.dim = dim self.window_size = window_size # Wh, Ww self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim**-0.5 # define a parameter table of relative position bias self.relative_position_bias_table = add_parameter( self, paddle.zeros(((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))) # 2*Wh-1 * 2*Ww-1, nH # get pair-wise relative position index for each token inside the window coords_h = paddle.arange(self.window_size[0]) coords_w = paddle.arange(self.window_size[1]) coords = paddle.stack(paddle.meshgrid( [coords_h, coords_w])) # 2, Wh, Ww coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww coords_flatten_1 = coords_flatten.unsqueeze(axis=2) coords_flatten_2 = coords_flatten.unsqueeze(axis=1) relative_coords = coords_flatten_1 - coords_flatten_2 relative_coords = relative_coords.transpose( [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += self.window_size[ 0] - 1 # shift to start from 0 relative_coords[:, :, 1] += self.window_size[1] - 1 relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 self.relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) trunc_normal_(self.relative_position_bias_table) self.softmax = nn.Softmax(axis=-1) def forward(self, x, mask=None): """ Forward function. Args: x: input features with shape of (num_windows*B, N, C) mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None """ B_, N, C = x.shape qkv = self.qkv(x).reshape( [-1, N, 3, self.num_heads, C // self.num_heads]).transpose( [2, 0, 3, 1, 4]) q, k, v = qkv[0], qkv[1], qkv[2] q = q * self.scale attn = paddle.mm(q, k.transpose([0, 1, 3, 2])) index = self.relative_position_index.flatten() relative_position_bias = paddle.index_select( self.relative_position_bias_table, index) relative_position_bias = relative_position_bias.reshape([ self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 ]) # Wh*Ww,Wh*Ww,nH relative_position_bias = relative_position_bias.transpose( [2, 0, 1]) # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) if mask is not None: nW = mask.shape[0] attn = attn.reshape([-1, nW, self.num_heads, N, N ]) + mask.unsqueeze(1).unsqueeze(0) attn = attn.reshape([-1, self.num_heads, N, N]) attn = self.softmax(attn) else: attn = self.softmax(attn) attn = self.attn_drop(attn) # x = (attn @ v).transpose(1, 2).reshape([B_, N, C]) x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C]) x = self.proj(x) x = self.proj_drop(x) return x class SwinTransformerBlock(nn.Layer): """ Swin Transformer Block. 
Args: dim (int): Number of input channels. num_heads (int): Number of attention heads. window_size (int): Window size. shift_size (int): Shift size for SW-MSA. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. drop (float, optional): Dropout rate. Default: 0.0 attn_drop (float, optional): Attention dropout rate. Default: 0.0 drop_path (float, optional): Stochastic depth rate. Default: 0.0 act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm """ def __init__(self, dim, num_heads, window_size=7, shift_size=0, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): super().__init__() self.dim = dim self.num_heads = num_heads self.window_size = window_size self.shift_size = shift_size self.mlp_ratio = mlp_ratio assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" self.norm1 = norm_layer(dim) self.attn = WindowAttention( dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) self.H = None self.W = None def forward(self, x, mask_matrix): """ Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. mask_matrix: Attention mask for cyclic shift. """ B, L, C = x.shape H, W = self.H, self.W assert L == H * W, "input feature has wrong size" shortcut = x x = self.norm1(x) x = x.reshape([-1, H, W, C]) # pad feature maps to multiples of window size pad_l = pad_t = 0 pad_r = (self.window_size - W % self.window_size) % self.window_size pad_b = (self.window_size - H % self.window_size) % self.window_size x = F.pad(x, [0, pad_l, 0, pad_b, 0, pad_r, 0, pad_t], data_format='NHWC') _, Hp, Wp, _ = x.shape # cyclic shift if self.shift_size > 0: shifted_x = paddle.roll( x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2)) attn_mask = mask_matrix else: shifted_x = x attn_mask = None # partition windows x_windows = window_partition( shifted_x, self.window_size) # nW*B, window_size, window_size, C x_windows = x_windows.reshape( [x_windows.shape[0], self.window_size * self.window_size, C]) # nW*B, window_size*window_size, C # W-MSA/SW-MSA attn_windows = self.attn( x_windows, mask=attn_mask) # nW*B, window_size*window_size, C # merge windows attn_windows = attn_windows.reshape( [x_windows.shape[0], self.window_size, self.window_size, C]) shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C # reverse cyclic shift if self.shift_size > 0: x = paddle.roll( shifted_x, shifts=(self.shift_size, self.shift_size), axis=(1, 2)) else: x = shifted_x if pad_r > 0 or pad_b > 0: x = x[:, :H, :W, :] x = x.reshape([-1, H * W, C]) # FFN x = shortcut + self.drop_path(x) x = x + self.drop_path(self.mlp(self.norm2(x))) return x class PatchMerging(nn.Layer): r""" Patch Merging Layer. Args: dim (int): Number of input channels. norm_layer (nn.Layer, optional): Normalization layer. 
Default: nn.LayerNorm """ def __init__(self, dim, norm_layer=nn.LayerNorm): super().__init__() self.dim = dim self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) self.norm = norm_layer(4 * dim) def forward(self, x, H, W): """ Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. """ B, L, C = x.shape assert L == H * W, "input feature has wrong size" x = x.reshape([-1, H, W, C]) # padding pad_input = (H % 2 == 1) or (W % 2 == 1) if pad_input: # paddle F.pad default data_format is 'NCHW' x = F.pad(x, [0, 0, 0, H % 2, 0, W % 2, 0, 0], data_format='NHWC') H += H % 2 W += W % 2 x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C x = x.reshape([-1, H * W // 4, 4 * C]) # B H/2*W/2 4*C x = self.norm(x) x = self.reduction(x) return x class BasicLayer(nn.Layer): """ A basic Swin Transformer layer for one stage. Args: dim (int): Number of input channels. depth (int): Number of blocks. num_heads (int): Number of attention heads. window_size (int): Local window size. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. drop (float, optional): Dropout rate. Default: 0.0 attn_drop (float, optional): Attention dropout rate. Default: 0.0 drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None """ def __init__(self, dim, depth, num_heads, window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., norm_layer=nn.LayerNorm, downsample=None): super().__init__() self.window_size = window_size self.shift_size = window_size // 2 self.depth = depth # build blocks self.blocks = nn.LayerList([ SwinTransformerBlock( dim=dim, num_heads=num_heads, window_size=window_size, shift_size=0 if (i % 2 == 0) else window_size // 2, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop, attn_drop=attn_drop, drop_path=drop_path[i] if isinstance(drop_path, np.ndarray) else drop_path, norm_layer=norm_layer) for i in range(depth) ]) # patch merging layer if downsample is not None: self.downsample = downsample(dim=dim, norm_layer=norm_layer) else: self.downsample = None def forward(self, x, H, W): """ Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. 
""" # calculate attention mask for SW-MSA Hp = int(np.ceil(H / self.window_size)) * self.window_size Wp = int(np.ceil(W / self.window_size)) * self.window_size img_mask = paddle.zeros([1, Hp, Wp, 1], dtype='float32') # 1 Hp Wp 1 h_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None)) w_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None)) cnt = 0 for h in h_slices: for w in w_slices: img_mask[:, h, w, :] = cnt cnt += 1 mask_windows = window_partition( img_mask, self.window_size) # nW, window_size, window_size, 1 mask_windows = mask_windows.reshape( [-1, self.window_size * self.window_size]) attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) huns = -100.0 * paddle.ones_like(attn_mask) attn_mask = huns * (attn_mask != 0).astype("float32") for blk in self.blocks: blk.H, blk.W = H, W x = blk(x, attn_mask) if self.downsample is not None: x_down = self.downsample(x, H, W) Wh, Ww = (H + 1) // 2, (W + 1) // 2 return x, H, W, x_down, Wh, Ww else: return x, H, W, x, H, W class PatchEmbed(nn.Layer): """ Image to Patch Embedding Args: patch_size (int): Patch token size. Default: 4. in_chans (int): Number of input image channels. Default: 3. embed_dim (int): Number of linear projection output channels. Default: 96. norm_layer (nn.Layer, optional): Normalization layer. Default: None """ def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): super().__init__() patch_size = to_2tuple(patch_size) self.patch_size = patch_size self.in_chans = in_chans self.embed_dim = embed_dim self.proj = nn.Conv2D( in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) if norm_layer is not None: self.norm = norm_layer(embed_dim) else: self.norm = None def forward(self, x): # TODO # export dynamic shape B, C, H, W = x.shape # assert [H, W] == self.img_size[:2], "Input image size ({H}*{W}) doesn't match model ({}*{}).".format(H, W, self.img_size[0], self.img_size[1]) if W % self.patch_size[1] != 0: x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0]) if H % self.patch_size[0] != 0: x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]]) x = self.proj(x) if self.norm is not None: _, _, Wh, Ww = x.shape x = x.flatten(2).transpose([0, 2, 1]) x = self.norm(x) x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww]) return x @register @serializable class SwinTransformer(nn.Layer): """ Swin Transformer backbone Args: arch (str): Architecture of FocalNet pretrain_img_size (int | tuple(int)): Input image size. Default 224 patch_size (int | tuple(int)): Patch size. Default: 4 in_chans (int): Number of input image channels. Default: 3 embed_dim (int): Patch embedding dimension. Default: 96 depths (tuple(int)): Depth of each Swin Transformer layer. num_heads (tuple(int)): Number of attention heads in different layers. window_size (int): Window size. Default: 7 mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None drop_rate (float): Dropout rate. Default: 0 attn_drop_rate (float): Attention dropout rate. Default: 0 drop_path_rate (float): Stochastic depth rate. Default: 0.1 norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. ape (bool): If True, add absolute position embedding to the patch embedding. 
Default: False patch_norm (bool): If True, add normalization after patch embedding. Default: True """ def __init__(self, arch='swin_T_224', pretrain_img_size=224, patch_size=4, in_chans=3, embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.2, norm_layer=nn.LayerNorm, ape=False, patch_norm=True, out_indices=(0, 1, 2, 3), frozen_stages=-1, pretrained=None): super(SwinTransformer, self).__init__() assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch) pretrain_img_size = MODEL_cfg[arch]['pretrain_img_size'] embed_dim = MODEL_cfg[arch]['embed_dim'] depths = MODEL_cfg[arch]['depths'] num_heads = MODEL_cfg[arch]['num_heads'] window_size = MODEL_cfg[arch]['window_size'] if pretrained is None: pretrained = MODEL_cfg[arch]['pretrained'] self.num_layers = len(depths) self.ape = ape self.patch_norm = patch_norm self.out_indices = out_indices self.frozen_stages = frozen_stages # split image into non-overlapping patches self.patch_embed = PatchEmbed( patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, norm_layer=norm_layer if self.patch_norm else None) # absolute position embedding if self.ape: pretrain_img_size = to_2tuple(pretrain_img_size) patch_size = to_2tuple(patch_size) patches_resolution = [ pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1] ] self.absolute_pos_embed = add_parameter( self, paddle.zeros((1, embed_dim, patches_resolution[0], patches_resolution[1]))) trunc_normal_(self.absolute_pos_embed) self.pos_drop = nn.Dropout(p=drop_rate) # stochastic depth dpr = np.linspace(0, drop_path_rate, sum(depths)) # stochastic depth decay rule # build layers self.layers = nn.LayerList() for i_layer in range(self.num_layers): layer = BasicLayer( dim=int(embed_dim * 2**i_layer), depth=depths[i_layer], num_heads=num_heads[i_layer], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], norm_layer=norm_layer, downsample=PatchMerging if (i_layer < self.num_layers - 1) else None) self.layers.append(layer) num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] self.num_features = num_features # add a norm layer for each output for i_layer in out_indices: layer = norm_layer(num_features[i_layer]) layer_name = f'norm{i_layer}' self.add_sublayer(layer_name, layer) self.apply(self._init_weights) self._freeze_stages() if pretrained: if 'http' in pretrained: #URL path = paddle.utils.download.get_weights_path_from_url( pretrained) else: #model in local path path = pretrained self.set_state_dict(paddle.load(path)) def _freeze_stages(self): if self.frozen_stages >= 0: self.patch_embed.eval() for param in self.patch_embed.parameters(): param.stop_gradient = True if self.frozen_stages >= 1 and self.ape: self.absolute_pos_embed.stop_gradient = True if self.frozen_stages >= 2: self.pos_drop.eval() for i in range(0, self.frozen_stages - 1): m = self.layers[i] m.eval() for param in m.parameters(): param.stop_gradient = True def _init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight) if isinstance(m, nn.Linear) and m.bias is not None: zeros_(m.bias) elif isinstance(m, nn.LayerNorm): zeros_(m.bias) ones_(m.weight) def forward(self, x): """Forward function.""" x = self.patch_embed(x['image']) B, _, Wh, Ww = x.shape if self.ape: # interpolate the position embedding to the corresponding 
size absolute_pos_embed = F.interpolate( self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic') x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1]) else: x = x.flatten(2).transpose([0, 2, 1]) x = self.pos_drop(x) outs = [] for i in range(self.num_layers): layer = self.layers[i] x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) if i in self.out_indices: norm_layer = getattr(self, f'norm{i}') x_out = norm_layer(x_out) out = x_out.reshape((-1, H, W, self.num_features[i])).transpose( (0, 3, 1, 2)) outs.append(out) return outs @property def out_shape(self): out_strides = [4, 8, 16, 32] return [ ShapeSpec( channels=self.num_features[i], stride=out_strides[i]) for i in self.out_indices ] ================================================ FILE: ppdet/modeling/backbones/trans_encoder.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn import ReLU, Swish, GELU import math from ppdet.core.workspace import register from ..shape_spec import ShapeSpec __all__ = ['TransEncoder'] class BertEmbeddings(nn.Layer): def __init__(self, word_size, position_embeddings_size, word_type_size, hidden_size, dropout_prob): super(BertEmbeddings, self).__init__() self.word_embeddings = nn.Embedding( word_size, hidden_size, padding_idx=0) self.position_embeddings = nn.Embedding(position_embeddings_size, hidden_size) self.token_type_embeddings = nn.Embedding(word_type_size, hidden_size) self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8) self.dropout = nn.Dropout(dropout_prob) def forward(self, x, token_type_ids=None, position_ids=None): seq_len = x.shape[1] if position_ids is None: position_ids = paddle.arange(seq_len).unsqueeze(0).expand_as(x) if token_type_ids is None: token_type_ids = paddle.zeros(x.shape, dtype="int64") word_embs = self.word_embeddings(x) position_embs = self.position_embeddings(position_ids) token_type_embs = self.token_type_embeddings(token_type_ids) embs_cmb = word_embs + position_embs + token_type_embs embs_out = self.layernorm(embs_cmb) embs_out = self.dropout(embs_out) return embs_out class BertSelfAttention(nn.Layer): def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, output_attentions=False): super(BertSelfAttention, self).__init__() if hidden_size % num_attention_heads != 0: raise ValueError( "The hidden_size must be a multiple of the number of attention " "heads, but got {} % {} != 0".format(hidden_size, num_attention_heads)) self.num_attention_heads = num_attention_heads self.attention_head_size = int(hidden_size / num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = nn.Linear(hidden_size, self.all_head_size) self.key = nn.Linear(hidden_size, self.all_head_size) self.value = nn.Linear(hidden_size, self.all_head_size) self.dropout = nn.Dropout(attention_probs_dropout_prob) self.output_attentions = output_attentions def forward(self, x,
attention_mask, head_mask=None): query = self.query(x) key = self.key(x) value = self.value(x) query_dim1, query_dim2 = query.shape[:-1] new_shape = [ query_dim1, query_dim2, self.num_attention_heads, self.attention_head_size ] query = query.reshape(new_shape).transpose(perm=(0, 2, 1, 3)) key = key.reshape(new_shape).transpose(perm=(0, 2, 3, 1)) value = value.reshape(new_shape).transpose(perm=(0, 2, 1, 3)) attention = paddle.matmul(query, key) / math.sqrt(self.attention_head_size) attention = attention + attention_mask attention_value = F.softmax(attention, axis=-1) attention_value = self.dropout(attention_value) if head_mask is not None: attention_value = attention_value * head_mask context = paddle.matmul(attention_value, value).transpose(perm=(0, 2, 1, 3)) ctx_dim1, ctx_dim2 = context.shape[:-2] new_context_shape = [ ctx_dim1, ctx_dim2, self.all_head_size, ] context = context.reshape(new_context_shape) if self.output_attentions: return (context, attention_value) else: return (context, ) class BertAttention(nn.Layer): def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, output_attentions=False): super(BertAttention, self).__init__() self.bert_selfattention = BertSelfAttention( hidden_size, num_attention_heads, attention_probs_dropout_prob, output_attentions) self.fc = nn.Linear(hidden_size, hidden_size) self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8) self.dropout = nn.Dropout(fc_dropout_prob) def forward(self, x, attention_mask, head_mask=None): attention_feats = self.bert_selfattention(x, attention_mask, head_mask) features = self.fc(attention_feats[0]) features = self.dropout(features) features = self.layernorm(features + x) if len(attention_feats) == 2: return (features, attention_feats[1]) else: return (features, ) class BertFeedForward(nn.Layer): def __init__(self, hidden_size, intermediate_size, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, act_fn='ReLU', output_attentions=False): super(BertFeedForward, self).__init__() self.fc1 = nn.Linear(hidden_size, intermediate_size) self.act_fn = eval(act_fn) self.fc2 = nn.Linear(intermediate_size, hidden_size) self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8) self.dropout = nn.Dropout(fc_dropout_prob) def forward(self, x): features = self.fc1(x) features = self.act_fn(features) features = self.fc2(features) features = self.dropout(features) features = self.layernorm(features + x) return features class BertLayer(nn.Layer): def __init__(self, hidden_size, intermediate_size, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, act_fn='ReLU', output_attentions=False): super(BertLayer, self).__init__() self.attention = BertAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob, output_attentions) self.feed_forward = BertFeedForward( hidden_size, intermediate_size, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, act_fn, output_attentions) def forward(self, x, attention_mask, head_mask=None): attention_feats = self.attention(x, attention_mask, head_mask) features = self.feed_forward(attention_feats[0]) if len(attention_feats) == 2: return (features, attention_feats[1]) else: return (features, ) class BertEncoder(nn.Layer): def __init__(self, num_hidden_layers, hidden_size, intermediate_size, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, act_fn='ReLU', output_attentions=False, output_hidden_feats=False): super(BertEncoder, self).__init__() self.output_attentions = output_attentions 
self.output_hidden_feats = output_hidden_feats self.layers = nn.LayerList([ BertLayer(hidden_size, intermediate_size, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, act_fn, output_attentions) for _ in range(num_hidden_layers) ]) def forward(self, x, attention_mask, head_mask=None): all_features = (x, ) all_attentions = () for i, layer in enumerate(self.layers): mask = head_mask[i] if head_mask is not None else None layer_out = layer(x, attention_mask, mask) if self.output_hidden_feats: all_features = all_features + (x, ) x = layer_out[0] if self.output_attentions: all_attentions = all_attentions + (layer_out[1], ) outputs = (x, ) if self.output_hidden_feats: outputs += (all_features, ) if self.output_attentions: outputs += (all_attentions, ) return outputs class BertPooler(nn.Layer): def __init__(self, hidden_size): super(BertPooler, self).__init__() self.fc = nn.Linear(hidden_size, hidden_size) self.act = nn.Tanh() def forward(self, x): first_token = x[:, 0] pooled_output = self.fc(first_token) pooled_output = self.act(pooled_output) return pooled_output class METROEncoder(nn.Layer): def __init__(self, vocab_size, num_hidden_layers, features_dims, position_embeddings_size, hidden_size, intermediate_size, output_feature_dim, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, act_fn='ReLU', output_attentions=False, output_hidden_feats=False, use_img_layernorm=False): super(METROEncoder, self).__init__() self.img_dims = features_dims self.num_hidden_layers = num_hidden_layers self.use_img_layernorm = use_img_layernorm self.output_attentions = output_attentions self.embedding = BertEmbeddings(vocab_size, position_embeddings_size, 2, hidden_size, fc_dropout_prob) self.encoder = BertEncoder( num_hidden_layers, hidden_size, intermediate_size, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, act_fn, output_attentions, output_hidden_feats) self.pooler = BertPooler(hidden_size) self.position_embeddings = nn.Embedding(position_embeddings_size, hidden_size) self.img_embedding = nn.Linear( features_dims, hidden_size, bias_attr=True) self.dropout = nn.Dropout(fc_dropout_prob) self.cls_head = nn.Linear(hidden_size, output_feature_dim) self.residual = nn.Linear(features_dims, output_feature_dim) self.apply(self.init_weights) def init_weights(self, module): """ Initialize the weights. 
""" if isinstance(module, (nn.Linear, nn.Embedding)): module.weight.set_value( paddle.normal( mean=0.0, std=0.02, shape=module.weight.shape)) elif isinstance(module, nn.LayerNorm): module.bias.set_value(paddle.zeros(shape=module.bias.shape)) module.weight.set_value( paddle.full( shape=module.weight.shape, fill_value=1.0)) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.set_value(paddle.zeros(shape=module.bias.shape)) def forward(self, x): batchsize, seq_len = x.shape[:2] input_ids = paddle.zeros((batchsize, seq_len), dtype="int64") position_ids = paddle.arange( seq_len, dtype="int64").unsqueeze(0).expand_as(input_ids) attention_mask = paddle.ones_like(input_ids).unsqueeze(1).unsqueeze(2) head_mask = [None] * self.num_hidden_layers position_embs = self.position_embeddings(position_ids) attention_mask = (1.0 - attention_mask) * -10000.0 img_features = self.img_embedding(x) # We empirically observe that adding an additional learnable position embedding leads to more stable training embeddings = position_embs + img_features if self.use_img_layernorm: embeddings = self.layernorm(embeddings) embeddings = self.dropout(embeddings) encoder_outputs = self.encoder( embeddings, attention_mask, head_mask=head_mask) pred_score = self.cls_head(encoder_outputs[0]) res_img_feats = self.residual(x) pred_score = pred_score + res_img_feats if self.output_attentions and self.output_hidden_feats: return pred_score, encoder_outputs[1], encoder_outputs[-1] else: return pred_score def gelu(x): """Implementation of the gelu activation function. https://arxiv.org/abs/1606.08415 """ return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0))) @register class TransEncoder(nn.Layer): def __init__(self, vocab_size=30522, num_hidden_layers=4, num_attention_heads=4, position_embeddings_size=512, intermediate_size=3072, input_feat_dim=[2048, 512, 128], hidden_feat_dim=[1024, 256, 128], attention_probs_dropout_prob=0.1, fc_dropout_prob=0.1, act_fn='gelu', output_attentions=False, output_hidden_feats=False): super(TransEncoder, self).__init__() output_feat_dim = input_feat_dim[1:] + [3] trans_encoder = [] for i in range(len(output_feat_dim)): features_dims = input_feat_dim[i] output_feature_dim = output_feat_dim[i] hidden_size = hidden_feat_dim[i] # init a transformer encoder and append it to a list assert hidden_size % num_attention_heads == 0 model = METROEncoder(vocab_size, num_hidden_layers, features_dims, position_embeddings_size, hidden_size, intermediate_size, output_feature_dim, num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, act_fn, output_attentions, output_hidden_feats) trans_encoder.append(model) self.trans_encoder = paddle.nn.Sequential(*trans_encoder) def forward(self, x): out = self.trans_encoder(x) return out ================================================ FILE: ppdet/modeling/backbones/transformer_utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import TruncatedNormal, Constant, Assign # Common initializations ones_ = Constant(value=1.) zeros_ = Constant(value=0.) trunc_normal_ = TruncatedNormal(std=.02) # Common Layers def drop_path(x, drop_prob=0., training=False): """ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... """ if drop_prob == 0. or not training: return x keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) random_tensor = paddle.floor(random_tensor) # binarize output = x.divide(keep_prob) * random_tensor return output class DropPath(nn.Layer): def __init__(self, drop_prob=None): super(DropPath, self).__init__() self.drop_prob = drop_prob def forward(self, x): return drop_path(x, self.drop_prob, self.training) class Identity(nn.Layer): def __init__(self): super(Identity, self).__init__() def forward(self, input): return input # common funcs def to_2tuple(x): if isinstance(x, (list, tuple)): return x return tuple([x] * 2) def add_parameter(layer, datas, name=None): parameter = layer.create_parameter( shape=(datas.shape), default_initializer=Assign(datas)) if name: layer.add_parameter(name, parameter) return parameter def window_partition(x, window_size): """ Partition into non-overlapping windows with padding if needed. Args: x (tensor): input tokens with [B, H, W, C]. window_size (int): window size. Returns: windows: windows after partition with [B * num_windows, window_size, window_size, C]. (Hp, Wp): padded height and width before partition """ B, H, W, C = x.shape pad_h = (window_size - H % window_size) % window_size pad_w = (window_size - W % window_size) % window_size x = F.pad(x.transpose([0, 3, 1, 2]), paddle.to_tensor( [0, int(pad_w), 0, int(pad_h)], dtype='int32')).transpose([0, 2, 3, 1]) Hp, Wp = H + pad_h, W + pad_w num_h, num_w = Hp // window_size, Wp // window_size x = x.reshape([B, num_h, window_size, num_w, window_size, C]) windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape( [-1, window_size, window_size, C]) return windows, (Hp, Wp), (num_h, num_w) def window_unpartition(x, pad_hw, num_hw, hw): """ Window unpartition into original sequences and removing padding. Args: x (tensor): input tokens with [B * num_windows, window_size, window_size, C]. pad_hw (Tuple): padded height and width (Hp, Wp). hw (Tuple): original height and width (H, W) before padding. Returns: x: unpartitioned sequences with [B, H, W, C]. 
""" Hp, Wp = pad_hw num_h, num_w = num_hw H, W = hw B, window_size, _, C = x.shape B = B // (num_h * num_w) x = x.reshape([B, num_h, num_w, window_size, window_size, C]) x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, Hp, Wp, C]) return x[:, :H, :W, :] ================================================ FILE: ppdet/modeling/backbones/vgg.py ================================================ from __future__ import division import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn import Conv2D, MaxPool2D from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec __all__ = ['VGG'] VGG_cfg = {16: [2, 2, 3, 3, 3], 19: [2, 2, 4, 4, 4]} class ConvBlock(nn.Layer): def __init__(self, in_channels, out_channels, groups, pool_size=2, pool_stride=2, pool_padding=0, name=None): super(ConvBlock, self).__init__() self.groups = groups self.conv0 = nn.Conv2D( in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1) self.conv_out_list = [] for i in range(1, groups): conv_out = self.add_sublayer( 'conv{}'.format(i), Conv2D( in_channels=out_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1)) self.conv_out_list.append(conv_out) self.pool = MaxPool2D( kernel_size=pool_size, stride=pool_stride, padding=pool_padding, ceil_mode=True) def forward(self, inputs): out = self.conv0(inputs) out = F.relu(out) for conv_i in self.conv_out_list: out = conv_i(out) out = F.relu(out) pool = self.pool(out) return out, pool class ExtraBlock(nn.Layer): def __init__(self, in_channels, mid_channels, out_channels, padding, stride, kernel_size, name=None): super(ExtraBlock, self).__init__() self.conv0 = Conv2D( in_channels=in_channels, out_channels=mid_channels, kernel_size=1, stride=1, padding=0) self.conv1 = Conv2D( in_channels=mid_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding) def forward(self, inputs): out = self.conv0(inputs) out = F.relu(out) out = self.conv1(out) out = F.relu(out) return out class L2NormScale(nn.Layer): def __init__(self, num_channels, scale=1.0): super(L2NormScale, self).__init__() self.scale = self.create_parameter( attr=ParamAttr(initializer=paddle.nn.initializer.Constant(scale)), shape=[num_channels]) def forward(self, inputs): out = F.normalize(inputs, axis=1, epsilon=1e-10) # out = self.scale.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as( # out) * out out = self.scale.unsqueeze(0).unsqueeze(2).unsqueeze(3) * out return out @register @serializable class VGG(nn.Layer): def __init__(self, depth=16, normalizations=[20., -1, -1, -1, -1, -1], extra_block_filters=[[256, 512, 1, 2, 3], [128, 256, 1, 2, 3], [128, 256, 0, 1, 3], [128, 256, 0, 1, 3]]): super(VGG, self).__init__() assert depth in [16, 19], \ "depth as 16/19 supported currently, but got {}".format(depth) self.depth = depth self.groups = VGG_cfg[depth] self.normalizations = normalizations self.extra_block_filters = extra_block_filters self._out_channels = [] self.conv_block_0 = ConvBlock( 3, 64, self.groups[0], 2, 2, 0, name="conv1_") self.conv_block_1 = ConvBlock( 64, 128, self.groups[1], 2, 2, 0, name="conv2_") self.conv_block_2 = ConvBlock( 128, 256, self.groups[2], 2, 2, 0, name="conv3_") self.conv_block_3 = ConvBlock( 256, 512, self.groups[3], 2, 2, 0, name="conv4_") self.conv_block_4 = ConvBlock( 512, 512, self.groups[4], 3, 1, 1, name="conv5_") self._out_channels.append(512) self.fc6 = Conv2D( in_channels=512, out_channels=1024, kernel_size=3, stride=1, 
padding=6, dilation=6) self.fc7 = Conv2D( in_channels=1024, out_channels=1024, kernel_size=1, stride=1, padding=0) self._out_channels.append(1024) # extra block self.extra_convs = [] last_channels = 1024 for i, v in enumerate(self.extra_block_filters): assert len(v) == 5, "extra_block_filters size not fix" extra_conv = self.add_sublayer("conv{}".format(6 + i), ExtraBlock(last_channels, v[0], v[1], v[2], v[3], v[4])) last_channels = v[1] self.extra_convs.append(extra_conv) self._out_channels.append(last_channels) self.norms = [] for i, n in enumerate(self.normalizations): if n != -1: norm = self.add_sublayer("norm{}".format(i), L2NormScale( self.extra_block_filters[i][1], n)) else: norm = None self.norms.append(norm) def forward(self, inputs): outputs = [] conv, pool = self.conv_block_0(inputs['image']) conv, pool = self.conv_block_1(pool) conv, pool = self.conv_block_2(pool) conv, pool = self.conv_block_3(pool) outputs.append(conv) conv, pool = self.conv_block_4(pool) out = self.fc6(pool) out = F.relu(out) out = self.fc7(out) out = F.relu(out) outputs.append(out) if not self.extra_block_filters: return outputs # extra block for extra_conv in self.extra_convs: out = extra_conv(out) outputs.append(out) for i, n in enumerate(self.normalizations): if n != -1: outputs[i] = self.norms[i](outputs[i]) return outputs @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/backbones/vision_transformer.py ================================================ # copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
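# --------------------------------------------------------------------------
# Editor's note: L2NormScale in vgg.py above implements the SSD-style trick of
# L2-normalizing a feature map across channels and rescaling it with a
# learnable per-channel weight (initialized from `normalizations`, 20.0 for
# the first output by default). A functional sketch of what it computes,
# illustrative only and not part of the original file:
def _l2norm_scale_demo():
    import paddle
    import paddle.nn.functional as F
    x = paddle.randn([1, 512, 8, 8])  # (B, C, H, W) feature map
    y = F.normalize(x, axis=1, epsilon=1e-10) * 20.0  # unit channel norm, then rescale
    return y  # channel-wise L2 norm at each spatial location is now ~20
# --------------------------------------------------------------------------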
import math import paddle import paddle.nn as nn import paddle.nn.functional as F import numpy as np from paddle.nn.initializer import Constant from ppdet.modeling.shape_spec import ShapeSpec from ppdet.core.workspace import register, serializable from .transformer_utils import zeros_, DropPath, Identity class Mlp(nn.Layer): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x class Attention(nn.Layer): def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., window_size=None): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim**-0.5 self.qkv = nn.Linear(dim, dim * 3, bias_attr=False) if qkv_bias: self.q_bias = self.create_parameter( shape=([dim]), default_initializer=zeros_) self.v_bias = self.create_parameter( shape=([dim]), default_initializer=zeros_) else: self.q_bias = None self.v_bias = None if window_size: self.window_size = window_size self.num_relative_distance = (2 * window_size[0] - 1) * ( 2 * window_size[1] - 1) + 3 self.relative_position_bias_table = self.create_parameter( shape=(self.num_relative_distance, num_heads), default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH # cls to token & token 2 cls & cls to cls # get pair-wise relative position index for each token inside the window coords_h = paddle.arange(window_size[0]) coords_w = paddle.arange(window_size[1]) coords = paddle.stack(paddle.meshgrid( [coords_h, coords_w])) # 2, Wh, Ww coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww coords_flatten_1 = paddle.unsqueeze(coords_flatten, 2) coords_flatten_2 = paddle.unsqueeze(coords_flatten, 1) relative_coords = coords_flatten_1.clone() - coords_flatten_2.clone( ) #relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Wh relative_coords = relative_coords.transpose( (1, 2, 0)) #.contiguous() # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += window_size[ 0] - 1 # shift to start from 0 relative_coords[:, :, 1] += window_size[1] - 1 relative_coords[:, :, 0] *= 2 * window_size[1] - 1 relative_position_index = \ paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype) relative_position_index[1:, 1:] = relative_coords.sum( -1) # Wh*Ww, Wh*Ww relative_position_index[0, 0:] = self.num_relative_distance - 3 relative_position_index[0:, 0] = self.num_relative_distance - 2 relative_position_index[0, 0] = self.num_relative_distance - 1 self.register_buffer("relative_position_index", relative_position_index) # trunc_normal_(self.relative_position_bias_table, std=.0) else: self.window_size = None self.relative_position_bias_table = None self.relative_position_index = None self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) def forward(self, x, rel_pos_bias=None): x_shape = x.shape N, C = x_shape[1], x_shape[2] qkv_bias = None if self.q_bias is not None: qkv_bias = paddle.concat( (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias)) qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias) qkv = qkv.reshape((-1, N, 3, self.num_heads, C // 
self.num_heads)).transpose((2, 0, 3, 1, 4)) q, k, v = qkv[0], qkv[1], qkv[2] attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale if self.relative_position_bias_table is not None: relative_position_bias = self.relative_position_bias_table[ self.relative_position_index.reshape([-1])].reshape([ self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1 ]) # Wh*Ww,Wh*Ww,nH relative_position_bias = relative_position_bias.transpose( (2, 0, 1)) #.contiguous() # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) if rel_pos_bias is not None: attn = attn + rel_pos_bias attn = nn.functional.softmax(attn, axis=-1) attn = self.attn_drop(attn) x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) x = self.proj(x) x = self.proj_drop(x) return x class Block(nn.Layer): def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., window_size=None, init_values=None, act_layer=nn.GELU, norm_layer='nn.LayerNorm', epsilon=1e-5): super().__init__() self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) self.attn = Attention( dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, window_size=window_size) # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) if init_values is not None: self.gamma_1 = self.create_parameter( shape=([dim]), default_initializer=Constant(value=init_values)) self.gamma_2 = self.create_parameter( shape=([dim]), default_initializer=Constant(value=init_values)) else: self.gamma_1, self.gamma_2 = None, None def forward(self, x, rel_pos_bias=None): if self.gamma_1 is None: x = x + self.drop_path( self.attn( self.norm1(x), rel_pos_bias=rel_pos_bias)) x = x + self.drop_path(self.mlp(self.norm2(x))) else: x = x + self.drop_path(self.gamma_1 * self.attn( self.norm1(x), rel_pos_bias=rel_pos_bias)) x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) return x class PatchEmbed(nn.Layer): """ Image to Patch Embedding """ def __init__(self, img_size=[224, 224], patch_size=16, in_chans=3, embed_dim=768): super().__init__() self.num_patches_w = img_size[0] // patch_size self.num_patches_h = img_size[1] // patch_size num_patches = self.num_patches_w * self.num_patches_h self.patch_shape = (img_size[0] // patch_size, img_size[1] // patch_size) self.img_size = img_size self.patch_size = patch_size self.num_patches = num_patches self.proj = nn.Conv2D( in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) @property def num_patches_in_h(self): return self.img_size[1] // self.patch_size @property def num_patches_in_w(self): return self.img_size[0] // self.patch_size def forward(self, x, mask=None): B, C, H, W = x.shape return self.proj(x) class RelativePositionBias(nn.Layer): def __init__(self, window_size, num_heads): super().__init__() self.window_size = window_size self.num_relative_distance = (2 * window_size[0] - 1) * ( 2 * window_size[1] - 1) + 3 self.relative_position_bias_table = self.create_parameter( shape=(self.num_relative_distance, num_heads), default_initializer=zeros_) # cls to token & token 2 cls & cls to cls # get pair-wise relative position index for each token inside the window coords_h = paddle.arange(window_size[0])
coords_w = paddle.arange(window_size[1]) coords = paddle.stack(paddle.meshgrid( [coords_h, coords_w])) # 2, Wh, Ww coords_flatten = coords.flatten(1) # 2, Wh*Ww relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww relative_coords = relative_coords.transpose( (1, 2, 0)) # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += window_size[1] - 1 relative_coords[:, :, 0] *= 2 * window_size[1] - 1 relative_position_index = \ paddle.zeros(shape=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) relative_position_index[1:, 1:] = relative_coords.sum( -1) # Wh*Ww, Wh*Ww relative_position_index[0, 0:] = self.num_relative_distance - 3 relative_position_index[0:, 0] = self.num_relative_distance - 2 relative_position_index[0, 0] = self.num_relative_distance - 1 self.register_buffer("relative_position_index", relative_position_index) def forward(self): relative_position_bias = \ self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([ self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1]) # Wh*Ww,Wh*Ww,nH return relative_position_bias.transpose((2, 0, 1)) # nH, Wh*Ww, Wh*Ww def get_sinusoid_encoding_table(n_position, d_hid, token=False): ''' Sinusoid position encoding table ''' def get_position_angle_vec(position): return [ position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid) ] sinusoid_table = np.array( [get_position_angle_vec(pos_i) for pos_i in range(n_position)]) sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 if token: sinusoid_table = np.concatenate( [sinusoid_table, np.zeros([1, d_hid])], axis=0) return paddle.to_tensor(sinusoid_table, dtype=paddle.float32).unsqueeze(0) @register @serializable class VisionTransformer(nn.Layer): """ Vision Transformer with support for patch input """ def __init__(self, img_size=[672, 1092], patch_size=16, in_chans=3, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer='nn.LayerNorm', init_values=None, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, epsilon=1e-5, final_norm=False, pretrained=None, out_indices=[3, 5, 7, 11], use_abs_pos_emb=False, use_sincos_pos_emb=True, with_fpn=True, num_fpn_levels=4, use_checkpoint=False, **args): super().__init__() self.img_size = img_size self.embed_dim = embed_dim self.with_fpn = with_fpn self.use_checkpoint = use_checkpoint self.use_sincos_pos_emb = use_sincos_pos_emb self.use_rel_pos_bias = use_rel_pos_bias self.final_norm = final_norm self.out_indices = out_indices self.num_fpn_levels = num_fpn_levels if use_checkpoint: paddle.seed(0) self.patch_embed = PatchEmbed( img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) self.pos_w = self.patch_embed.num_patches_in_w self.pos_h = self.patch_embed.num_patches_in_h self.cls_token = self.create_parameter( shape=(1, 1, embed_dim), default_initializer=paddle.nn.initializer.Constant(value=0.)) if use_abs_pos_emb: self.pos_embed = self.create_parameter( shape=(1, self.pos_w * self.pos_h + 1, embed_dim), default_initializer=paddle.nn.initializer.TruncatedNormal( std=.02)) elif use_sincos_pos_emb: pos_embed = self.build_2d_sincos_position_embedding(embed_dim) self.pos_embed = pos_embed self.pos_embed = self.create_parameter(shape=pos_embed.shape)
self.pos_embed.set_value(pos_embed.numpy()) self.pos_embed.stop_gradient = True else: self.pos_embed = None self.pos_drop = nn.Dropout(p=drop_rate) if use_shared_rel_pos_bias: self.rel_pos_bias = RelativePositionBias( window_size=self.patch_embed.patch_shape, num_heads=num_heads) else: self.rel_pos_bias = None dpr = np.linspace(0, drop_path_rate, depth) self.blocks = nn.LayerList([ Block( dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None, epsilon=epsilon) for i in range(depth) ]) self.pretrained = pretrained self.init_weight() assert len(out_indices) <= 4, '' self.out_indices = out_indices self.out_channels = [embed_dim for _ in range(num_fpn_levels)] self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [ patch_size for _ in range(len(out_indices)) ] self.norm = Identity() if self.with_fpn: assert num_fpn_levels <= 4, '' self.init_fpn( embed_dim=embed_dim, patch_size=patch_size, ) def init_weight(self): pretrained = self.pretrained if pretrained: if 'http' in pretrained: #URL path = paddle.utils.download.get_weights_path_from_url( pretrained) else: #model in local path path = pretrained load_state_dict = paddle.load(path) model_state_dict = self.state_dict() pos_embed_name = "pos_embed" if pos_embed_name in load_state_dict.keys(): load_pos_embed = paddle.to_tensor( load_state_dict[pos_embed_name], dtype="float32") if self.pos_embed.shape != load_pos_embed.shape: pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1)) model_state_dict[pos_embed_name] = self.resize_pos_embed( load_pos_embed, (pos_size, pos_size), (self.pos_h, self.pos_w)) # self.set_state_dict(model_state_dict) load_state_dict[pos_embed_name] = model_state_dict[ pos_embed_name] print("Load pos_embed and resize it from {} to {} .".format( load_pos_embed.shape, self.pos_embed.shape)) self.set_state_dict(load_state_dict) print("Load load_state_dict....") def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False): if patch_size == 16: self.fpn1 = nn.Sequential( nn.Conv2DTranspose( embed_dim, embed_dim, kernel_size=2, stride=2), nn.BatchNorm2D(embed_dim), nn.GELU(), nn.Conv2DTranspose( embed_dim, embed_dim, kernel_size=2, stride=2), ) self.fpn2 = nn.Sequential( nn.Conv2DTranspose( embed_dim, embed_dim, kernel_size=2, stride=2), ) self.fpn3 = Identity() self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2) elif patch_size == 8: self.fpn1 = nn.Sequential( nn.Conv2DTranspose( embed_dim, embed_dim, kernel_size=2, stride=2), ) self.fpn2 = Identity() self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), ) self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), ) if not out_with_norm: self.norm = Identity() else: self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6) def interpolate_pos_encoding(self, x, w, h): npatch = x.shape[1] - 1 N = self.pos_embed.shape[1] - 1 w0 = w // self.patch_embed.patch_size h0 = h // self.patch_embed.patch_size if npatch == N and w0 == self.patch_embed.num_patches_w and h0 == self.patch_embed.num_patches_h: return self.pos_embed class_pos_embed = self.pos_embed[:, 0] patch_pos_embed = self.pos_embed[:, 1:] dim = x.shape[-1] # we add a small number to avoid floating point error in the interpolation # see discussion at https://github.com/facebookresearch/dino/issues/8 # w0, h0 = w0 + 0.1, h0 + 0.1 # patch_pos_embed = nn.functional.interpolate( # 
patch_pos_embed.reshape([ # 1, self.patch_embed.num_patches_w, # self.patch_embed.num_patches_h, dim # ]).transpose((0, 3, 1, 2)), # scale_factor=(w0 / self.patch_embed.num_patches_w, # h0 / self.patch_embed.num_patches_h), # mode='bicubic', ) patch_pos_embed = nn.functional.interpolate( patch_pos_embed.reshape([ 1, self.patch_embed.num_patches_w, self.patch_embed.num_patches_h, dim ]).transpose((0, 3, 1, 2)), (w0, h0), mode='bicubic', ) assert int(w0) == patch_pos_embed.shape[-2] and int( h0) == patch_pos_embed.shape[-1] patch_pos_embed = patch_pos_embed.transpose( (0, 2, 3, 1)).reshape([1, -1, dim]) return paddle.concat( (class_pos_embed.unsqueeze(0), patch_pos_embed), axis=1) def resize_pos_embed(self, pos_embed, old_hw, new_hw): """ Resize pos_embed weight. Args: pos_embed (Tensor): the pos_embed weight old_hw (list[int]): the height and width of old pos_embed new_hw (list[int]): the height and width of new pos_embed Returns: Tensor: the resized pos_embed weight """ cls_pos_embed = pos_embed[:, :1, :] pos_embed = pos_embed[:, 1:, :] pos_embed = pos_embed.transpose([0, 2, 1]) pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]]) pos_embed = F.interpolate( pos_embed, new_hw, mode='bicubic', align_corners=False) pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1) return pos_embed def build_2d_sincos_position_embedding( self, embed_dim=768, temperature=10000., ): h, w = self.patch_embed.patch_shape grid_w = paddle.arange(w, dtype=paddle.float32) grid_h = paddle.arange(h, dtype=paddle.float32) grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' pos_dim = embed_dim // 4 omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim omega = 1. 
/ (temperature**omega) out_w = grid_w.flatten()[..., None] @omega[None] out_h = grid_h.flatten()[..., None] @omega[None] pos_emb = paddle.concat( [ paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h), paddle.cos(out_h) ], axis=1)[None, :, :] pe_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32) pos_embed = paddle.concat([pe_token, pos_emb], axis=1) # pos_embed.stop_gradient = True return pos_embed def forward(self, x): x = x['image'] if isinstance(x, dict) else x _, _, h, w = x.shape x = self.patch_embed(x) B, D, Hp, Wp = x.shape # b * c * h * w cls_tokens = self.cls_token.expand( (B, self.cls_token.shape[-2], self.cls_token.shape[-1])) x = x.flatten(2).transpose([0, 2, 1]) # b * hw * c x = paddle.concat([cls_tokens, x], axis=1) if self.pos_embed is not None: # x = x + self.interpolate_pos_encoding(x, w, h) x = x + self.interpolate_pos_encoding(x, h, w) x = self.pos_drop(x) rel_pos_bias = self.rel_pos_bias( ) if self.rel_pos_bias is not None else None feats = [] for idx, blk in enumerate(self.blocks): if self.use_checkpoint and self.training: x = paddle.distributed.fleet.utils.recompute( blk, x, rel_pos_bias, **{"preserve_rng_state": True}) else: x = blk(x, rel_pos_bias) if idx in self.out_indices: xp = paddle.reshape( paddle.transpose( self.norm(x[:, 1:, :]), perm=[0, 2, 1]), shape=[B, D, Hp, Wp]) feats.append(xp) if self.with_fpn: fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][ -self.num_fpn_levels:] assert len(fpns) == len(feats) or len(feats) == 1, '' outputs = [] for i, m in enumerate(fpns): outputs.append( m(feats[i] if len(feats) == len(fpns) else feats[-1])) return outputs return feats @property def num_layers(self): return len(self.blocks) @property def no_weight_decay(self): return {'pos_embed', 'cls_token'} @property def out_shape(self): return [ ShapeSpec( channels=c, stride=s) for c, s in zip(self.out_channels, self.out_strides) ] ================================================ FILE: ppdet/modeling/backbones/vit_mae.py ================================================ # copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
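# A standalone sketch (toy sizes, illustrative only) of the 2D sin-cos position
# embedding that build_2d_sincos_position_embedding above constructs;
# get_2d_sincos_position_embedding in this file follows the same recipe.
def _sincos_embed_sketch(h=2, w=3, embed_dim=8, temperature=10000.):
    import paddle
    grid_w = paddle.arange(w, dtype=paddle.float32)
    grid_h = paddle.arange(h, dtype=paddle.float32)
    grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)
    pos_dim = embed_dim // 4  # embed_dim must be divisible by 4
    omega = 1. / (temperature**(
        paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim))
    # each grid coordinate contributes sin and cos features at pos_dim frequencies
    out_w = grid_w.flatten()[..., None] @ omega[None]
    out_h = grid_h.flatten()[..., None] @ omega[None]
    return paddle.concat(
        [paddle.sin(out_w), paddle.cos(out_w),
         paddle.sin(out_h), paddle.cos(out_h)],
        axis=1)  # [h * w, embed_dim]; the model prepends a zero cls-token row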
import paddle import paddle.nn as nn import paddle.nn.functional as F import numpy as np import math from paddle import ParamAttr from paddle.regularizer import L2Decay from paddle.nn.initializer import Constant, TruncatedNormal from ppdet.modeling.shape_spec import ShapeSpec from ppdet.core.workspace import register, serializable from .transformer_utils import (zeros_, DropPath, Identity, window_partition, window_unpartition) from ..initializer import linear_init_ __all__ = ['VisionTransformer2D', 'SimpleFeaturePyramid'] class Mlp(nn.Layer): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer='nn.GELU', drop=0., lr_factor=1.0): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear( in_features, hidden_features, weight_attr=ParamAttr(learning_rate=lr_factor), bias_attr=ParamAttr(learning_rate=lr_factor)) self.act = eval(act_layer)() self.fc2 = nn.Linear( hidden_features, out_features, weight_attr=ParamAttr(learning_rate=lr_factor), bias_attr=ParamAttr(learning_rate=lr_factor)) self.drop = nn.Dropout(drop) self._init_weights() def _init_weights(self): linear_init_(self.fc1) linear_init_(self.fc2) def forward(self, x): x = self.drop(self.act(self.fc1(x))) x = self.drop(self.fc2(x)) return x class Attention(nn.Layer): def __init__(self, dim, num_heads=8, qkv_bias=False, attn_bias=False, attn_drop=0., proj_drop=0., use_rel_pos=False, rel_pos_zero_init=True, window_size=None, input_size=None, qk_scale=None, lr_factor=1.0): super().__init__() self.num_heads = num_heads self.head_dim = dim // num_heads self.scale = qk_scale or self.head_dim**-0.5 self.use_rel_pos = use_rel_pos self.input_size = input_size self.rel_pos_zero_init = rel_pos_zero_init self.window_size = window_size self.lr_factor = lr_factor self.qkv = nn.Linear( dim, dim * 3, weight_attr=ParamAttr(learning_rate=lr_factor), bias_attr=ParamAttr(learning_rate=lr_factor) if attn_bias else False) if qkv_bias: self.q_bias = self.create_parameter( shape=([dim]), default_initializer=zeros_) self.v_bias = self.create_parameter( shape=([dim]), default_initializer=zeros_) else: self.q_bias = None self.v_bias = None self.proj = nn.Linear( dim, dim, weight_attr=ParamAttr(learning_rate=lr_factor), bias_attr=ParamAttr(learning_rate=lr_factor)) self.attn_drop = nn.Dropout(attn_drop) if window_size is None: self.window_size = self.input_size[0] self._init_weights() def _init_weights(self): linear_init_(self.qkv) linear_init_(self.proj) if self.use_rel_pos: self.rel_pos_h = self.create_parameter( [2 * self.window_size - 1, self.head_dim], attr=ParamAttr(learning_rate=self.lr_factor), default_initializer=Constant(value=0.)) self.rel_pos_w = self.create_parameter( [2 * self.window_size - 1, self.head_dim], attr=ParamAttr(learning_rate=self.lr_factor), default_initializer=Constant(value=0.)) if not self.rel_pos_zero_init: TruncatedNormal(self.rel_pos_h, std=0.02) TruncatedNormal(self.rel_pos_w, std=0.02) def get_rel_pos(self, seq_size, rel_pos): max_rel_dist = int(2 * seq_size - 1) # Interpolate rel pos if needed. if rel_pos.shape[0] != max_rel_dist: # Interpolate rel pos. 
rel_pos = rel_pos.reshape([1, rel_pos.shape[0], -1]) rel_pos = rel_pos.transpose([0, 2, 1]) rel_pos_resized = F.interpolate( rel_pos, size=(max_rel_dist, ), mode="linear", data_format='NCW') rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist]) rel_pos_resized = rel_pos_resized.transpose([1, 0]) else: rel_pos_resized = rel_pos coords = paddle.arange(seq_size, dtype='float32') relative_coords = coords.unsqueeze(-1) - coords.unsqueeze(0) relative_coords += (seq_size - 1) relative_coords = relative_coords.astype('int64').flatten() return paddle.index_select(rel_pos_resized, relative_coords).reshape( [seq_size, seq_size, self.head_dim]) def add_decomposed_rel_pos(self, attn, q, h, w): """ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. Args: attn (Tensor): attention map. q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). Returns: attn (Tensor): attention map with added relative positional embeddings. """ Rh = self.get_rel_pos(h, self.rel_pos_h) Rw = self.get_rel_pos(w, self.rel_pos_w) B, _, dim = q.shape r_q = q.reshape([B, h, w, dim]) # bhwc, hch->bhwh1 # bwhc, wcw->bhw1w rel_h = paddle.einsum("bhwc,hkc->bhwk", r_q, Rh).unsqueeze(-1) rel_w = paddle.einsum("bhwc,wkc->bhwk", r_q, Rw).unsqueeze(-2) attn = attn.reshape([B, h, w, h, w]) + rel_h + rel_w return attn.reshape([B, h * w, h * w]) def forward(self, x): B, H, W, C = x.shape if self.q_bias is not None: qkv_bias = paddle.concat( (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias)) qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias) else: qkv = self.qkv(x).reshape( [B, H * W, 3, self.num_heads, self.head_dim]).transpose( [2, 0, 3, 1, 4]).reshape( [3, B * self.num_heads, H * W, self.head_dim]) q, k, v = qkv[0], qkv[1], qkv[2] attn = q.matmul(k.transpose([0, 2, 1])) * self.scale if self.use_rel_pos: attn = self.add_decomposed_rel_pos(attn, q, H, W) attn = F.softmax(attn, axis=-1) attn = self.attn_drop(attn) x = attn.matmul(v).reshape( [B, self.num_heads, H * W, self.head_dim]).transpose( [0, 2, 1, 3]).reshape([B, H, W, C]) x = self.proj(x) return x class Block(nn.Layer): def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, attn_bias=False, qk_scale=None, init_values=None, drop=0., attn_drop=0., drop_path=0., use_rel_pos=True, rel_pos_zero_init=True, window_size=None, input_size=None, act_layer='nn.GELU', norm_layer='nn.LayerNorm', lr_factor=1.0, epsilon=1e-5): super().__init__() self.window_size = window_size self.norm1 = eval(norm_layer)(dim, weight_attr=ParamAttr( learning_rate=lr_factor, regularizer=L2Decay(0.0)), bias_attr=ParamAttr( learning_rate=lr_factor, regularizer=L2Decay(0.0)), epsilon=epsilon) self.attn = Attention( dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_bias=attn_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, window_size=window_size, input_size=input_size, lr_factor=lr_factor) self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() self.norm2 = eval(norm_layer)(dim, weight_attr=ParamAttr( learning_rate=lr_factor, regularizer=L2Decay(0.0)), bias_attr=ParamAttr( learning_rate=lr_factor, regularizer=L2Decay(0.0)), epsilon=epsilon) self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop, lr_factor=lr_factor) if init_values is not None: self.gamma_1 = self.create_parameter( shape=([dim]), default_initializer=Constant(value=init_values)) self.gamma_2 = self.create_parameter( shape=([dim]), default_initializer=Constant(value=init_values)) else: self.gamma_1, self.gamma_2 = None, None def forward(self, x): y = self.norm1(x) if self.window_size is not None: y, pad_hw, num_hw = window_partition(y, self.window_size) y = self.attn(y) if self.gamma_1 is not None: y = self.gamma_1 * y if self.window_size is not None: y = window_unpartition(y, pad_hw, num_hw, (x.shape[1], x.shape[2])) x = x + self.drop_path(y) if self.gamma_2 is None: x = x + self.drop_path(self.mlp(self.norm2(x))) else: x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) return x class PatchEmbed(nn.Layer): """ Image to Patch Embedding """ def __init__(self, img_size=(224, 224), patch_size=16, in_chans=3, embed_dim=768, lr_factor=0.01): super().__init__() self.img_size = img_size self.patch_size = patch_size self.proj = nn.Conv2D( in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, weight_attr=ParamAttr(learning_rate=lr_factor), bias_attr=ParamAttr(learning_rate=lr_factor)) @property def num_patches_in_h(self): return self.img_size[1] // self.patch_size @property def num_patches_in_w(self): return self.img_size[0] // self.patch_size def forward(self, x): out = self.proj(x) return out @register @serializable class VisionTransformer2D(nn.Layer): """ Vision Transformer with support for patch input """ def __init__(self, img_size=(1024, 1024), patch_size=16, in_chans=3, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=False, attn_bias=False, qk_scale=None, init_values=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., act_layer='nn.GELU', norm_layer='nn.LayerNorm', lr_decay_rate=1.0, global_attn_indexes=(2, 5, 8, 11), use_abs_pos=False, use_rel_pos=False, use_abs_pos_emb=False, use_sincos_pos_emb=False, rel_pos_zero_init=True, epsilon=1e-5, final_norm=False, pretrained=None, window_size=None, out_indices=(11, ), with_fpn=False, use_checkpoint=False, *args, **kwargs): super().__init__() self.img_size = img_size self.patch_size = patch_size self.embed_dim = embed_dim self.num_heads = num_heads self.depth = depth self.global_attn_indexes = global_attn_indexes self.epsilon = epsilon self.with_fpn = with_fpn self.use_checkpoint = use_checkpoint self.patch_h = img_size[0] // patch_size self.patch_w = img_size[1] // patch_size self.num_patches = self.patch_h * self.patch_w self.use_abs_pos = use_abs_pos self.use_abs_pos_emb = use_abs_pos_emb self.patch_embed = PatchEmbed( img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) dpr = np.linspace(0, drop_path_rate, depth) if use_checkpoint: paddle.seed(0) if use_abs_pos_emb: self.pos_w = self.patch_embed.num_patches_in_w self.pos_h = self.patch_embed.num_patches_in_h self.pos_embed = self.create_parameter( shape=(1, self.pos_w * self.pos_h + 1, embed_dim), default_initializer=paddle.nn.initializer.TruncatedNormal( std=.02)) elif use_sincos_pos_emb: pos_embed = self.get_2d_sincos_position_embedding(self.patch_h, self.patch_w) self.pos_embed = pos_embed self.pos_embed = 
self.create_parameter(shape=pos_embed.shape) self.pos_embed.set_value(pos_embed.numpy()) self.pos_embed.stop_gradient = True else: self.pos_embed = None self.blocks = nn.LayerList([ Block( embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, attn_bias=attn_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, window_size=None if i in self.global_attn_indexes else window_size, input_size=[self.patch_h, self.patch_w], act_layer=act_layer, lr_factor=self.get_vit_lr_decay_rate(i, lr_decay_rate), norm_layer=norm_layer, init_values=init_values, epsilon=epsilon) for i in range(depth) ]) assert len(out_indices) <= 4, 'out_indices out of bound' self.out_indices = out_indices self.pretrained = pretrained self.init_weight() self.out_channels = [embed_dim for _ in range(len(out_indices))] self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [ patch_size for _ in range(len(out_indices)) ] self.norm = Identity() if self.with_fpn: self.init_fpn( embed_dim=embed_dim, patch_size=patch_size, out_with_norm=final_norm) def get_vit_lr_decay_rate(self, layer_id, lr_decay_rate): return lr_decay_rate**(self.depth - layer_id) def init_weight(self): pretrained = self.pretrained if pretrained: if 'http' in pretrained: path = paddle.utils.download.get_weights_path_from_url( pretrained) else: path = pretrained load_state_dict = paddle.load(path) model_state_dict = self.state_dict() pos_embed_name = "pos_embed" if pos_embed_name in load_state_dict.keys( ) and self.use_abs_pos_emb: load_pos_embed = paddle.to_tensor( load_state_dict[pos_embed_name], dtype="float32") if self.pos_embed.shape != load_pos_embed.shape: pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1)) model_state_dict[pos_embed_name] = self.resize_pos_embed( load_pos_embed, (pos_size, pos_size), (self.pos_h, self.pos_w)) # self.set_state_dict(model_state_dict) load_state_dict[pos_embed_name] = model_state_dict[ pos_embed_name] print("Load pos_embed and resize it from {} to {} .".format( load_pos_embed.shape, self.pos_embed.shape)) self.set_state_dict(load_state_dict) print("Load load_state_dict....") def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False): if patch_size == 16: self.fpn1 = nn.Sequential( nn.Conv2DTranspose( embed_dim, embed_dim, kernel_size=2, stride=2), nn.BatchNorm2D(embed_dim), nn.GELU(), nn.Conv2DTranspose( embed_dim, embed_dim, kernel_size=2, stride=2), ) self.fpn2 = nn.Sequential( nn.Conv2DTranspose( embed_dim, embed_dim, kernel_size=2, stride=2), ) self.fpn3 = Identity() self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2) elif patch_size == 8: self.fpn1 = nn.Sequential( nn.Conv2DTranspose( embed_dim, embed_dim, kernel_size=2, stride=2), ) self.fpn2 = Identity() self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), ) self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), ) if not out_with_norm: self.norm = Identity() else: self.norm = nn.LayerNorm(embed_dim, epsilon=self.epsilon) def resize_pos_embed(self, pos_embed, old_hw, new_hw): """ Resize pos_embed weight. 
Args: pos_embed (Tensor): the pos_embed weight old_hw (list[int]): the height and width of old pos_embed new_hw (list[int]): the height and width of new pos_embed Returns: Tensor: the resized pos_embed weight """ cls_pos_embed = pos_embed[:, :1, :] pos_embed = pos_embed[:, 1:, :] pos_embed = pos_embed.transpose([0, 2, 1]) pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]]) pos_embed = F.interpolate( pos_embed, new_hw, mode='bicubic', align_corners=False) pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1) return pos_embed def get_2d_sincos_position_embedding(self, h, w, temperature=10000.): grid_y, grid_x = paddle.meshgrid( paddle.arange( h, dtype=paddle.float32), paddle.arange( w, dtype=paddle.float32)) assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' pos_dim = self.embed_dim // 4 omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim omega = (1. / (temperature**omega)).unsqueeze(0) out_x = grid_x.reshape([-1, 1]).matmul(omega) out_y = grid_y.reshape([-1, 1]).matmul(omega) pos_emb = paddle.concat( [ paddle.sin(out_y), paddle.cos(out_y), paddle.sin(out_x), paddle.cos(out_x) ], axis=1) return pos_emb.reshape([1, h, w, self.embed_dim]) def forward(self, inputs): x = self.patch_embed(inputs['image']).transpose([0, 2, 3, 1]) B, Hp, Wp, _ = x.shape if self.use_abs_pos: x = x + self.get_2d_sincos_position_embedding(Hp, Wp) if self.use_abs_pos_emb: x = x + self.resize_pos_embed(self.pos_embed, (self.pos_h, self.pos_w), (Hp, Wp)) feats = [] for idx, blk in enumerate(self.blocks): if self.use_checkpoint and self.training: x = paddle.distributed.fleet.utils.recompute( blk, x, **{"preserve_rng_state": True}) else: x = blk(x) if idx in self.out_indices: feats.append(self.norm(x.transpose([0, 3, 1, 2]))) if self.with_fpn: fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4] for i in range(len(feats)): feats[i] = fpns[i](feats[i]) return feats @property def num_layers(self): return len(self.blocks) @property def no_weight_decay(self): return {'pos_embed', 'cls_token'} @property def out_shape(self): return [ ShapeSpec( channels=c, stride=s) for c, s in zip(self.out_channels, self.out_strides) ] class LayerNorm(nn.Layer): """ A LayerNorm variant, popularized by Transformers, that performs point-wise mean and variance normalization over the channel dimension for inputs that have shape (batch_size, channels, height, width). Note that, the modified LayerNorm on used in ResBlock and SimpleFeaturePyramid. In ViT, we use the nn.LayerNorm """ def __init__(self, normalized_shape, eps=1e-6): super().__init__() self.weight = self.create_parameter([normalized_shape]) self.bias = self.create_parameter([normalized_shape]) self.eps = eps self.normalized_shape = (normalized_shape, ) def forward(self, x): u = x.mean(1, keepdim=True) s = (x - u).pow(2).mean(1, keepdim=True) x = (x - u) / paddle.sqrt(s + self.eps) x = self.weight[:, None, None] * x + self.bias[:, None, None] return x @register @serializable class SimpleFeaturePyramid(nn.Layer): def __init__(self, in_channels, out_channels, spatial_scales, num_levels=4, use_bias=False): """ Args: in_channels (list[int]): input channels of each level which can be derived from the output shape of backbone by from_config out_channel (int): output channel of each level. 
spatial_scales (list[float]): list of scaling factors to upsample or downsample the input features for creating pyramid features which can be derived from the output shape of backbone by from_config num_levels (int): number of levels of output features. use_bias (bool): whether use bias or not. """ super(SimpleFeaturePyramid, self).__init__() self.in_channels = in_channels[0] self.out_channels = out_channels self.num_levels = num_levels self.stages = [] dim = self.in_channels if num_levels == 4: scale_factors = [2.0, 1.0, 0.5] elif num_levels == 5: scale_factors = [4.0, 2.0, 1.0, 0.5] else: raise NotImplementedError( f"num_levels={num_levels} is not supported yet.") dim = in_channels[0] for idx, scale in enumerate(scale_factors): out_dim = dim if scale == 4.0: layers = [ nn.Conv2DTranspose( dim, dim // 2, kernel_size=2, stride=2), nn.LayerNorm(dim // 2), nn.GELU(), nn.Conv2DTranspose( dim // 2, dim // 4, kernel_size=2, stride=2), ] out_dim = dim // 4 elif scale == 2.0: layers = [ nn.Conv2DTranspose( dim, dim // 2, kernel_size=2, stride=2) ] out_dim = dim // 2 elif scale == 1.0: layers = [] elif scale == 0.5: layers = [nn.MaxPool2D(kernel_size=2, stride=2)] layers.extend([ nn.Conv2D( out_dim, out_channels, kernel_size=1, bias_attr=use_bias, ), LayerNorm(out_channels), nn.Conv2D( out_channels, out_channels, kernel_size=3, padding=1, bias_attr=use_bias, ), LayerNorm(out_channels) ]) layers = nn.Sequential(*layers) stage = -int(math.log2(spatial_scales[0] * scale_factors[idx])) self.add_sublayer(f"simfp_{stage}", layers) self.stages.append(layers) # top block output feature maps. self.top_block = nn.Sequential( nn.MaxPool2D( kernel_size=1, stride=2, padding=0)) @classmethod def from_config(cls, cfg, input_shape): return { 'in_channels': [i.channels for i in input_shape], 'spatial_scales': [1.0 / i.stride for i in input_shape], } @property def out_shape(self): return [ ShapeSpec(channels=self.out_channels) for _ in range(self.num_levels) ] def forward(self, feats): """ Args: x: Tensor of shape (N,C,H,W). """ features = feats[0] results = [] for stage in self.stages: results.append(stage(features)) top_block_in_feature = results[-1] results.append(self.top_block(top_block_in_feature)) assert self.num_levels == len(results) return results ================================================ FILE: ppdet/modeling/backbones/vitpose.py ================================================ # copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
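# A short sketch of what the channels-first LayerNorm defined in vit_mae.py
# above computes: it normalizes each spatial position over the channel axis of
# an NCHW tensor, matching F.layer_norm with channels moved last. The tensor
# sizes below are illustrative assumptions.
def _channels_first_layernorm_sketch():
    import paddle
    import paddle.nn.functional as F
    x = paddle.randn([2, 16, 4, 4])  # N, C, H, W
    u = x.mean(1, keepdim=True)
    s = (x - u).pow(2).mean(1, keepdim=True)
    y = (x - u) / paddle.sqrt(s + 1e-6)  # per-position normalization over C
    ref = F.layer_norm(x.transpose([0, 2, 3, 1]), [16], epsilon=1e-6)
    ref = ref.transpose([0, 3, 1, 2])
    # y and ref agree up to the learned affine weight/bias (identity here)
    return y, ref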
# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py # reference: https://arxiv.org/abs/2010.11929 from collections.abc import Callable import numpy as np import paddle import paddle.nn as nn from paddle.nn.initializer import TruncatedNormal, Constant, Normal from ppdet.core.workspace import register, serializable trunc_normal_ = TruncatedNormal(std=.02) def to_2tuple(x): if isinstance(x, (list, tuple)): return x return tuple([x] * 2) def drop_path(x, drop_prob=0., training=False): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... """ if drop_prob == 0. or not training: return x keep_prob = paddle.to_tensor(1.0 - drop_prob).astype(x.dtype) shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) random_tensor = paddle.floor(random_tensor) # binarize output = x.divide(keep_prob) * random_tensor return output class DropPath(nn.Layer): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """ def __init__(self, drop_prob=None): super(DropPath, self).__init__() self.drop_prob = drop_prob def forward(self, x): return drop_path(x, self.drop_prob, self.training) class Identity(nn.Layer): def __init__(self): super(Identity, self).__init__() def forward(self, input): return input class Mlp(nn.Layer): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.fc2(x) x = self.drop(x) return x class Attention(nn.Layer): def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim**-0.5 self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) def forward(self, x): N, C = x.shape[1:] qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // self.num_heads)).transpose((2, 0, 3, 1, 4)) q, k, v = qkv[0], qkv[1], qkv[2] attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale attn = nn.functional.softmax(attn, axis=-1) attn = self.attn_drop(attn) x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) x = self.proj(x) x = self.proj_drop(x) return x class Block(nn.Layer): def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer='nn.LayerNorm', epsilon=1e-5): super().__init__() if isinstance(norm_layer, str): self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) elif isinstance(norm_layer, Callable): self.norm1 = norm_layer(dim) else: raise TypeError( "The norm_layer must be str or paddle.nn.layer.Layer class") self.attn = Attention( dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here 
        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
        if isinstance(norm_layer, str):
            self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
        elif isinstance(norm_layer, Callable):
            self.norm2 = norm_layer(dim)
        else:
            raise TypeError(
                "The norm_layer must be str or paddle.nn.layer.Layer class")
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=mlp_hidden_dim,
                       act_layer=act_layer,
                       drop=drop)

    def forward(self, x):
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x


class PatchEmbed(nn.Layer):
    """ Image to Patch Embedding
    """

    def __init__(self,
                 img_size=224,
                 patch_size=16,
                 in_chans=3,
                 embed_dim=768,
                 ratio=1):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        num_patches = (img_size[1] // patch_size[1]) * (
            img_size[0] // patch_size[0]) * (ratio**2)
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches
        self.proj = nn.Conv2D(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=(patch_size[0] // ratio),
            padding=(4 + 2 * (ratio // 2 - 1), 4 + 2 * (ratio // 2 - 1)))

    def forward(self, x):
        B, C, H, W = x.shape
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        x = self.proj(x)
        return x


@register
@serializable
class ViT(nn.Layer):
    """ Vision Transformer with support for patch input.

    This module differs from ppdet's VisionTransformer
    (ppdet/modeling/backbones/vision_transformer.py) in four ways:
    1. PatchEmbed.proj sets padding=(4 + 2 * (ratio // 2 - 1),
       4 + 2 * (ratio // 2 - 1)); VisionTransformer does not.
    2. The Attention module uses a standard qkv projection, while
       VisionTransformer provides more options.
    3. The Mlp module applies Dropout once, while VisionTransformer
       applies it twice.
    4. VisionTransformer provides FPN layers; this module does not.
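
    A minimal usage sketch (the sizes below are illustrative assumptions,
    not required settings):

        import paddle
        vit = ViT(img_size=(256, 192), patch_size=16, embed_dim=768,
                  depth=12, num_heads=12)
        feat = vit.forward_features(paddle.randn([1, 3, 256, 192]))
        # feat has shape [B, embed_dim, Hp, Wp] = [1, 768, 16, 12]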
""" def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer='nn.LayerNorm', epsilon=1e-5, ratio=1, pretrained=None, **kwargs): super().__init__() self.pretrained = pretrained self.num_features = self.embed_dim = embed_dim self.patch_embed = PatchEmbed( img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, ratio=ratio) num_patches = self.patch_embed.num_patches self.pos_embed = self.create_parameter( shape=(1, num_patches + 1, embed_dim), default_initializer=trunc_normal_) self.add_parameter("pos_embed", self.pos_embed) dpr = np.linspace(0, drop_path_rate, depth, dtype='float32') self.blocks = nn.LayerList([ Block( dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, epsilon=epsilon) for i in range(depth) ]) self.last_norm = eval(norm_layer)(embed_dim, epsilon=epsilon) trunc_normal_(self.pos_embed) self._init_weights() def _init_weights(self): pretrained = self.pretrained if pretrained: if 'http' in pretrained: #URL path = paddle.utils.download.get_weights_path_from_url( pretrained) else: #model in local path path = pretrained load_state_dict = paddle.load(path) self.set_state_dict(load_state_dict) print("Load load_state_dict:", path) def forward_features(self, x): B = x.shape[0] x = self.patch_embed(x) B, D, Hp, Wp = x.shape x = x.flatten(2).transpose([0, 2, 1]) x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1] for blk in self.blocks: x = blk(x) x = self.last_norm(x) xp = paddle.reshape( paddle.transpose( x, perm=[0, 2, 1]), shape=[B, -1, Hp, Wp]) return xp ================================================ FILE: ppdet/modeling/bbox_utils.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import math import paddle import numpy as np def bbox2delta(src_boxes, tgt_boxes, weights=[1.0, 1.0, 1.0, 1.0]): """Encode bboxes to deltas. """ src_w = src_boxes[:, 2] - src_boxes[:, 0] src_h = src_boxes[:, 3] - src_boxes[:, 1] src_ctr_x = src_boxes[:, 0] + 0.5 * src_w src_ctr_y = src_boxes[:, 1] + 0.5 * src_h tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0] tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1] tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h wx, wy, ww, wh = weights dx = wx * (tgt_ctr_x - src_ctr_x) / src_w dy = wy * (tgt_ctr_y - src_ctr_y) / src_h dw = ww * paddle.log(tgt_w / src_w) dh = wh * paddle.log(tgt_h / src_h) deltas = paddle.stack((dx, dy, dw, dh), axis=1) return deltas def delta2bbox(deltas, boxes, weights=[1.0, 1.0, 1.0, 1.0], max_shape=None): """Decode deltas to boxes. Used in RCNNBox,CascadeHead,RCNNHead,RetinaHead. Note: return tensor shape [n,1,4] If you want to add a reshape, please add after the calling code instead of here. 
""" clip_scale = math.log(1000.0 / 16) widths = boxes[:, 2] - boxes[:, 0] heights = boxes[:, 3] - boxes[:, 1] ctr_x = boxes[:, 0] + 0.5 * widths ctr_y = boxes[:, 1] + 0.5 * heights wx, wy, ww, wh = weights dx = deltas[:, 0::4] / wx dy = deltas[:, 1::4] / wy dw = deltas[:, 2::4] / ww dh = deltas[:, 3::4] / wh # Prevent sending too large values into paddle.exp() dw = paddle.clip(dw, max=clip_scale) dh = paddle.clip(dh, max=clip_scale) pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1) pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1) pred_w = paddle.exp(dw) * widths.unsqueeze(1) pred_h = paddle.exp(dh) * heights.unsqueeze(1) pred_boxes = [] pred_boxes.append(pred_ctr_x - 0.5 * pred_w) pred_boxes.append(pred_ctr_y - 0.5 * pred_h) pred_boxes.append(pred_ctr_x + 0.5 * pred_w) pred_boxes.append(pred_ctr_y + 0.5 * pred_h) pred_boxes = paddle.stack(pred_boxes, axis=-1) if max_shape is not None: pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip( min=0, max=max_shape[1]) pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip( min=0, max=max_shape[0]) return pred_boxes def bbox2delta_v2(src_boxes, tgt_boxes, delta_mean=[0.0, 0.0, 0.0, 0.0], delta_std=[1.0, 1.0, 1.0, 1.0]): """Encode bboxes to deltas. Modified from bbox2delta() which just use weight parameters to multiply deltas. """ src_w = src_boxes[:, 2] - src_boxes[:, 0] src_h = src_boxes[:, 3] - src_boxes[:, 1] src_ctr_x = src_boxes[:, 0] + 0.5 * src_w src_ctr_y = src_boxes[:, 1] + 0.5 * src_h tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0] tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1] tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h dx = (tgt_ctr_x - src_ctr_x) / src_w dy = (tgt_ctr_y - src_ctr_y) / src_h dw = paddle.log(tgt_w / src_w) dh = paddle.log(tgt_h / src_h) deltas = paddle.stack((dx, dy, dw, dh), axis=1) deltas = ( deltas - paddle.to_tensor(delta_mean)) / paddle.to_tensor(delta_std) return deltas def delta2bbox_v2(deltas, boxes, delta_mean=[0.0, 0.0, 0.0, 0.0], delta_std=[1.0, 1.0, 1.0, 1.0], max_shape=None, ctr_clip=32.0): """Decode deltas to bboxes. Modified from delta2bbox() which just use weight parameters to be divided by deltas. Used in YOLOFHead. Note: return tensor shape [n,1,4] If you want to add a reshape, please add after the calling code instead of here. 
""" clip_scale = math.log(1000.0 / 16) widths = boxes[:, 2] - boxes[:, 0] heights = boxes[:, 3] - boxes[:, 1] ctr_x = boxes[:, 0] + 0.5 * widths ctr_y = boxes[:, 1] + 0.5 * heights deltas = deltas * paddle.to_tensor(delta_std) + paddle.to_tensor(delta_mean) dx = deltas[:, 0::4] dy = deltas[:, 1::4] dw = deltas[:, 2::4] dh = deltas[:, 3::4] # Prevent sending too large values into paddle.exp() dx = dx * widths.unsqueeze(1) dy = dy * heights.unsqueeze(1) if ctr_clip is not None: dx = paddle.clip(dx, max=ctr_clip, min=-ctr_clip) dy = paddle.clip(dy, max=ctr_clip, min=-ctr_clip) dw = paddle.clip(dw, max=clip_scale) dh = paddle.clip(dh, max=clip_scale) else: dw = dw.clip(min=-clip_scale, max=clip_scale) dh = dh.clip(min=-clip_scale, max=clip_scale) pred_ctr_x = dx + ctr_x.unsqueeze(1) pred_ctr_y = dy + ctr_y.unsqueeze(1) pred_w = paddle.exp(dw) * widths.unsqueeze(1) pred_h = paddle.exp(dh) * heights.unsqueeze(1) pred_boxes = [] pred_boxes.append(pred_ctr_x - 0.5 * pred_w) pred_boxes.append(pred_ctr_y - 0.5 * pred_h) pred_boxes.append(pred_ctr_x + 0.5 * pred_w) pred_boxes.append(pred_ctr_y + 0.5 * pred_h) pred_boxes = paddle.stack(pred_boxes, axis=-1) if max_shape is not None: pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip( min=0, max=max_shape[1]) pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip( min=0, max=max_shape[0]) return pred_boxes def expand_bbox(bboxes, scale): w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5 h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5 x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5 y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5 w_half *= scale h_half *= scale bboxes_exp = np.zeros(bboxes.shape, dtype=np.float32) bboxes_exp[:, 0] = x_c - w_half bboxes_exp[:, 2] = x_c + w_half bboxes_exp[:, 1] = y_c - h_half bboxes_exp[:, 3] = y_c + h_half return bboxes_exp def clip_bbox(boxes, im_shape): h, w = im_shape[0], im_shape[1] x1 = boxes[:, 0].clip(0, w) y1 = boxes[:, 1].clip(0, h) x2 = boxes[:, 2].clip(0, w) y2 = boxes[:, 3].clip(0, h) return paddle.stack([x1, y1, x2, y2], axis=1) def nonempty_bbox(boxes, min_size=0, return_mask=False): w = boxes[:, 2] - boxes[:, 0] h = boxes[:, 3] - boxes[:, 1] mask = paddle.logical_and(h > min_size, w > min_size) if return_mask: return mask keep = paddle.nonzero(mask).flatten() return keep def bbox_area(boxes): return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) def bbox_overlaps(boxes1, boxes2): """ Calculate overlaps between boxes1 and boxes2 Args: boxes1 (Tensor): boxes with shape [M, 4] boxes2 (Tensor): boxes with shape [N, 4] Return: overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N] """ M = boxes1.shape[0] N = boxes2.shape[0] if M * N == 0: return paddle.zeros([M, N], dtype='float32') area1 = bbox_area(boxes1) area2 = bbox_area(boxes2) xy_max = paddle.minimum( paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:]) xy_min = paddle.maximum( paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2]) width_height = xy_max - xy_min width_height = width_height.clip(min=0) inter = width_height.prod(axis=2) overlaps = paddle.where(inter > 0, inter / (paddle.unsqueeze(area1, 1) + area2 - inter), paddle.zeros_like(inter)) return overlaps def batch_bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6): """Calculate overlap between two set of bboxes. If ``is_aligned `` is ``False``, then calculate the overlaps between each bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned pair of bboxes1 and bboxes2. Args: bboxes1 (Tensor): shape (B, m, 4) in format or empty. 
bboxes2 (Tensor): shape (B, n, 4) in format or empty. B indicates the batch dim, in shape (B1, B2, ..., Bn). If ``is_aligned `` is ``True``, then m and n must be equal. mode (str): "iou" (intersection over union) or "iof" (intersection over foreground). is_aligned (bool, optional): If True, then m and n must be equal. Default False. eps (float, optional): A value added to the denominator for numerical stability. Default 1e-6. Returns: Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,) """ assert mode in ['iou', 'iof', 'giou'], 'Unsupported mode {}'.format(mode) # Either the boxes are empty or the length of boxes's last dimenstion is 4 assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0) assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0) # Batch dim must be the same # Batch dim: (B1, B2, ... Bn) assert bboxes1.shape[:-2] == bboxes2.shape[:-2] batch_shape = bboxes1.shape[:-2] rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0 cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0 if is_aligned: assert rows == cols if rows * cols == 0: if is_aligned: return paddle.full(batch_shape + (rows, ), 1) else: return paddle.full(batch_shape + (rows, cols), 1) area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1]) area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1]) if is_aligned: lt = paddle.maximum(bboxes1[:, :2], bboxes2[:, :2]) # [B, rows, 2] rb = paddle.minimum(bboxes1[:, 2:], bboxes2[:, 2:]) # [B, rows, 2] wh = (rb - lt).clip(min=0) # [B, rows, 2] overlap = wh[:, 0] * wh[:, 1] if mode in ['iou', 'giou']: union = area1 + area2 - overlap else: union = area1 if mode == 'giou': enclosed_lt = paddle.minimum(bboxes1[:, :2], bboxes2[:, :2]) enclosed_rb = paddle.maximum(bboxes1[:, 2:], bboxes2[:, 2:]) else: lt = paddle.maximum(bboxes1[:, :2].reshape([rows, 1, 2]), bboxes2[:, :2]) # [B, rows, cols, 2] rb = paddle.minimum(bboxes1[:, 2:].reshape([rows, 1, 2]), bboxes2[:, 2:]) # [B, rows, cols, 2] wh = (rb - lt).clip(min=0) # [B, rows, cols, 2] overlap = wh[:, :, 0] * wh[:, :, 1] if mode in ['iou', 'giou']: union = area1.reshape([rows,1]) \ + area2.reshape([1,cols]) - overlap else: union = area1[:, None] if mode == 'giou': enclosed_lt = paddle.minimum(bboxes1[:, :2].reshape([rows, 1, 2]), bboxes2[:, :2]) enclosed_rb = paddle.maximum(bboxes1[:, 2:].reshape([rows, 1, 2]), bboxes2[:, 2:]) eps = paddle.to_tensor([eps]) union = paddle.maximum(union, eps) ious = overlap / union if mode in ['iou', 'iof']: return ious # calculate gious enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0) enclose_area = enclose_wh[:, :, 0] * enclose_wh[:, :, 1] enclose_area = paddle.maximum(enclose_area, eps) gious = ious - (enclose_area - union) / enclose_area return 1 - gious def xywh2xyxy(box): x, y, w, h = box x1 = x - w * 0.5 y1 = y - h * 0.5 x2 = x + w * 0.5 y2 = y + h * 0.5 return [x1, y1, x2, y2] def make_grid(h, w, dtype): yv, xv = paddle.meshgrid([paddle.arange(h), paddle.arange(w)]) return paddle.stack((xv, yv), 2).cast(dtype=dtype) def decode_yolo(box, anchor, downsample_ratio): """decode yolo box Args: box (list): [x, y, w, h], all have the shape [b, na, h, w, 1] anchor (list): anchor with the shape [na, 2] downsample_ratio (int): downsample ratio, default 32 scale (float): scale, default 1. 
Return: box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1] """ x, y, w, h = box na, grid_h, grid_w = x.shape[1:4] grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2)) x1 = (x + grid[:, :, :, :, 0:1]) / grid_w y1 = (y + grid[:, :, :, :, 1:2]) / grid_h anchor = paddle.to_tensor(anchor, dtype=x.dtype) anchor = anchor.reshape((1, na, 1, 1, 2)) w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w) h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h) return [x1, y1, w1, h1] def batch_iou_similarity(box1, box2, eps=1e-9): """Calculate iou of box1 and box2 in batch Args: box1 (Tensor): box with the shape [N, M1, 4] box2 (Tensor): box with the shape [N, M2, 4] Return: iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2] """ box1 = box1.unsqueeze(2) # [N, M1, 4] -> [N, M1, 1, 4] box2 = box2.unsqueeze(1) # [N, M2, 4] -> [N, 1, M2, 4] px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4] gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4] x1y1 = paddle.maximum(px1y1, gx1y1) x2y2 = paddle.minimum(px2y2, gx2y2) overlap = (x2y2 - x1y1).clip(0).prod(-1) area1 = (px2y2 - px1y1).clip(0).prod(-1) area2 = (gx2y2 - gx1y1).clip(0).prod(-1) union = area1 + area2 - overlap + eps return overlap / union def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9): """calculate the iou of box1 and box2 Args: box1 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] box2 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] giou (bool): whether use giou or not, default False diou (bool): whether use diou or not, default False ciou (bool): whether use ciou or not, default False eps (float): epsilon to avoid divide by zero Return: iou (Tensor): iou of box1 and box1, with the shape [b, na, h, w, 1] """ px1, py1, px2, py2 = box1 gx1, gy1, gx2, gy2 = box2 x1 = paddle.maximum(px1, gx1) y1 = paddle.maximum(py1, gy1) x2 = paddle.minimum(px2, gx2) y2 = paddle.minimum(py2, gy2) overlap = ((x2 - x1).clip(0)) * ((y2 - y1).clip(0)) area1 = (px2 - px1) * (py2 - py1) area1 = area1.clip(0) area2 = (gx2 - gx1) * (gy2 - gy1) area2 = area2.clip(0) union = area1 + area2 - overlap + eps iou = overlap / union if giou or ciou or diou: # convex w, h cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1) ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1) if giou: c_area = cw * ch + eps return iou - (c_area - union) / c_area else: # convex diagonal squared c2 = cw**2 + ch**2 + eps # center distance rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4 if diou: return iou - rho2 / c2 else: w1, h1 = px2 - px1, py2 - py1 + eps w2, h2 = gx2 - gx1, gy2 - gy1 + eps delta = paddle.atan(w1 / h1) - paddle.atan(w2 / h2) v = (4 / math.pi**2) * paddle.pow(delta, 2) alpha = v / (1 + eps - iou + v) alpha.stop_gradient = True return iou - (rho2 / c2 + v * alpha) else: return iou def bbox_iou_np_expand(box1, box2, x1y1x2y2=True, eps=1e-16): """ Calculate the iou of box1 and box2 with numpy. 
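    For example, two boxes [0, 0, 2, 2] and [1, 1, 3, 3] in x1y1x2y2 style
    intersect in a 1x1 square, so their IoU is 1 / (4 + 4 - 1) ~= 0.143.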
Args: box1 (ndarray): [N, 4] box2 (ndarray): [M, 4], usually N != M x1y1x2y2 (bool): whether in x1y1x2y2 stype, default True eps (float): epsilon to avoid divide by zero Return: iou (ndarray): iou of box1 and box2, [N, M] """ N, M = len(box1), len(box2) # usually N != M if x1y1x2y2: b1_x1, b1_y1 = box1[:, 0], box1[:, 1] b1_x2, b1_y2 = box1[:, 2], box1[:, 3] b2_x1, b2_y1 = box2[:, 0], box2[:, 1] b2_x2, b2_y2 = box2[:, 2], box2[:, 3] else: # cxcywh style # Transform from center and width to exact coordinates b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 # get the coordinates of the intersection rectangle inter_rect_x1 = np.zeros((N, M), dtype=np.float32) inter_rect_y1 = np.zeros((N, M), dtype=np.float32) inter_rect_x2 = np.zeros((N, M), dtype=np.float32) inter_rect_y2 = np.zeros((N, M), dtype=np.float32) for i in range(len(box2)): inter_rect_x1[:, i] = np.maximum(b1_x1, b2_x1[i]) inter_rect_y1[:, i] = np.maximum(b1_y1, b2_y1[i]) inter_rect_x2[:, i] = np.minimum(b1_x2, b2_x2[i]) inter_rect_y2[:, i] = np.minimum(b1_y2, b2_y2[i]) # Intersection area inter_area = np.maximum(inter_rect_x2 - inter_rect_x1, 0) * np.maximum( inter_rect_y2 - inter_rect_y1, 0) # Union Area b1_area = np.repeat( ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).reshape(-1, 1), M, axis=-1) b2_area = np.repeat( ((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).reshape(1, -1), N, axis=0) ious = inter_area / (b1_area + b2_area - inter_area + eps) return ious def bbox2distance(points, bbox, max_dis=None, eps=0.1): """Decode bounding box based on distances. Args: points (Tensor): Shape (n, 2), [x, y]. bbox (Tensor): Shape (n, 4), "xyxy" format max_dis (float): Upper bound of the distance. eps (float): a small value to ensure target < max_dis, instead <= Returns: Tensor: Decoded distances. """ left = points[:, 0] - bbox[:, 0] top = points[:, 1] - bbox[:, 1] right = bbox[:, 2] - points[:, 0] bottom = bbox[:, 3] - points[:, 1] if max_dis is not None: left = left.clip(min=0, max=max_dis - eps) top = top.clip(min=0, max=max_dis - eps) right = right.clip(min=0, max=max_dis - eps) bottom = bottom.clip(min=0, max=max_dis - eps) return paddle.stack([left, top, right, bottom], -1) def distance2bbox(points, distance, max_shape=None): """Decode distance prediction to bounding box. Args: points (Tensor): Shape (n, 2), [x, y]. distance (Tensor): Distance from the given point to 4 boundaries (left, top, right, bottom). max_shape (tuple): Shape of the image. Returns: Tensor: Decoded bboxes. """ x1 = points[:, 0] - distance[:, 0] y1 = points[:, 1] - distance[:, 1] x2 = points[:, 0] + distance[:, 2] y2 = points[:, 1] + distance[:, 3] if max_shape is not None: x1 = x1.clip(min=0, max=max_shape[1]) y1 = y1.clip(min=0, max=max_shape[0]) x2 = x2.clip(min=0, max=max_shape[1]) y2 = y2.clip(min=0, max=max_shape[0]) return paddle.stack([x1, y1, x2, y2], -1) def bbox_center(boxes): """Get bbox centers from boxes. Args: boxes (Tensor): boxes with shape (..., 4), "xmin, ymin, xmax, ymax" format. Returns: Tensor: boxes centers with shape (..., 2), "cx, cy" format. """ boxes_cx = (boxes[..., 0] + boxes[..., 2]) / 2 boxes_cy = (boxes[..., 1] + boxes[..., 3]) / 2 return paddle.stack([boxes_cx, boxes_cy], axis=-1) def batch_distance2bbox(points, distance, max_shapes=None): """Decode distance prediction to bounding box for batch. 
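    For example, a point (5, 5) with "ltrb" distances (1, 2, 3, 4) decodes to
    the box (x1, y1, x2, y2) = (5 - 1, 5 - 2, 5 + 3, 5 + 4) = (4, 3, 8, 9).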
Args: points (Tensor): [B, ..., 2], "xy" format distance (Tensor): [B, ..., 4], "ltrb" format max_shapes (Tensor): [B, 2], "h,w" format, Shape of the image. Returns: Tensor: Decoded bboxes, "x1y1x2y2" format. """ lt, rb = paddle.split(distance, 2, -1) # while tensor add parameters, parameters should be better placed on the second place x1y1 = -lt + points x2y2 = rb + points out_bbox = paddle.concat([x1y1, x2y2], -1) if max_shapes is not None: max_shapes = max_shapes.flip(-1).tile([1, 2]) delta_dim = out_bbox.ndim - max_shapes.ndim for _ in range(delta_dim): max_shapes.unsqueeze_(1) out_bbox = paddle.where(out_bbox < max_shapes, out_bbox, max_shapes) out_bbox = paddle.where(out_bbox > 0, out_bbox, paddle.zeros_like(out_bbox)) return out_bbox def iou_similarity(box1, box2, eps=1e-10): """Calculate iou of box1 and box2 Args: box1 (Tensor): box with the shape [M1, 4] box2 (Tensor): box with the shape [M2, 4] Return: iou (Tensor): iou between box1 and box2 with the shape [M1, M2] """ box1 = box1.unsqueeze(1) # [M1, 4] -> [M1, 1, 4] box2 = box2.unsqueeze(0) # [M2, 4] -> [1, M2, 4] px1y1, px2y2 = box1[:, :, 0:2], box1[:, :, 2:4] gx1y1, gx2y2 = box2[:, :, 0:2], box2[:, :, 2:4] x1y1 = paddle.maximum(px1y1, gx1y1) x2y2 = paddle.minimum(px2y2, gx2y2) overlap = (x2y2 - x1y1).clip(0).prod(-1) area1 = (px2y2 - px1y1).clip(0).prod(-1) area2 = (gx2y2 - gx1y1).clip(0).prod(-1) union = area1 + area2 - overlap + eps return overlap / union ================================================ FILE: ppdet/modeling/clrnet_utils.py ================================================ import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.modeling.initializer import constant_ from paddle.nn.initializer import KaimingNormal class ConvModule(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, groups=1, bias=False, norm_type='bn', wtih_act=True): super(ConvModule, self).__init__() assert norm_type in ['bn', 'sync_bn', 'gn', None] self.with_norm = norm_type is not None self.wtih_act = wtih_act self.conv = nn.Conv2D( in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias_attr=bias, weight_attr=KaimingNormal()) if self.with_norm: if norm_type == 'bn': self.bn = nn.BatchNorm2D(out_channels) elif norm_type == 'gn': self.bn = nn.GroupNorm(out_channels, out_channels) if self.wtih_act: self.act = nn.ReLU() def forward(self, inputs): x = self.conv(inputs) if self.with_norm: x = self.bn(x) if self.wtih_act: x = self.act(x) return x def LinearModule(hidden_dim): return nn.LayerList( [nn.Linear( hidden_dim, hidden_dim, bias_attr=True), nn.ReLU()]) class FeatureResize(nn.Layer): def __init__(self, size=(10, 25)): super(FeatureResize, self).__init__() self.size = size def forward(self, x): x = F.interpolate(x, self.size) return x.flatten(2) class ROIGather(nn.Layer): ''' ROIGather module for gather global information Args: in_channels: prior feature channels num_priors: prior numbers we predefined sample_points: the number of sampled points when we extract feature from line fc_hidden_dim: the fc output channel refine_layers: the total number of layers to build refine ''' def __init__(self, in_channels, num_priors, sample_points, fc_hidden_dim, refine_layers, mid_channels=48): super(ROIGather, self).__init__() self.in_channels = in_channels self.num_priors = num_priors self.f_key = ConvModule( in_channels=self.in_channels, out_channels=self.in_channels, kernel_size=1, 
            stride=1,
            padding=0,
            norm_type='bn')
        self.f_query = nn.Sequential(
            nn.Conv1D(
                in_channels=num_priors,
                out_channels=num_priors,
                kernel_size=1,
                stride=1,
                padding=0,
                groups=num_priors),
            nn.ReLU(), )
        self.f_value = nn.Conv2D(
            in_channels=self.in_channels,
            out_channels=self.in_channels,
            kernel_size=1,
            stride=1,
            padding=0)
        self.W = nn.Conv1D(
            in_channels=num_priors,
            out_channels=num_priors,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=num_priors)

        self.resize = FeatureResize()
        constant_(self.W.weight, 0)
        constant_(self.W.bias, 0)

        self.convs = nn.LayerList()
        self.catconv = nn.LayerList()
        for i in range(refine_layers):
            self.convs.append(
                ConvModule(
                    in_channels,
                    mid_channels, (9, 1),
                    padding=(4, 0),
                    bias=False,
                    norm_type='bn'))
            self.catconv.append(
                ConvModule(
                    mid_channels * (i + 1),
                    in_channels, (9, 1),
                    padding=(4, 0),
                    bias=False,
                    norm_type='bn'))

        self.fc = nn.Linear(
            sample_points * fc_hidden_dim, fc_hidden_dim, bias_attr=True)
        self.fc_norm = nn.LayerNorm(fc_hidden_dim)

    def roi_fea(self, x, layer_index):
        feats = []
        for i, feature in enumerate(x):
            feat_trans = self.convs[i](feature)
            feats.append(feat_trans)
        cat_feat = paddle.concat(feats, axis=1)
        cat_feat = self.catconv[layer_index](cat_feat)
        return cat_feat

    def forward(self, roi_features, x, layer_index):
        '''
        Args:
            roi_features: prior feature, shape: (Batch * num_priors, prior_feat_channel, sample_point, 1)
            x: feature map
            layer_index: currently on which layer to refine
        Return:
            roi: prior features with gathered global information, shape: (Batch, num_priors, fc_hidden_dim)
        '''
        roi = self.roi_fea(roi_features, layer_index)
        bs = x.shape[0]
        roi = roi.reshape([bs * self.num_priors, -1])
        roi = self.fc(roi)
        roi = F.relu(self.fc_norm(roi))
        roi = roi.reshape([bs, self.num_priors, -1])
        query = roi

        value = self.resize(self.f_value(x))  # (B, C, N) global feature
        query = self.f_query(
            query)  # (B, N, 1) sample context feature from prior roi
        key = self.f_key(x)
        value = value.transpose(perm=[0, 2, 1])
        key = self.resize(key)  # (B, C, N) global feature
        sim_map = paddle.matmul(query, key)
        sim_map = (self.in_channels**-.5) * sim_map
        sim_map = F.softmax(sim_map, axis=-1)
        context = paddle.matmul(sim_map, value)
        context = self.W(context)

        roi = roi + F.dropout(context, p=0.1, training=self.training)

        return roi


class SegDecoder(nn.Layer):
    '''
    Optional seg decoder.
    '''

    def __init__(self,
                 image_height,
                 image_width,
                 num_class,
                 prior_feat_channels=64,
                 refine_layers=3):
        super().__init__()
        self.dropout = nn.Dropout2D(0.1)
        self.conv = nn.Conv2D(prior_feat_channels * refine_layers, num_class,
                              1)
        self.image_height = image_height
        self.image_width = image_width

    def forward(self, x):
        x = self.dropout(x)
        x = self.conv(x)
        x = F.interpolate(
            x,
            size=[self.image_height, self.image_width],
            mode='bilinear',
            align_corners=False)
        return x


def accuracy(pred, target, topk=1, thresh=None):
    """Calculate accuracy according to the prediction and target.

    Args:
        pred (torch.Tensor): The model prediction, shape (N, num_class)
        target (torch.Tensor): The target of each prediction, shape (N, )
        topk (int | tuple[int], optional): If the predictions in ``topk``
            match the target, the predictions will be regarded as
            correct ones. Defaults to 1.
def accuracy(pred, target, topk=1, thresh=None):
    """Calculate accuracy according to the prediction and target.

    Args:
        pred (paddle.Tensor): The model prediction, shape (N, num_class)
        target (paddle.Tensor): The target of each prediction, shape (N, )
        topk (int | tuple[int], optional): If the predictions in ``topk``
            match the target, the predictions will be regarded as correct
            ones. Defaults to 1.
        thresh (float, optional): If not None, predictions with scores
            under this threshold are considered incorrect. Defaults to None.

    Returns:
        float | tuple[float]: If the input ``topk`` is a single integer,
            the function will return a single float as accuracy. If
            ``topk`` is a tuple containing multiple integers, the
            function will return a tuple containing accuracies of
            each ``topk`` number.
    """
    assert isinstance(topk, (int, tuple))
    if isinstance(topk, int):
        topk = (topk, )
        return_single = True
    else:
        return_single = False

    maxk = max(topk)
    if pred.shape[0] == 0:
        # paddle Tensors have no `new_tensor` method, so build the zero
        # accuracies directly
        accu = [paddle.zeros([1], dtype='float32') for i in range(len(topk))]
        return accu[0] if return_single else accu
    assert pred.ndim == 2 and target.ndim == 1
    assert pred.shape[0] == target.shape[0]
    assert maxk <= pred.shape[1], \
        f'maxk {maxk} exceeds pred dimension {pred.shape[1]}'
    pred_value, pred_label = pred.topk(maxk, axis=1)
    pred_label = pred_label.t()  # transpose to shape (maxk, N)
    correct = pred_label.equal(target.reshape([1, -1]).expand_as(pred_label))
    if thresh is not None:
        # Only prediction values larger than thresh are counted as correct
        correct = correct & (pred_value > thresh).t()
    res = []
    for k in topk:
        correct_k = correct[:k].reshape([-1]).cast("float32").sum(
            0, keepdim=True)
        correct_k = correct_k * (100.0 / pred.shape[0])
        res.append(correct_k)
    return res[0] if return_single else res


class Accuracy(nn.Layer):
    def __init__(self, topk=(1, ), thresh=None):
        """Module to calculate the accuracy.

        Args:
            topk (tuple, optional): The criterion used to calculate the
                accuracy. Defaults to (1,).
            thresh (float, optional): If not None, predictions with scores
                under this threshold are considered incorrect. Defaults to
                None.
        """
        super().__init__()
        self.topk = topk
        self.thresh = thresh

    def forward(self, pred, target):
        """Forward function to calculate accuracy.

        Args:
            pred (paddle.Tensor): Prediction of models.
            target (paddle.Tensor): Target for each prediction.

        Returns:
            tuple[float]: The accuracies under different topk criterions.
        """
        return accuracy(pred, target, self.topk, self.thresh)


================================================
FILE: ppdet/modeling/cls_utils.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def _get_class_default_kwargs(cls, *args, **kwargs):
    """
    Get the default arguments of a class' ``__init__`` as a dict; if args
    or kwargs are specified, they override the corresponding defaults.
    """
    varnames = cls.__init__.__code__.co_varnames
    argcount = cls.__init__.__code__.co_argcount
    keys = varnames[:argcount]
    assert keys[0] == 'self'
    keys = keys[1:]

    values = list(cls.__init__.__defaults__)
    assert len(values) == len(keys)

    if len(args) > 0:
        for i, arg in enumerate(args):
            values[i] = arg

    default_kwargs = dict(zip(keys, values))

    if len(kwargs) > 0:
        for k, v in kwargs.items():
            default_kwargs[k] = v

    return default_kwargs
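# --- Editor's usage sketch (not part of the original file), assuming a
# hypothetical class Foo whose __init__ arguments all have defaults (the
# asserts above require this):
#
#   class Foo:
#       def __init__(self, a=1, b='x'):
#           pass
#
#   _get_class_default_kwargs(Foo)          # {'a': 1, 'b': 'x'}
#   _get_class_default_kwargs(Foo, 5)       # {'a': 5, 'b': 'x'}
#   _get_class_default_kwargs(Foo, b='y')   # {'a': 1, 'b': 'y'}
#
# This is how head configs such as
# `roi_extractor=_get_class_default_kwargs(RoIAlign)` below obtain a
# fully-populated default config dict.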
================================================
FILE: ppdet/modeling/heads/__init__.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import bbox_head
from . import mask_head
from . import yolo_head
from . import roi_extractor
from . import ssd_head
from . import fcos_head
from . import solov2_head
from . import ttf_head
from . import cascade_head
from . import face_head
from . import s2anet_head
from . import keypoint_hrhrnet_head
from . import centernet_head
from . import gfl_head
from . import simota_head
from . import pico_head
from . import detr_head
from . import sparsercnn_head
from . import tood_head
from . import retina_head
from . import ppyoloe_head
from . import fcosr_head
from . import ppyoloe_r_head
from . import yolof_head
from . import ppyoloe_contrast_head
from . import centertrack_head
from . import sparse_roi_head
from . import petr_head
from . import vitpose_head
from . import clrnet_head
from . import ppyoloe_ins_head

from .bbox_head import *
from .mask_head import *
from .yolo_head import *
from .roi_extractor import *
from .ssd_head import *
from .fcos_head import *
from .solov2_head import *
from .ttf_head import *
from .cascade_head import *
from .face_head import *
from .s2anet_head import *
from .keypoint_hrhrnet_head import *
from .centernet_head import *
from .gfl_head import *
from .simota_head import *
from .pico_head import *
from .detr_head import *
from .sparsercnn_head import *
from .tood_head import *
from .retina_head import *
from .ppyoloe_head import *
from .fcosr_head import *
from .ppyoloe_r_head import *
from .yolof_head import *
from .ppyoloe_contrast_head import *
from .centertrack_head import *
from .sparse_roi_head import *
from .petr_head import *
from .vitpose_head import *
from .clrnet_head import *
from .ppyoloe_ins_head import PPYOLOEInsHead


================================================
FILE: ppdet/modeling/heads/bbox_head.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal, XavierUniform, KaimingNormal from paddle.regularizer import L2Decay from ppdet.core.workspace import register, create from .roi_extractor import RoIAlign from ..shape_spec import ShapeSpec from ..bbox_utils import bbox2delta from ..cls_utils import _get_class_default_kwargs from ppdet.modeling.layers import ConvNormLayer __all__ = ['TwoFCHead', 'XConvNormHead', 'BBoxHead'] @register class TwoFCHead(nn.Layer): """ RCNN bbox head with Two fc layers to extract feature Args: in_channel (int): Input channel which can be derived by from_config out_channel (int): Output channel resolution (int): Resolution of input feature map, default 7 """ def __init__(self, in_channel=256, out_channel=1024, resolution=7): super(TwoFCHead, self).__init__() self.in_channel = in_channel self.out_channel = out_channel fan = in_channel * resolution * resolution self.fc6 = nn.Linear( in_channel * resolution * resolution, out_channel, weight_attr=paddle.ParamAttr( initializer=XavierUniform(fan_out=fan))) self.fc6.skip_quant = True self.fc7 = nn.Linear( out_channel, out_channel, weight_attr=paddle.ParamAttr(initializer=XavierUniform())) self.fc7.skip_quant = True @classmethod def from_config(cls, cfg, input_shape): s = input_shape s = s[0] if isinstance(s, (list, tuple)) else s return {'in_channel': s.channels} @property def out_shape(self): return [ShapeSpec(channels=self.out_channel, )] def forward(self, rois_feat): rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1) fc6 = self.fc6(rois_feat) fc6 = F.relu(fc6) fc7 = self.fc7(fc6) fc7 = F.relu(fc7) return fc7 @register class XConvNormHead(nn.Layer): __shared__ = ['norm_type', 'freeze_norm'] """ RCNN bbox head with serveral convolution layers Args: in_channel (int): Input channels which can be derived by from_config num_convs (int): The number of conv layers conv_dim (int): The number of channels for the conv layers out_channel (int): Output channels resolution (int): Resolution of input feature map norm_type (string): Norm type, bn, gn, sync_bn are available, default `gn` freeze_norm (bool): Whether to freeze the norm stage_name (string): Prefix name for conv layer, '' by default """ def __init__(self, in_channel=256, num_convs=4, conv_dim=256, out_channel=1024, resolution=7, norm_type='gn', freeze_norm=False, stage_name=''): super(XConvNormHead, self).__init__() self.in_channel = in_channel self.num_convs = num_convs self.conv_dim = conv_dim self.out_channel = out_channel self.norm_type = norm_type self.freeze_norm = freeze_norm self.bbox_head_convs = [] fan = conv_dim * 3 * 3 initializer = KaimingNormal(fan_in=fan) for i in range(self.num_convs): in_c = in_channel if i == 0 else conv_dim head_conv_name = stage_name + 'bbox_head_conv{}'.format(i) head_conv = self.add_sublayer( head_conv_name, ConvNormLayer( ch_in=in_c, ch_out=conv_dim, filter_size=3, stride=1, norm_type=self.norm_type, freeze_norm=self.freeze_norm, initializer=initializer)) self.bbox_head_convs.append(head_conv) fan = conv_dim * resolution * resolution 
self.fc6 = nn.Linear( conv_dim * resolution * resolution, out_channel, weight_attr=paddle.ParamAttr( initializer=XavierUniform(fan_out=fan)), bias_attr=paddle.ParamAttr( learning_rate=2., regularizer=L2Decay(0.))) @classmethod def from_config(cls, cfg, input_shape): s = input_shape s = s[0] if isinstance(s, (list, tuple)) else s return {'in_channel': s.channels} @property def out_shape(self): return [ShapeSpec(channels=self.out_channel, )] def forward(self, rois_feat): for i in range(self.num_convs): rois_feat = F.relu(self.bbox_head_convs[i](rois_feat)) rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1) fc6 = F.relu(self.fc6(rois_feat)) return fc6 @register class BBoxHead(nn.Layer): __shared__ = ['num_classes', 'use_cot'] __inject__ = ['bbox_assigner', 'bbox_loss', 'loss_cot'] """ RCNN bbox head Args: head (nn.Layer): Extract feature in bbox head in_channel (int): Input channel after RoI extractor roi_extractor (object): The module of RoI Extractor bbox_assigner (object): The module of Box Assigner, label and sample the box. with_pool (bool): Whether to use pooling for the RoI feature. num_classes (int): The number of classes bbox_weight (List[float]): The weight to get the decode box cot_classes (int): The number of base classes loss_cot (object): The module of Label-cotuning use_cot(bool): whether to use Label-cotuning """ def __init__(self, head, in_channel, roi_extractor=_get_class_default_kwargs(RoIAlign), bbox_assigner='BboxAssigner', with_pool=False, num_classes=80, bbox_weight=[10., 10., 5., 5.], bbox_loss=None, loss_normalize_pos=False, cot_classes=None, loss_cot='COTLoss', use_cot=False): super(BBoxHead, self).__init__() self.head = head self.roi_extractor = roi_extractor if isinstance(roi_extractor, dict): self.roi_extractor = RoIAlign(**roi_extractor) self.bbox_assigner = bbox_assigner self.with_pool = with_pool self.num_classes = num_classes self.bbox_weight = bbox_weight self.bbox_loss = bbox_loss self.loss_normalize_pos = loss_normalize_pos self.loss_cot = loss_cot self.cot_relation = None self.cot_classes = cot_classes self.use_cot = use_cot if use_cot: self.cot_bbox_score = nn.Linear( in_channel, self.num_classes + 1, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0.0, std=0.01))) self.bbox_score = nn.Linear( in_channel, self.cot_classes + 1, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0.0, std=0.01))) self.cot_bbox_score.skip_quant = True else: self.bbox_score = nn.Linear( in_channel, self.num_classes + 1, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0.0, std=0.01))) self.bbox_score.skip_quant = True self.bbox_delta = nn.Linear( in_channel, 4 * self.num_classes, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0.0, std=0.001))) self.bbox_delta.skip_quant = True self.assigned_label = None self.assigned_rois = None def init_cot_head(self, relationship): self.cot_relation = relationship @classmethod def from_config(cls, cfg, input_shape): roi_pooler = cfg['roi_extractor'] assert isinstance(roi_pooler, dict) kwargs = RoIAlign.from_config(cfg, input_shape) roi_pooler.update(kwargs) kwargs = {'input_shape': input_shape} head = create(cfg['head'], **kwargs) return { 'roi_extractor': roi_pooler, 'head': head, 'in_channel': head.out_shape[0].channels } def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None, cot=False): """ body_feats (list[Tensor]): Feature maps from backbone rois (list[Tensor]): RoIs generated from RPN module rois_num (Tensor): The number of RoIs in each image inputs (dict{Tensor}): The ground-truth 
of image """ if self.training: rois, rois_num, targets = self.bbox_assigner(rois, rois_num, inputs) self.assigned_rois = (rois, rois_num) self.assigned_targets = targets rois_feat = self.roi_extractor(body_feats, rois, rois_num) bbox_feat = self.head(rois_feat) if self.with_pool: feat = F.adaptive_avg_pool2d(bbox_feat, output_size=1) feat = paddle.squeeze(feat, axis=[2, 3]) else: feat = bbox_feat if self.use_cot: scores = self.cot_bbox_score(feat) cot_scores = self.bbox_score(feat) else: scores = self.bbox_score(feat) deltas = self.bbox_delta(feat) if self.training: loss = self.get_loss( scores, deltas, targets, rois, self.bbox_weight, loss_normalize_pos=self.loss_normalize_pos) if self.cot_relation is not None: loss_cot = self.loss_cot(cot_scores, targets, self.cot_relation) loss.update(loss_cot) return loss, bbox_feat else: if cot: pred = self.get_prediction(cot_scores, deltas) else: pred = self.get_prediction(scores, deltas) return pred, self.head def get_loss(self, scores, deltas, targets, rois, bbox_weight, loss_normalize_pos=False): """ scores (Tensor): scores from bbox head outputs deltas (Tensor): deltas from bbox head outputs targets (list[List[Tensor]]): bbox targets containing tgt_labels, tgt_bboxes and tgt_gt_inds rois (List[Tensor]): RoIs generated in each batch """ cls_name = 'loss_bbox_cls' reg_name = 'loss_bbox_reg' loss_bbox = {} # TODO: better pass args tgt_labels, tgt_bboxes, tgt_gt_inds = targets # bbox cls tgt_labels = paddle.concat(tgt_labels) if len( tgt_labels) > 1 else tgt_labels[0] valid_inds = paddle.nonzero(tgt_labels >= 0).flatten() if valid_inds.shape[0] == 0: loss_bbox[cls_name] = paddle.zeros([1], dtype='float32') else: tgt_labels = tgt_labels.cast('int64') tgt_labels.stop_gradient = True if not loss_normalize_pos: loss_bbox_cls = F.cross_entropy( input=scores, label=tgt_labels, reduction='mean') else: loss_bbox_cls = F.cross_entropy( input=scores, label=tgt_labels, reduction='none').sum() / (tgt_labels.shape[0] + 1e-7) loss_bbox[cls_name] = loss_bbox_cls # bbox reg cls_agnostic_bbox_reg = deltas.shape[1] == 4 fg_inds = paddle.nonzero( paddle.logical_and(tgt_labels >= 0, tgt_labels < self.num_classes)).flatten() if fg_inds.numel() == 0: # loss_bbox[reg_name] = paddle.zeros([1], dtype='float32') loss_bbox[reg_name] = scores.mean() * 0. + deltas.mean() * 0. 
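# NOTE: `scores.mean() * 0. + deltas.mean() * 0.` is used instead of the
# commented-out fresh zero tensor so that the loss stays connected to the
# head outputs: every parameter still receives a well-defined (zero)
# gradient when an image has no foreground RoIs, which keeps gradient
# synchronization consistent across ranks in distributed training.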
return loss_bbox if cls_agnostic_bbox_reg: reg_delta = paddle.gather(deltas, fg_inds) else: fg_gt_classes = paddle.gather(tgt_labels, fg_inds) reg_row_inds = paddle.arange(fg_gt_classes.shape[0]).unsqueeze(1) reg_row_inds = paddle.tile(reg_row_inds, [1, 4]).reshape([-1, 1]) reg_col_inds = 4 * fg_gt_classes.unsqueeze(1) + paddle.arange(4) reg_col_inds = reg_col_inds.reshape([-1, 1]) reg_inds = paddle.concat([reg_row_inds, reg_col_inds], axis=1) reg_delta = paddle.gather(deltas, fg_inds) reg_delta = paddle.gather_nd(reg_delta, reg_inds).reshape([-1, 4]) rois = paddle.concat(rois) if len(rois) > 1 else rois[0] tgt_bboxes = paddle.concat(tgt_bboxes) if len( tgt_bboxes) > 1 else tgt_bboxes[0] reg_target = bbox2delta(rois, tgt_bboxes, bbox_weight) reg_target = paddle.gather(reg_target, fg_inds) reg_target.stop_gradient = True if self.bbox_loss is not None: reg_delta = self.bbox_transform(reg_delta) reg_target = self.bbox_transform(reg_target) if not loss_normalize_pos: loss_bbox_reg = self.bbox_loss( reg_delta, reg_target).sum() / tgt_labels.shape[0] loss_bbox_reg *= self.num_classes else: loss_bbox_reg = self.bbox_loss( reg_delta, reg_target).sum() / (tgt_labels.shape[0] + 1e-7) else: loss_bbox_reg = paddle.abs(reg_delta - reg_target).sum( ) / tgt_labels.shape[0] loss_bbox[reg_name] = loss_bbox_reg return loss_bbox def bbox_transform(self, deltas, weights=[0.1, 0.1, 0.2, 0.2]): wx, wy, ww, wh = weights deltas = paddle.reshape(deltas, shape=(0, -1, 4)) dx = paddle.slice(deltas, axes=[2], starts=[0], ends=[1]) * wx dy = paddle.slice(deltas, axes=[2], starts=[1], ends=[2]) * wy dw = paddle.slice(deltas, axes=[2], starts=[2], ends=[3]) * ww dh = paddle.slice(deltas, axes=[2], starts=[3], ends=[4]) * wh dw = paddle.clip(dw, -1.e10, np.log(1000. / 16)) dh = paddle.clip(dh, -1.e10, np.log(1000. / 16)) pred_ctr_x = dx pred_ctr_y = dy pred_w = paddle.exp(dw) pred_h = paddle.exp(dh) x1 = pred_ctr_x - 0.5 * pred_w y1 = pred_ctr_y - 0.5 * pred_h x2 = pred_ctr_x + 0.5 * pred_w y2 = pred_ctr_y + 0.5 * pred_h x1 = paddle.reshape(x1, shape=(-1, )) y1 = paddle.reshape(y1, shape=(-1, )) x2 = paddle.reshape(x2, shape=(-1, )) y2 = paddle.reshape(y2, shape=(-1, )) return paddle.concat([x1, y1, x2, y2]) def get_prediction(self, score, delta): bbox_prob = F.softmax(score) return delta, bbox_prob def get_head(self, ): return self.head def get_assigned_targets(self, ): return self.assigned_targets def get_assigned_rois(self, ): return self.assigned_rois ================================================ FILE: ppdet/modeling/heads/cascade_head.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
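# --- Editor's illustrative sketch (not part of the original file): the
# cascade head below repeatedly decodes per-stage deltas into boxes via
# `delta2bbox`. A minimal NumPy version of the standard decoding, assuming
# a single box and unit weights (the name `_decode_delta_sketch` is ours):
import numpy as np


def _decode_delta_sketch(proposal, delta, clip=np.log(1000. / 16)):
    """proposal: [x1, y1, x2, y2]; delta: [dx, dy, dw, dh]."""
    pw, ph = proposal[2] - proposal[0], proposal[3] - proposal[1]
    px, py = proposal[0] + 0.5 * pw, proposal[1] + 0.5 * ph
    # the center shift is scaled by the proposal size; width/height changes
    # live in log-space and are clipped for numerical stability
    cx, cy = px + delta[0] * pw, py + delta[1] * ph
    w = pw * np.exp(min(delta[2], clip))
    h = ph * np.exp(min(delta[3], clip))
    return np.array(
        [cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h])
# --- end of sketch ---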
import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal from ppdet.core.workspace import register from .bbox_head import BBoxHead, TwoFCHead, XConvNormHead from .roi_extractor import RoIAlign from ..shape_spec import ShapeSpec from ..bbox_utils import delta2bbox, clip_bbox, nonempty_bbox from ..cls_utils import _get_class_default_kwargs __all__ = ['CascadeTwoFCHead', 'CascadeXConvNormHead', 'CascadeHead'] @register class CascadeTwoFCHead(nn.Layer): __shared__ = ['num_cascade_stage'] """ Cascade RCNN bbox head with Two fc layers to extract feature Args: in_channel (int): Input channel which can be derived by from_config out_channel (int): Output channel resolution (int): Resolution of input feature map, default 7 num_cascade_stage (int): The number of cascade stage, default 3 """ def __init__(self, in_channel=256, out_channel=1024, resolution=7, num_cascade_stage=3): super(CascadeTwoFCHead, self).__init__() self.in_channel = in_channel self.out_channel = out_channel self.head_list = [] for stage in range(num_cascade_stage): head_per_stage = self.add_sublayer( str(stage), TwoFCHead(in_channel, out_channel, resolution)) self.head_list.append(head_per_stage) @classmethod def from_config(cls, cfg, input_shape): s = input_shape s = s[0] if isinstance(s, (list, tuple)) else s return {'in_channel': s.channels} @property def out_shape(self): return [ShapeSpec(channels=self.out_channel, )] def forward(self, rois_feat, stage=0): out = self.head_list[stage](rois_feat) return out @register class CascadeXConvNormHead(nn.Layer): __shared__ = ['norm_type', 'freeze_norm', 'num_cascade_stage'] """ Cascade RCNN bbox head with serveral convolution layers Args: in_channel (int): Input channels which can be derived by from_config num_convs (int): The number of conv layers conv_dim (int): The number of channels for the conv layers out_channel (int): Output channels resolution (int): Resolution of input feature map norm_type (string): Norm type, bn, gn, sync_bn are available, default `gn` freeze_norm (bool): Whether to freeze the norm num_cascade_stage (int): The number of cascade stage, default 3 """ def __init__(self, in_channel=256, num_convs=4, conv_dim=256, out_channel=1024, resolution=7, norm_type='gn', freeze_norm=False, num_cascade_stage=3): super(CascadeXConvNormHead, self).__init__() self.in_channel = in_channel self.out_channel = out_channel self.head_list = [] for stage in range(num_cascade_stage): head_per_stage = self.add_sublayer( str(stage), XConvNormHead( in_channel, num_convs, conv_dim, out_channel, resolution, norm_type, freeze_norm, stage_name='stage{}_'.format(stage))) self.head_list.append(head_per_stage) @classmethod def from_config(cls, cfg, input_shape): s = input_shape s = s[0] if isinstance(s, (list, tuple)) else s return {'in_channel': s.channels} @property def out_shape(self): return [ShapeSpec(channels=self.out_channel, )] def forward(self, rois_feat, stage=0): out = self.head_list[stage](rois_feat) return out @register class CascadeHead(BBoxHead): __shared__ = ['num_classes', 'num_cascade_stages'] __inject__ = ['bbox_assigner', 'bbox_loss'] """ Cascade RCNN bbox head Args: head (nn.Layer): Extract feature in bbox head in_channel (int): Input channel after RoI extractor roi_extractor (object): The module of RoI Extractor bbox_assigner (object): The module of Box Assigner, label and sample the box. 
num_classes (int): The number of classes bbox_weight (List[List[float]]): The weight to get the decode box and the length of weight is the number of cascade stage num_cascade_stages (int): THe number of stage to refine the box """ def __init__(self, head, in_channel, roi_extractor=_get_class_default_kwargs(RoIAlign), bbox_assigner='BboxAssigner', num_classes=80, bbox_weight=[[10., 10., 5., 5.], [20.0, 20.0, 10.0, 10.0], [30.0, 30.0, 15.0, 15.0]], num_cascade_stages=3, bbox_loss=None, reg_class_agnostic=True, stage_loss_weights=None, loss_normalize_pos=False, add_gt_as_proposals=[True, False, False]): nn.Layer.__init__(self, ) self.head = head self.roi_extractor = roi_extractor if isinstance(roi_extractor, dict): self.roi_extractor = RoIAlign(**roi_extractor) self.bbox_assigner = bbox_assigner self.num_classes = num_classes self.bbox_weight = bbox_weight self.num_cascade_stages = num_cascade_stages self.bbox_loss = bbox_loss self.stage_loss_weights = [ 1. / num_cascade_stages for _ in range(num_cascade_stages) ] if stage_loss_weights is None else stage_loss_weights self.add_gt_as_proposals = add_gt_as_proposals assert len( self.stage_loss_weights ) == num_cascade_stages, f'stage_loss_weights({len(self.stage_loss_weights)}) do not equal to num_cascade_stages({num_cascade_stages})' self.reg_class_agnostic = reg_class_agnostic num_bbox_delta = 4 if reg_class_agnostic else 4 * num_classes self.loss_normalize_pos = loss_normalize_pos self.bbox_score_list = [] self.bbox_delta_list = [] for i in range(num_cascade_stages): score_name = 'bbox_score_stage{}'.format(i) delta_name = 'bbox_delta_stage{}'.format(i) bbox_score = self.add_sublayer( score_name, nn.Linear( in_channel, self.num_classes + 1, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0.0, std=0.01)))) bbox_delta = self.add_sublayer( delta_name, nn.Linear( in_channel, num_bbox_delta, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0.0, std=0.001)))) self.bbox_score_list.append(bbox_score) self.bbox_delta_list.append(bbox_delta) self.assigned_label = None self.assigned_rois = None def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None): """ body_feats (list[Tensor]): Feature maps from backbone rois (Tensor): RoIs generated from RPN module rois_num (Tensor): The number of RoIs in each image inputs (dict{Tensor}): The ground-truth of image """ targets = [] if self.training: rois, rois_num, targets = self.bbox_assigner( rois, rois_num, inputs, add_gt_as_proposals=self.add_gt_as_proposals[0]) targets_list = [targets] self.assigned_rois = (rois, rois_num) self.assigned_targets = targets pred_bbox = None head_out_list = [] for i in range(self.num_cascade_stages): if i > 0: rois, rois_num = self._get_rois_from_boxes(pred_bbox, inputs['im_shape']) if self.training: rois, rois_num, targets = self.bbox_assigner( rois, rois_num, inputs, i, is_cascade=True, add_gt_as_proposals=self.add_gt_as_proposals[i]) targets_list.append(targets) rois_feat = self.roi_extractor(body_feats, rois, rois_num) bbox_feat = self.head(rois_feat, i) scores = self.bbox_score_list[i](bbox_feat) deltas = self.bbox_delta_list[i](bbox_feat) # TODO (lyuwenyu) Is it correct for only one class ? 
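# When class-specific regression is used (reg_class_agnostic=False), each
# RoI predicts num_classes sets of 4 deltas; the block below keeps only
# the deltas of the highest-scoring class. Training indexes directly,
# while inference selects through a one-hot mask so the graph stays
# export-friendly.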
if not self.reg_class_agnostic and i < self.num_cascade_stages - 1: deltas = deltas.reshape([deltas.shape[0], self.num_classes, 4]) labels = scores[:, :-1].argmax(axis=-1) if self.training: deltas = deltas[paddle.arange(deltas.shape[0]), labels] else: deltas = deltas[((deltas + 10000) * F.one_hot( labels, num_classes=self.num_classes).unsqueeze(-1) != 0 ).nonzero(as_tuple=True)].reshape( [deltas.shape[0], 4]) head_out_list.append([scores, deltas, rois]) pred_bbox = self._get_pred_bbox(deltas, rois, self.bbox_weight[i]) if self.training: loss = {} for stage, value in enumerate(zip(head_out_list, targets_list)): (scores, deltas, rois), targets = value loss_stage = self.get_loss( scores, deltas, targets, rois, self.bbox_weight[stage], loss_normalize_pos=self.loss_normalize_pos) for k, v in loss_stage.items(): loss[k + "_stage{}".format( stage)] = v * self.stage_loss_weights[stage] return loss, bbox_feat else: scores, deltas, self.refined_rois = self.get_prediction( head_out_list) return (deltas, scores), self.head def _get_rois_from_boxes(self, boxes, im_shape): rois = [] for i, boxes_per_image in enumerate(boxes): clip_box = clip_bbox(boxes_per_image, im_shape[i]) if self.training: keep = nonempty_bbox(clip_box) if keep.shape[0] == 0: keep = paddle.zeros([1], dtype='int32') clip_box = paddle.gather(clip_box, keep) rois.append(clip_box) rois_num = paddle.concat([paddle.shape(r)[0:1] for r in rois]) return rois, rois_num def _get_pred_bbox(self, deltas, proposals, weights): pred_proposals = paddle.concat(proposals) if len( proposals) > 1 else proposals[0] pred_bbox = delta2bbox(deltas, pred_proposals, weights) pred_bbox = paddle.reshape(pred_bbox, [-1, deltas.shape[-1]]) num_prop = [] for p in proposals: num_prop.append(p.shape[0]) # NOTE(dev): num_prob will be tagged as LoDTensorArray because it # depends on batch_size under @to_static. However the argument # num_or_sections in paddle.split does not support LoDTensorArray, # so we use [-1] to replace it if num_prop is not list. The modification # This ensures the correctness of both dynamic and static graphs. if not isinstance(num_prop, list): num_prop = [-1] return pred_bbox.split(num_prop) def get_prediction(self, head_out_list): """ head_out_list(List[Tensor]): scores, deltas, rois """ pred_list = [] scores_list = [F.softmax(head[0]) for head in head_out_list] scores = paddle.add_n(scores_list) / self.num_cascade_stages # Get deltas and rois from the last stage _, deltas, rois = head_out_list[-1] return scores, deltas, rois def get_refined_rois(self, ): return self.refined_rois ================================================ FILE: ppdet/modeling/heads/centernet_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
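# --- Editor's illustrative sketch (not part of the original file): the
# losses below gather per-object predictions out of a dense [B, H*W, C]
# head output with `paddle.gather_nd`, using (batch_id, spatial_index)
# pairs. A self-contained version of that indexing pattern (the helper
# name is ours):
import paddle


def _gather_positive_sketch(flat, index):
    """flat: [B, H*W, C]; index: [B, K, 1] int64 -> [B, K, C]."""
    bs, k = index.shape[0], index.shape[1]
    batch_ids = paddle.arange(
        bs, dtype='int64').reshape([bs, 1, 1]).tile([1, k, 1])
    nd_index = paddle.concat([batch_ids, index], axis=2)  # [B, K, 2]
    return paddle.gather_nd(flat, nd_index)
# CenterNetHead.get_loss builds the same [B, K, 2] index with a python
# loop over paddle.full; the arange form above is an equivalent,
# vectorized illustration.
# --- end of sketch ---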
import math import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Constant, Uniform from ppdet.core.workspace import register from ppdet.modeling.losses import CTFocalLoss, GIoULoss class ConvLayer(nn.Layer): def __init__(self, ch_in, ch_out, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=False): super(ConvLayer, self).__init__() bias_attr = False fan_in = ch_in * kernel_size**2 bound = 1 / math.sqrt(fan_in) param_attr = paddle.ParamAttr(initializer=Uniform(-bound, bound)) if bias: bias_attr = paddle.ParamAttr(initializer=Constant(0.)) self.conv = nn.Conv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, weight_attr=param_attr, bias_attr=bias_attr) def forward(self, inputs): out = self.conv(inputs) return out @register class CenterNetHead(nn.Layer): """ Args: in_channels (int): the channel number of input to CenterNetHead. num_classes (int): the number of classes, 80 (COCO dataset) by default. head_planes (int): the channel number in all head, 256 by default. prior_bias (float): prior bias in heatmap head, -2.19 by default, -4.6 in CenterTrack regress_ltrb (bool): whether to regress left/top/right/bottom or width/height for a box, True by default. size_loss (str): the type of size regression loss, 'L1' by default, can be 'giou'. loss_weight (dict): the weight of each loss. add_iou (bool): whether to add iou branch, False by default. """ __shared__ = ['num_classes'] def __init__(self, in_channels, num_classes=80, head_planes=256, prior_bias=-2.19, regress_ltrb=True, size_loss='L1', loss_weight={ 'heatmap': 1.0, 'size': 0.1, 'offset': 1.0, 'iou': 0.0, }, add_iou=False): super(CenterNetHead, self).__init__() self.regress_ltrb = regress_ltrb self.loss_weight = loss_weight self.add_iou = add_iou # heatmap head self.heatmap = nn.Sequential( ConvLayer( in_channels, head_planes, kernel_size=3, padding=1, bias=True), nn.ReLU(), ConvLayer( head_planes, num_classes, kernel_size=1, stride=1, padding=0, bias=True)) with paddle.no_grad(): self.heatmap[2].conv.bias[:] = prior_bias # size(ltrb or wh) head self.size = nn.Sequential( ConvLayer( in_channels, head_planes, kernel_size=3, padding=1, bias=True), nn.ReLU(), ConvLayer( head_planes, 4 if regress_ltrb else 2, kernel_size=1, stride=1, padding=0, bias=True)) self.size_loss = size_loss # offset head self.offset = nn.Sequential( ConvLayer( in_channels, head_planes, kernel_size=3, padding=1, bias=True), nn.ReLU(), ConvLayer( head_planes, 2, kernel_size=1, stride=1, padding=0, bias=True)) # iou head (optinal) if self.add_iou and 'iou' in self.loss_weight: self.iou = nn.Sequential( ConvLayer( in_channels, head_planes, kernel_size=3, padding=1, bias=True), nn.ReLU(), ConvLayer( head_planes, 4 if regress_ltrb else 2, kernel_size=1, stride=1, padding=0, bias=True)) @classmethod def from_config(cls, cfg, input_shape): if isinstance(input_shape, (list, tuple)): input_shape = input_shape[0] return {'in_channels': input_shape.channels} def forward(self, feat, inputs): heatmap = F.sigmoid(self.heatmap(feat)) size = self.size(feat) offset = self.offset(feat) head_outs = {'heatmap': heatmap, 'size': size, 'offset': offset} if self.add_iou and 'iou' in self.loss_weight: iou = self.iou(feat) head_outs.update({'iou': iou}) if self.training: losses = self.get_loss(inputs, self.loss_weight, head_outs) return losses else: return head_outs def get_loss(self, inputs, weights, head_outs): # 1.heatmap(hm) head loss: CTFocalLoss 
heatmap = head_outs['heatmap'] heatmap_target = inputs['heatmap'] heatmap = paddle.clip(heatmap, 1e-4, 1 - 1e-4) ctfocal_loss = CTFocalLoss() heatmap_loss = ctfocal_loss(heatmap, heatmap_target) # 2.size(wh) head loss: L1 loss or GIoU loss size = head_outs['size'] index = inputs['index'] mask = inputs['index_mask'] size = paddle.transpose(size, perm=[0, 2, 3, 1]) size_n, _, _, size_c = size.shape size = paddle.reshape(size, shape=[size_n, -1, size_c]) index = paddle.unsqueeze(index, 2) batch_inds = list() for i in range(size_n): batch_ind = paddle.full( shape=[1, index.shape[1], 1], fill_value=i, dtype='int64') batch_inds.append(batch_ind) batch_inds = paddle.concat(batch_inds, axis=0) index = paddle.concat(x=[batch_inds, index], axis=2) pos_size = paddle.gather_nd(size, index=index) mask = paddle.unsqueeze(mask, axis=2) size_mask = paddle.expand_as(mask, pos_size) size_mask = paddle.cast(size_mask, dtype=pos_size.dtype) pos_num = size_mask.sum() size_mask.stop_gradient = True if self.size_loss == 'L1': if self.regress_ltrb: size_target = inputs['size'] # shape: [bs, max_per_img, 4] else: if inputs['size'].shape[-1] == 2: # inputs['size'] is wh, and regress as wh # shape: [bs, max_per_img, 2] size_target = inputs['size'] else: # inputs['size'] is ltrb, but regress as wh # shape: [bs, max_per_img, 4] size_target = inputs['size'][:, :, 0:2] + inputs[ 'size'][:, :, 2:] size_target.stop_gradient = True size_loss = F.l1_loss( pos_size * size_mask, size_target * size_mask, reduction='sum') size_loss = size_loss / (pos_num + 1e-4) elif self.size_loss == 'giou': size_target = inputs['bbox_xys'] size_target.stop_gradient = True centers_x = (size_target[:, :, 0:1] + size_target[:, :, 2:3]) / 2.0 centers_y = (size_target[:, :, 1:2] + size_target[:, :, 3:4]) / 2.0 x1 = centers_x - pos_size[:, :, 0:1] y1 = centers_y - pos_size[:, :, 1:2] x2 = centers_x + pos_size[:, :, 2:3] y2 = centers_y + pos_size[:, :, 3:4] pred_boxes = paddle.concat([x1, y1, x2, y2], axis=-1) giou_loss = GIoULoss(reduction='sum') size_loss = giou_loss( pred_boxes * size_mask, size_target * size_mask, iou_weight=size_mask, loc_reweight=None) size_loss = size_loss / (pos_num + 1e-4) # 3.offset(reg) head loss: L1 loss offset = head_outs['offset'] offset_target = inputs['offset'] offset = paddle.transpose(offset, perm=[0, 2, 3, 1]) offset_n, _, _, offset_c = offset.shape offset = paddle.reshape(offset, shape=[offset_n, -1, offset_c]) pos_offset = paddle.gather_nd(offset, index=index) offset_mask = paddle.expand_as(mask, pos_offset) offset_mask = paddle.cast(offset_mask, dtype=pos_offset.dtype) pos_num = offset_mask.sum() offset_mask.stop_gradient = True offset_target.stop_gradient = True offset_loss = F.l1_loss( pos_offset * offset_mask, offset_target * offset_mask, reduction='sum') offset_loss = offset_loss / (pos_num + 1e-4) # 4.iou head loss: GIoU loss (optinal) if self.add_iou and 'iou' in self.loss_weight: iou = head_outs['iou'] iou = paddle.transpose(iou, perm=[0, 2, 3, 1]) iou_n, _, _, iou_c = iou.shape iou = paddle.reshape(iou, shape=[iou_n, -1, iou_c]) pos_iou = paddle.gather_nd(iou, index=index) iou_mask = paddle.expand_as(mask, pos_iou) iou_mask = paddle.cast(iou_mask, dtype=pos_iou.dtype) pos_num = iou_mask.sum() iou_mask.stop_gradient = True gt_bbox_xys = inputs['bbox_xys'] gt_bbox_xys.stop_gradient = True centers_x = (gt_bbox_xys[:, :, 0:1] + gt_bbox_xys[:, :, 2:3]) / 2.0 centers_y = (gt_bbox_xys[:, :, 1:2] + gt_bbox_xys[:, :, 3:4]) / 2.0 x1 = centers_x - pos_size[:, :, 0:1] y1 = centers_y - pos_size[:, :, 1:2] x2 = 
centers_x + pos_size[:, :, 2:3] y2 = centers_y + pos_size[:, :, 3:4] pred_boxes = paddle.concat([x1, y1, x2, y2], axis=-1) giou_loss = GIoULoss(reduction='sum') iou_loss = giou_loss( pred_boxes * iou_mask, gt_bbox_xys * iou_mask, iou_weight=iou_mask, loc_reweight=None) iou_loss = iou_loss / (pos_num + 1e-4) losses = { 'heatmap_loss': heatmap_loss, 'size_loss': size_loss, 'offset_loss': offset_loss, } det_loss = weights['heatmap'] * heatmap_loss + weights[ 'size'] * size_loss + weights['offset'] * offset_loss if self.add_iou and 'iou' in self.loss_weight: losses.update({'iou_loss': iou_loss}) det_loss += weights['iou'] * iou_loss losses.update({'det_loss': det_loss}) return losses ================================================ FILE: ppdet/modeling/heads/centertrack_head.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from .centernet_head import ConvLayer from ..keypoint_utils import get_affine_transform __all__ = ['CenterTrackHead'] @register class CenterTrackHead(nn.Layer): """ Args: in_channels (int): the channel number of input to CenterNetHead. num_classes (int): the number of classes, 1 (MOT17 dataset) by default. head_planes (int): the channel number in all head, 256 by default. task (str): the type of task for regression, 'tracking' by default. loss_weight (dict): the weight of each loss. add_ltrb_amodal (bool): whether to add ltrb_amodal branch, False by default. 
""" __shared__ = ['num_classes'] def __init__(self, in_channels, num_classes=1, head_planes=256, task='tracking', loss_weight={ 'tracking': 1.0, 'ltrb_amodal': 0.1, }, add_ltrb_amodal=True): super(CenterTrackHead, self).__init__() self.task = task self.loss_weight = loss_weight self.add_ltrb_amodal = add_ltrb_amodal # tracking head self.tracking = nn.Sequential( ConvLayer( in_channels, head_planes, kernel_size=3, padding=1, bias=True), nn.ReLU(), ConvLayer( head_planes, 2, kernel_size=1, stride=1, padding=0, bias=True)) # ltrb_amodal head if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight: self.ltrb_amodal = nn.Sequential( ConvLayer( in_channels, head_planes, kernel_size=3, padding=1, bias=True), nn.ReLU(), ConvLayer( head_planes, 4, kernel_size=1, stride=1, padding=0, bias=True)) # TODO: add more tasks @classmethod def from_config(cls, cfg, input_shape): if isinstance(input_shape, (list, tuple)): input_shape = input_shape[0] return {'in_channels': input_shape.channels} def forward(self, feat, inputs, bboxes=None, bbox_inds=None, topk_clses=None, topk_ys=None, topk_xs=None): tracking = self.tracking(feat) head_outs = {'tracking': tracking} if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight: ltrb_amodal = self.ltrb_amodal(feat) head_outs.update({'ltrb_amodal': ltrb_amodal}) if self.training: losses = self.get_loss(inputs, self.loss_weight, head_outs) return losses else: ret = self.generic_decode(head_outs, bboxes, bbox_inds, topk_ys, topk_xs) return ret def get_loss(self, inputs, weights, head_outs): index = inputs['index'].unsqueeze(2) mask = inputs['index_mask'].unsqueeze(2) batch_inds = list() for i in range(head_outs['tracking'].shape[0]): batch_ind = paddle.full( shape=[1, index.shape[1], 1], fill_value=i, dtype='int64') batch_inds.append(batch_ind) batch_inds = paddle.concat(batch_inds, axis=0) index = paddle.concat(x=[batch_inds, index], axis=2) # 1.tracking head loss: L1 loss tracking = head_outs['tracking'].transpose([0, 2, 3, 1]) tracking_target = inputs['tracking'] bs, _, _, c = tracking.shape tracking = tracking.reshape([bs, -1, c]) pos_tracking = paddle.gather_nd(tracking, index=index) tracking_mask = paddle.cast( paddle.expand_as(mask, pos_tracking), dtype=pos_tracking.dtype) pos_num = tracking_mask.sum() tracking_mask.stop_gradient = True tracking_target.stop_gradient = True tracking_loss = F.l1_loss( pos_tracking * tracking_mask, tracking_target * tracking_mask, reduction='sum') tracking_loss = tracking_loss / (pos_num + 1e-4) # 2.ltrb_amodal head loss(optinal): L1 loss if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight: ltrb_amodal = head_outs['ltrb_amodal'].transpose([0, 2, 3, 1]) ltrb_amodal_target = inputs['ltrb_amodal'] bs, _, _, c = ltrb_amodal.shape ltrb_amodal = ltrb_amodal.reshape([bs, -1, c]) pos_ltrb_amodal = paddle.gather_nd(ltrb_amodal, index=index) ltrb_amodal_mask = paddle.cast( paddle.expand_as(mask, pos_ltrb_amodal), dtype=pos_ltrb_amodal.dtype) pos_num = ltrb_amodal_mask.sum() ltrb_amodal_mask.stop_gradient = True ltrb_amodal_target.stop_gradient = True ltrb_amodal_loss = F.l1_loss( pos_ltrb_amodal * ltrb_amodal_mask, ltrb_amodal_target * ltrb_amodal_mask, reduction='sum') ltrb_amodal_loss = ltrb_amodal_loss / (pos_num + 1e-4) losses = {'tracking_loss': tracking_loss, } plugin_loss = weights['tracking'] * tracking_loss if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight: losses.update({'ltrb_amodal_loss': ltrb_amodal_loss}) plugin_loss += weights['ltrb_amodal'] * ltrb_amodal_loss losses.update({'plugin_loss': 
plugin_loss}) return losses def generic_decode(self, head_outs, bboxes, bbox_inds, topk_ys, topk_xs): topk_ys = paddle.floor(topk_ys) # note: More accurate topk_xs = paddle.floor(topk_xs) cts = paddle.concat([topk_xs, topk_ys], 1) ret = {'bboxes': bboxes, 'cts': cts} regression_heads = ['tracking'] # todo: add more tasks for head in regression_heads: if head in head_outs: ret[head] = _tranpose_and_gather_feat(head_outs[head], bbox_inds) if 'ltrb_amodal' in head_outs: ltrb_amodal = head_outs['ltrb_amodal'] ltrb_amodal = _tranpose_and_gather_feat(ltrb_amodal, bbox_inds) bboxes_amodal = paddle.concat( [ topk_xs * 1.0 + ltrb_amodal[..., 0:1], topk_ys * 1.0 + ltrb_amodal[..., 1:2], topk_xs * 1.0 + ltrb_amodal[..., 2:3], topk_ys * 1.0 + ltrb_amodal[..., 3:4] ], axis=1) ret['bboxes'] = paddle.concat([bboxes[:, 0:2], bboxes_amodal], 1) # cls_id, score, x0, y0, x1, y1 return ret def centertrack_post_process(self, dets, meta, out_thresh): if not ('bboxes' in dets): return [{}] preds = [] c, s = meta['center'].numpy(), meta['scale'].numpy() h, w = meta['out_height'].numpy(), meta['out_width'].numpy() trans = get_affine_transform( center=c[0], input_size=s[0], rot=0, output_size=[w[0], h[0]], shift=(0., 0.), inv=True).astype(np.float32) for i, dets_bbox in enumerate(dets['bboxes']): if dets_bbox[1] < out_thresh: break item = {} item['score'] = dets_bbox[1] item['class'] = int(dets_bbox[0]) + 1 item['ct'] = transform_preds_with_trans( dets['cts'][i].reshape([1, 2]), trans).reshape(2) if 'tracking' in dets: tracking = transform_preds_with_trans( (dets['tracking'][i] + dets['cts'][i]).reshape([1, 2]), trans).reshape(2) item['tracking'] = tracking - item['ct'] if 'bboxes' in dets: bbox = transform_preds_with_trans( dets_bbox[2:6].reshape([2, 2]), trans).reshape(4) item['bbox'] = bbox preds.append(item) return preds def transform_preds_with_trans(coords, trans): target_coords = np.ones((coords.shape[0], 3), np.float32) target_coords[:, :2] = coords target_coords = np.dot(trans, target_coords.transpose()).transpose() return target_coords[:, :2] def _tranpose_and_gather_feat(feat, bbox_inds): feat = feat.transpose([0, 2, 3, 1]) feat = feat.reshape([-1, feat.shape[3]]) feat = paddle.gather(feat, bbox_inds) return feat ================================================ FILE: ppdet/modeling/heads/clrnet_head.py ================================================ import math import paddle import numpy as np import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling.initializer import normal_ from ppdet.modeling.lane_utils import Lane from ppdet.modeling.losses import line_iou from ppdet.modeling.clrnet_utils import ROIGather, LinearModule, SegDecoder __all__ = ['CLRHead'] @register class CLRHead(nn.Layer): __inject__ = ['loss'] __shared__ = [ 'img_w', 'img_h', 'ori_img_h', 'num_classes', 'cut_height', 'num_points', "max_lanes" ] def __init__(self, num_points=72, prior_feat_channels=64, fc_hidden_dim=64, num_priors=192, img_w=800, img_h=320, ori_img_h=590, cut_height=270, num_classes=5, num_fc=2, refine_layers=3, sample_points=36, conf_threshold=0.4, nms_thres=0.5, max_lanes=4, loss='CLRNetLoss'): super(CLRHead, self).__init__() self.img_w = img_w self.img_h = img_h self.n_strips = num_points - 1 self.n_offsets = num_points self.num_priors = num_priors self.sample_points = sample_points self.refine_layers = refine_layers self.num_classes = num_classes self.fc_hidden_dim = fc_hidden_dim self.ori_img_h = ori_img_h self.cut_height = cut_height self.conf_threshold 
= conf_threshold self.nms_thres = nms_thres self.max_lanes = max_lanes self.prior_feat_channels = prior_feat_channels self.loss = loss self.register_buffer( name='sample_x_indexs', tensor=(paddle.linspace( start=0, stop=1, num=self.sample_points, dtype=paddle.float32) * self.n_strips).astype(dtype='int64')) self.register_buffer( name='prior_feat_ys', tensor=paddle.flip( x=(1 - self.sample_x_indexs.astype('float32') / self.n_strips), axis=[-1])) self.register_buffer( name='prior_ys', tensor=paddle.linspace( start=1, stop=0, num=self.n_offsets).astype('float32')) self.prior_feat_channels = prior_feat_channels self._init_prior_embeddings() init_priors, priors_on_featmap = self.generate_priors_from_embeddings() self.register_buffer(name='priors', tensor=init_priors) self.register_buffer(name='priors_on_featmap', tensor=priors_on_featmap) self.seg_decoder = SegDecoder(self.img_h, self.img_w, self.num_classes, self.prior_feat_channels, self.refine_layers) reg_modules = list() cls_modules = list() for _ in range(num_fc): reg_modules += [*LinearModule(self.fc_hidden_dim)] cls_modules += [*LinearModule(self.fc_hidden_dim)] self.reg_modules = nn.LayerList(sublayers=reg_modules) self.cls_modules = nn.LayerList(sublayers=cls_modules) self.roi_gather = ROIGather(self.prior_feat_channels, self.num_priors, self.sample_points, self.fc_hidden_dim, self.refine_layers) self.reg_layers = nn.Linear( in_features=self.fc_hidden_dim, out_features=self.n_offsets + 1 + 2 + 1, bias_attr=True) self.cls_layers = nn.Linear( in_features=self.fc_hidden_dim, out_features=2, bias_attr=True) self.init_weights() def init_weights(self): for m in self.cls_layers.parameters(): normal_(m, mean=0.0, std=0.001) for m in self.reg_layers.parameters(): normal_(m, mean=0.0, std=0.001) def pool_prior_features(self, batch_features, num_priors, prior_xs): """ pool prior feature from feature map. 
Args: batch_features (Tensor): Input feature maps, shape: (B, C, H, W) """ batch_size = batch_features.shape[0] prior_xs = prior_xs.reshape([batch_size, num_priors, -1, 1]) prior_ys = self.prior_feat_ys.tile(repeat_times=[ batch_size * num_priors ]).reshape([batch_size, num_priors, -1, 1]) prior_xs = prior_xs * 2.0 - 1.0 prior_ys = prior_ys * 2.0 - 1.0 grid = paddle.concat(x=(prior_xs, prior_ys), axis=-1) feature = F.grid_sample( x=batch_features, grid=grid, align_corners=True).transpose(perm=[0, 2, 1, 3]) feature = feature.reshape([ batch_size * num_priors, self.prior_feat_channels, self.sample_points, 1 ]) return feature def generate_priors_from_embeddings(self): predictions = self.prior_embeddings.weight # 2 scores, 1 start_y, 1 start_x, 1 theta, 1 length, 72 coordinates, score[0] = negative prob, score[1] = positive prob priors = paddle.zeros( (self.num_priors, 2 + 2 + 2 + self.n_offsets), dtype=predictions.dtype) priors[:, 2:5] = predictions.clone() priors[:, 6:] = ( priors[:, 3].unsqueeze(1).clone().tile([1, self.n_offsets]) * (self.img_w - 1) + ((1 - self.prior_ys.tile([self.num_priors, 1]) - priors[:, 2].unsqueeze(1).clone().tile([1, self.n_offsets])) * self.img_h / paddle.tan(x=priors[:, 4].unsqueeze(1).clone().tile( [1, self.n_offsets]) * math.pi + 1e-05))) / (self.img_w - 1) priors_on_featmap = paddle.index_select( priors, 6 + self.sample_x_indexs, axis=-1) return priors, priors_on_featmap def _init_prior_embeddings(self): self.prior_embeddings = nn.Embedding(self.num_priors, 3) bottom_priors_nums = self.num_priors * 3 // 4 left_priors_nums, _ = self.num_priors // 8, self.num_priors // 8 strip_size = 0.5 / (left_priors_nums // 2 - 1) bottom_strip_size = 1 / (bottom_priors_nums // 4 + 1) with paddle.no_grad(): for i in range(left_priors_nums): self.prior_embeddings.weight[i, 0] = i // 2 * strip_size self.prior_embeddings.weight[i, 1] = 0.0 self.prior_embeddings.weight[i, 2] = 0.16 if i % 2 == 0 else 0.32 for i in range(left_priors_nums, left_priors_nums + bottom_priors_nums): self.prior_embeddings.weight[i, 0] = 0.0 self.prior_embeddings.weight[i, 1] = ( (i - left_priors_nums) // 4 + 1) * bottom_strip_size self.prior_embeddings.weight[i, 2] = 0.2 * (i % 4 + 1) for i in range(left_priors_nums + bottom_priors_nums, self.num_priors): self.prior_embeddings.weight[i, 0] = ( i - left_priors_nums - bottom_priors_nums) // 2 * strip_size self.prior_embeddings.weight[i, 1] = 1.0 self.prior_embeddings.weight[i, 2] = 0.68 if i % 2 == 0 else 0.84 def forward(self, x, inputs=None): """ Take pyramid features as input to perform Cross Layer Refinement and finally output the prediction lanes. Each feature is a 4D tensor. 
Args: x: input features (list[Tensor]) Return: prediction_list: each layer's prediction result seg: segmentation result for auxiliary loss """ batch_features = list(x[len(x) - self.refine_layers:]) batch_features.reverse() batch_size = batch_features[-1].shape[0] if self.training: self.priors, self.priors_on_featmap = self.generate_priors_from_embeddings( ) priors, priors_on_featmap = self.priors.tile( [batch_size, 1, 1]), self.priors_on_featmap.tile([batch_size, 1, 1]) predictions_lists = [] prior_features_stages = [] for stage in range(self.refine_layers): num_priors = priors_on_featmap.shape[1] prior_xs = paddle.flip(x=priors_on_featmap, axis=[2]) batch_prior_features = self.pool_prior_features( batch_features[stage], num_priors, prior_xs) prior_features_stages.append(batch_prior_features) fc_features = self.roi_gather(prior_features_stages, batch_features[stage], stage) # return fc_features fc_features = fc_features.reshape( [num_priors, batch_size, -1]).reshape( [batch_size * num_priors, self.fc_hidden_dim]) cls_features = fc_features.clone() reg_features = fc_features.clone() for cls_layer in self.cls_modules: cls_features = cls_layer(cls_features) # return cls_features for reg_layer in self.reg_modules: reg_features = reg_layer(reg_features) cls_logits = self.cls_layers(cls_features) reg = self.reg_layers(reg_features) cls_logits = cls_logits.reshape( [batch_size, -1, cls_logits.shape[1]]) reg = reg.reshape([batch_size, -1, reg.shape[1]]) predictions = priors.clone() predictions[:, :, :2] = cls_logits predictions[:, :, 2:5] += reg[:, :, :3] predictions[:, :, 5] = reg[:, :, 3] def tran_tensor(t): return t.unsqueeze(axis=2).clone().tile([1, 1, self.n_offsets]) predictions[..., 6:] = ( tran_tensor(predictions[..., 3]) * (self.img_w - 1) + ((1 - self.prior_ys.tile([batch_size, num_priors, 1]) - tran_tensor(predictions[..., 2])) * self.img_h / paddle.tan( tran_tensor(predictions[..., 4]) * math.pi + 1e-05))) / ( self.img_w - 1) prediction_lines = predictions.clone() predictions[..., 6:] += reg[..., 4:] predictions_lists.append(predictions) if stage != self.refine_layers - 1: priors = prediction_lines.detach().clone() priors_on_featmap = priors.index_select( 6 + self.sample_x_indexs, axis=-1) if self.training: seg = None seg_features = paddle.concat( [ F.interpolate( feature, size=[ batch_features[-1].shape[2], batch_features[-1].shape[3] ], mode='bilinear', align_corners=False) for feature in batch_features ], axis=1) seg = self.seg_decoder(seg_features) output = {'predictions_lists': predictions_lists, 'seg': seg} return self.loss(output, inputs) return predictions_lists[-1] def predictions_to_pred(self, predictions): """ Convert predictions to internal Lane structure for evaluation. """ self.prior_ys = paddle.to_tensor(self.prior_ys) self.prior_ys = self.prior_ys.astype('float64') lanes = [] for lane in predictions: lane_xs = lane[6:].clone() start = min( max(0, int(round(lane[2].item() * self.n_strips))), self.n_strips) length = int(round(lane[5].item())) end = start + length - 1 end = min(end, len(self.prior_ys) - 1) if start > 0: mask = ((lane_xs[:start] >= 0.) 
& (lane_xs[:start] <= 1.)).cpu().detach().numpy()[::-1] mask = ~((mask.cumprod()[::-1]).astype(np.bool_)) lane_xs[:start][mask] = -2 if end < len(self.prior_ys) - 1: lane_xs[end + 1:] = -2 lane_ys = self.prior_ys[lane_xs >= 0].clone() lane_xs = lane_xs[lane_xs >= 0] lane_xs = lane_xs.flip(axis=0).astype('float64') lane_ys = lane_ys.flip(axis=0) lane_ys = (lane_ys * (self.ori_img_h - self.cut_height) + self.cut_height ) / self.ori_img_h if len(lane_xs) <= 1: continue points = paddle.stack( x=(lane_xs.reshape([-1, 1]), lane_ys.reshape([-1, 1])), axis=1).squeeze(axis=2) lane = Lane( points=points.cpu().numpy(), metadata={ 'start_x': lane[3], 'start_y': lane[2], 'conf': lane[1] }) lanes.append(lane) return lanes def lane_nms(self, predictions, scores, nms_overlap_thresh, top_k): """ NMS for lane detection. predictions: paddle.Tensor [num_lanes,conf,y,x,lenght,72offsets] [12,77] scores: paddle.Tensor [num_lanes] nms_overlap_thresh: float top_k: int """ # sort by scores to get idx idx = scores.argsort(descending=True) keep = [] condidates = predictions.clone() condidates = condidates.index_select(idx) while len(condidates) > 0: keep.append(idx[0]) if len(keep) >= top_k or len(condidates) == 1: break ious = [] for i in range(1, len(condidates)): ious.append(1 - line_iou( condidates[i].unsqueeze(0), condidates[0].unsqueeze(0), img_w=self.img_w, length=15)) ious = paddle.to_tensor(ious) mask = ious <= nms_overlap_thresh id = paddle.where(mask == False)[0] if id.shape[0] == 0: break condidates = condidates[1:].index_select(id) idx = idx[1:].index_select(id) keep = paddle.stack(keep) return keep def get_lanes(self, output, as_lanes=True): """ Convert model output to lanes. """ softmax = nn.Softmax(axis=1) decoded = [] for predictions in output: threshold = self.conf_threshold scores = softmax(predictions[:, :2])[:, 1] keep_inds = scores >= threshold predictions = predictions[keep_inds] scores = scores[keep_inds] if predictions.shape[0] == 0: decoded.append([]) continue nms_predictions = predictions.detach().clone() nms_predictions = paddle.concat( x=[nms_predictions[..., :4], nms_predictions[..., 5:]], axis=-1) nms_predictions[..., 4] = nms_predictions[..., 4] * self.n_strips nms_predictions[..., 5:] = nms_predictions[..., 5:] * ( self.img_w - 1) keep = self.lane_nms( nms_predictions[..., 5:], scores, nms_overlap_thresh=self.nms_thres, top_k=self.max_lanes) predictions = predictions.index_select(keep) if predictions.shape[0] == 0: decoded.append([]) continue predictions[:, 5] = paddle.round(predictions[:, 5] * self.n_strips) if as_lanes: pred = self.predictions_to_pred(predictions) else: pred = predictions decoded.append(pred) return decoded ================================================ FILE: ppdet/modeling/heads/detr_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
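# --- Editor's illustrative sketch (not part of the original file): the
# DETR-style heads below refine normalized box coordinates in logit space
# via `inverse_sigmoid`, imported from ..transformers.utils. Its standard
# form is a clipped logit; a minimal version for reference:
#
#   import paddle
#
#   def inverse_sigmoid_sketch(x, eps=1e-5):
#       x = x.clip(min=0., max=1.)
#       return paddle.log(x.clip(min=eps) / (1. - x).clip(min=eps))
#
# so that F.sigmoid(inverse_sigmoid(ref) + delta) nudges a reference point
# `ref` by a predicted offset, as in DeformableDETRHead.forward.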
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register import pycocotools.mask as mask_util from ..initializer import linear_init_, constant_ from ..transformers.utils import inverse_sigmoid __all__ = ['DETRHead', 'DeformableDETRHead', 'DINOHead', 'MaskDINOHead', 'DINOv3Head'] class MLP(nn.Layer): """This code is based on https://github.com/facebookresearch/detr/blob/main/models/detr.py """ def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) self.layers = nn.LayerList( nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) self._reset_parameters() def _reset_parameters(self): for l in self.layers: linear_init_(l) def forward(self, x): for i, layer in enumerate(self.layers): x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) return x class MultiHeadAttentionMap(nn.Layer): """This code is based on https://github.com/facebookresearch/detr/blob/main/models/segmentation.py This is a 2D attention module, which only returns the attention softmax (no multiplication by value) """ def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True): super().__init__() self.num_heads = num_heads self.hidden_dim = hidden_dim self.dropout = nn.Dropout(dropout) weight_attr = paddle.ParamAttr( initializer=paddle.nn.initializer.XavierUniform()) bias_attr = paddle.framework.ParamAttr( initializer=paddle.nn.initializer.Constant()) if bias else False self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr) self.k_proj = nn.Conv2D( query_dim, hidden_dim, 1, weight_attr=weight_attr, bias_attr=bias_attr) self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5 def forward(self, q, k, mask=None): q = self.q_proj(q) k = self.k_proj(k) bs, num_queries, n, c, h, w = q.shape[0], q.shape[1], self.num_heads,\ self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1] qh = q.reshape([bs, num_queries, n, c]) kh = k.reshape([bs, n, c, h, w]) # weights = paddle.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh) qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, c]) kh = kh.reshape([-1, c, h * w]) weights = paddle.bmm(qh * self.normalize_fact, kh).reshape( [bs, n, num_queries, h, w]).transpose([0, 2, 1, 3, 4]) if mask is not None: weights += mask # fix a potenial bug: https://github.com/facebookresearch/detr/issues/247 weights = F.softmax(weights.flatten(3), axis=-1).reshape(weights.shape) weights = self.dropout(weights) return weights class MaskHeadFPNConv(nn.Layer): """This code is based on https://github.com/facebookresearch/detr/blob/main/models/segmentation.py Simple convolutional head, using group norm. 
Upsampling is done using a FPN approach """ def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8): super().__init__() inter_dims = [input_dim, ] + [context_dim // (2**i) for i in range(1, 5)] weight_attr = paddle.ParamAttr( initializer=paddle.nn.initializer.KaimingUniform()) bias_attr = paddle.framework.ParamAttr( initializer=paddle.nn.initializer.Constant()) self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups, weight_attr, bias_attr) self.conv_inter = nn.LayerList() for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:]): self.conv_inter.append( self._make_layers(in_dims, out_dims, 3, num_groups, weight_attr, bias_attr)) self.conv_out = nn.Conv2D( inter_dims[-1], 1, 3, padding=1, weight_attr=weight_attr, bias_attr=bias_attr) self.adapter = nn.LayerList() for i in range(len(fpn_dims)): self.adapter.append( nn.Conv2D( fpn_dims[i], inter_dims[i + 1], 1, weight_attr=weight_attr, bias_attr=bias_attr)) def _make_layers(self, in_dims, out_dims, kernel_size, num_groups, weight_attr=None, bias_attr=None): return nn.Sequential( nn.Conv2D( in_dims, out_dims, kernel_size, padding=kernel_size // 2, weight_attr=weight_attr, bias_attr=bias_attr), nn.GroupNorm(num_groups, out_dims), nn.ReLU()) def forward(self, x, bbox_attention_map, fpns): x = paddle.concat([ x.tile([bbox_attention_map.shape[1], 1, 1, 1]), bbox_attention_map.flatten(0, 1) ], 1) x = self.conv0(x) for inter_layer, adapter_layer, feat in zip(self.conv_inter[:-1], self.adapter, fpns): feat = adapter_layer(feat).tile( [bbox_attention_map.shape[1], 1, 1, 1]) x = inter_layer(x) x = feat + F.interpolate(x, size=feat.shape[-2:]) x = self.conv_inter[-1](x) x = self.conv_out(x) return x @register class DETRHead(nn.Layer): __shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss'] __inject__ = ['loss'] def __init__(self, num_classes=80, hidden_dim=256, nhead=8, num_mlp_layers=3, loss='DETRLoss', fpn_dims=[1024, 512, 256], with_mask_head=False, use_focal_loss=False): super(DETRHead, self).__init__() # add background class self.num_classes = num_classes if use_focal_loss else num_classes + 1 self.hidden_dim = hidden_dim self.loss = loss self.with_mask_head = with_mask_head self.use_focal_loss = use_focal_loss self.score_head = nn.Linear(hidden_dim, self.num_classes) self.bbox_head = MLP(hidden_dim, hidden_dim, output_dim=4, num_layers=num_mlp_layers) if self.with_mask_head: self.bbox_attention = MultiHeadAttentionMap(hidden_dim, hidden_dim, nhead) self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims, hidden_dim) self._reset_parameters() def _reset_parameters(self): linear_init_(self.score_head) @classmethod def from_config(cls, cfg, hidden_dim, nhead, input_shape): return { 'hidden_dim': hidden_dim, 'nhead': nhead, 'fpn_dims': [i.channels for i in input_shape[::-1]][1:] } @staticmethod def get_gt_mask_from_polygons(gt_poly, pad_mask): out_gt_mask = [] for polygons, padding in zip(gt_poly, pad_mask): height, width = int(padding[:, 0].sum()), int(padding[0, :].sum()) masks = [] for obj_poly in polygons: rles = mask_util.frPyObjects(obj_poly, height, width) rle = mask_util.merge(rles) masks.append( paddle.to_tensor(mask_util.decode(rle)).astype('float32')) masks = paddle.stack(masks) masks_pad = paddle.zeros( [masks.shape[0], pad_mask.shape[1], pad_mask.shape[2]]) masks_pad[:, :height, :width] = masks out_gt_mask.append(masks_pad) return out_gt_mask def forward(self, out_transformer, body_feats, inputs=None): r""" Args: out_transformer (Tuple): (feats: [num_levels, batch_size, num_queries, hidden_dim], 
memory: [batch_size, hidden_dim, h, w], src_proj: [batch_size, h*w, hidden_dim], src_mask: [batch_size, 1, 1, h, w]) body_feats (List(Tensor)): list[[B, C, H, W]] inputs (dict): dict(inputs) """ feats, memory, src_proj, src_mask = out_transformer outputs_logit = self.score_head(feats) outputs_bbox = F.sigmoid(self.bbox_head(feats)) outputs_seg = None if self.with_mask_head: bbox_attention_map = self.bbox_attention(feats[-1], memory, src_mask) fpn_feats = [a for a in body_feats[::-1]][1:] outputs_seg = self.mask_head(src_proj, bbox_attention_map, fpn_feats) outputs_seg = outputs_seg.reshape([ feats.shape[1], feats.shape[2], outputs_seg.shape[-2], outputs_seg.shape[-1] ]) if self.training: assert inputs is not None assert 'gt_bbox' in inputs and 'gt_class' in inputs gt_mask = self.get_gt_mask_from_polygons( inputs['gt_poly'], inputs['pad_mask']) if 'gt_poly' in inputs else None return self.loss( outputs_bbox, outputs_logit, inputs['gt_bbox'], inputs['gt_class'], masks=outputs_seg, gt_mask=gt_mask) else: return (outputs_bbox[-1], outputs_logit[-1], outputs_seg) @register class DeformableDETRHead(nn.Layer): __shared__ = ['num_classes', 'hidden_dim'] __inject__ = ['loss'] def __init__(self, num_classes=80, hidden_dim=512, nhead=8, num_mlp_layers=3, loss='DETRLoss'): super(DeformableDETRHead, self).__init__() self.num_classes = num_classes self.hidden_dim = hidden_dim self.nhead = nhead self.loss = loss self.score_head = nn.Linear(hidden_dim, self.num_classes) self.bbox_head = MLP(hidden_dim, hidden_dim, output_dim=4, num_layers=num_mlp_layers) self._reset_parameters() def _reset_parameters(self): linear_init_(self.score_head) constant_(self.score_head.bias, -4.595) constant_(self.bbox_head.layers[-1].weight) with paddle.no_grad(): bias = paddle.zeros_like(self.bbox_head.layers[-1].bias) bias[2:] = -2.0 self.bbox_head.layers[-1].bias.set_value(bias) @classmethod def from_config(cls, cfg, hidden_dim, nhead, input_shape): return {'hidden_dim': hidden_dim, 'nhead': nhead} def forward(self, out_transformer, body_feats, inputs=None): r""" Args: out_transformer (Tuple): (feats: [num_levels, batch_size, num_queries, hidden_dim], memory: [batch_size, \sum_{l=0}^{L-1} H_l \cdot W_l, hidden_dim], reference_points: [batch_size, num_queries, 2]) body_feats (List(Tensor)): list[[B, C, H, W]] inputs (dict): dict(inputs) """ feats, memory, reference_points = out_transformer reference_points = inverse_sigmoid(reference_points.unsqueeze(0)) outputs_bbox = self.bbox_head(feats) # It's equivalent to "outputs_bbox[:, :, :, :2] += reference_points", # but the gradient is wrong in paddle. 
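# In-place slice updates such as `outputs_bbox[:, :, :, :2] += reference_points`
# can produce wrong gradients in Paddle (per the note above), so the offset is
# applied by rebuilding the tensor with concat instead. reference_points were
# mapped to logit space with inverse_sigmoid above, so after adding the raw
# bbox deltas the result is squashed back to [0, 1] by the sigmoid below.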
outputs_bbox = paddle.concat( [ outputs_bbox[:, :, :, :2] + reference_points, outputs_bbox[:, :, :, 2:] ], axis=-1) outputs_bbox = F.sigmoid(outputs_bbox) outputs_logit = self.score_head(feats) if self.training: assert inputs is not None assert 'gt_bbox' in inputs and 'gt_class' in inputs return self.loss(outputs_bbox, outputs_logit, inputs['gt_bbox'], inputs['gt_class']) else: return (outputs_bbox[-1], outputs_logit[-1], None) @register class DINOHead(nn.Layer): __inject__ = ['loss'] def __init__(self, loss='DINOLoss', eval_idx=-1): super(DINOHead, self).__init__() self.loss = loss self.eval_idx = eval_idx def forward(self, out_transformer, body_feats, inputs=None): (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits, dn_meta) = out_transformer if self.training: assert inputs is not None assert 'gt_bbox' in inputs and 'gt_class' in inputs if dn_meta is not None: if isinstance(dn_meta, list): dual_groups = len(dn_meta) - 1 dec_out_bboxes = paddle.split( dec_out_bboxes, dual_groups + 1, axis=2) dec_out_logits = paddle.split( dec_out_logits, dual_groups + 1, axis=2) enc_topk_bboxes = paddle.split( enc_topk_bboxes, dual_groups + 1, axis=1) enc_topk_logits = paddle.split( enc_topk_logits, dual_groups + 1, axis=1) dec_out_bboxes_list = [] dec_out_logits_list = [] dn_out_bboxes_list = [] dn_out_logits_list = [] loss = {} for g_id in range(dual_groups + 1): if dn_meta[g_id] is not None: dn_out_bboxes_gid, dec_out_bboxes_gid = paddle.split( dec_out_bboxes[g_id], dn_meta[g_id]['dn_num_split'], axis=2) dn_out_logits_gid, dec_out_logits_gid = paddle.split( dec_out_logits[g_id], dn_meta[g_id]['dn_num_split'], axis=2) else: dn_out_bboxes_gid, dn_out_logits_gid = None, None dec_out_bboxes_gid = dec_out_bboxes[g_id] dec_out_logits_gid = dec_out_logits[g_id] out_bboxes_gid = paddle.concat([ enc_topk_bboxes[g_id].unsqueeze(0), dec_out_bboxes_gid ]) out_logits_gid = paddle.concat([ enc_topk_logits[g_id].unsqueeze(0), dec_out_logits_gid ]) loss_gid = self.loss( out_bboxes_gid, out_logits_gid, inputs['gt_bbox'], inputs['gt_class'], dn_out_bboxes=dn_out_bboxes_gid, dn_out_logits=dn_out_logits_gid, dn_meta=dn_meta[g_id]) # sum loss for key, value in loss_gid.items(): loss.update({ key: loss.get(key, paddle.zeros([1])) + value }) # average across (dual_groups + 1) for key, value in loss.items(): loss.update({key: value / (dual_groups + 1)}) return loss else: dn_out_bboxes, dec_out_bboxes = paddle.split( dec_out_bboxes, dn_meta['dn_num_split'], axis=2) dn_out_logits, dec_out_logits = paddle.split( dec_out_logits, dn_meta['dn_num_split'], axis=2) else: dn_out_bboxes, dn_out_logits = None, None out_bboxes = paddle.concat( [enc_topk_bboxes.unsqueeze(0), dec_out_bboxes]) out_logits = paddle.concat( [enc_topk_logits.unsqueeze(0), dec_out_logits]) return self.loss( out_bboxes, out_logits, inputs['gt_bbox'], inputs['gt_class'], dn_out_bboxes=dn_out_bboxes, dn_out_logits=dn_out_logits, dn_meta=dn_meta, gt_score=inputs.get('gt_score', None)) else: return (dec_out_bboxes[self.eval_idx], dec_out_logits[self.eval_idx], None) @register class MaskDINOHead(nn.Layer): __inject__ = ['loss'] def __init__(self, loss='DINOLoss'): super(MaskDINOHead, self).__init__() self.loss = loss def forward(self, out_transformer, body_feats, inputs=None): (dec_out_logits, dec_out_bboxes, dec_out_masks, enc_out, init_out, dn_meta) = out_transformer if self.training: assert inputs is not None assert 'gt_bbox' in inputs and 'gt_class' in inputs assert 'gt_segm' in inputs if dn_meta is not None: dn_out_logits, dec_out_logits = 
paddle.split( dec_out_logits, dn_meta['dn_num_split'], axis=2) dn_out_bboxes, dec_out_bboxes = paddle.split( dec_out_bboxes, dn_meta['dn_num_split'], axis=2) dn_out_masks, dec_out_masks = paddle.split( dec_out_masks, dn_meta['dn_num_split'], axis=2) if init_out is not None: init_out_logits, init_out_bboxes, init_out_masks = init_out init_out_logits_dn, init_out_logits = paddle.split( init_out_logits, dn_meta['dn_num_split'], axis=1) init_out_bboxes_dn, init_out_bboxes = paddle.split( init_out_bboxes, dn_meta['dn_num_split'], axis=1) init_out_masks_dn, init_out_masks = paddle.split( init_out_masks, dn_meta['dn_num_split'], axis=1) dec_out_logits = paddle.concat( [init_out_logits.unsqueeze(0), dec_out_logits]) dec_out_bboxes = paddle.concat( [init_out_bboxes.unsqueeze(0), dec_out_bboxes]) dec_out_masks = paddle.concat( [init_out_masks.unsqueeze(0), dec_out_masks]) dn_out_logits = paddle.concat( [init_out_logits_dn.unsqueeze(0), dn_out_logits]) dn_out_bboxes = paddle.concat( [init_out_bboxes_dn.unsqueeze(0), dn_out_bboxes]) dn_out_masks = paddle.concat( [init_out_masks_dn.unsqueeze(0), dn_out_masks]) else: dn_out_bboxes, dn_out_logits = None, None dn_out_masks = None enc_out_logits, enc_out_bboxes, enc_out_masks = enc_out out_logits = paddle.concat( [enc_out_logits.unsqueeze(0), dec_out_logits]) out_bboxes = paddle.concat( [enc_out_bboxes.unsqueeze(0), dec_out_bboxes]) out_masks = paddle.concat( [enc_out_masks.unsqueeze(0), dec_out_masks]) inputs['gt_segm'] = [gt_segm.astype(out_masks.dtype) for gt_segm in inputs['gt_segm']] return self.loss( out_bboxes, out_logits, inputs['gt_bbox'], inputs['gt_class'], masks=out_masks, gt_mask=inputs['gt_segm'], dn_out_logits=dn_out_logits, dn_out_bboxes=dn_out_bboxes, dn_out_masks=dn_out_masks, dn_meta=dn_meta) else: return (dec_out_bboxes[-1], dec_out_logits[-1], dec_out_masks[-1]) @register class DINOv3Head(nn.Layer): __inject__ = ['loss'] __shared__ = ['o2m_branch', 'num_queries_o2m'] def __init__(self, loss='DINOLoss', eval_idx=-1, o2m=4, o2m_branch=False, num_queries_o2m=450): super(DINOv3Head, self).__init__() self.loss = loss self.eval_idx = eval_idx self.o2m = o2m self.o2m_branch = o2m_branch self.num_queries_o2m = num_queries_o2m def forward(self, out_transformer, body_feats, inputs=None): (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits, dn_meta) = out_transformer if self.training: assert inputs is not None assert 'gt_bbox' in inputs and 'gt_class' in inputs if dn_meta is not None: num_groups = len(dn_meta) total_dec_queries = dec_out_bboxes.shape[2] total_enc_queries = enc_topk_bboxes.shape[1] loss = {} if self.o2m_branch: dec_out_bboxes, dec_out_bboxes_o2m = paddle.split(dec_out_bboxes, [total_dec_queries - self.num_queries_o2m, self.num_queries_o2m], axis=2) dec_out_logits, dec_out_logits_o2m = paddle.split(dec_out_logits, [total_dec_queries - self.num_queries_o2m, self.num_queries_o2m], axis=2) enc_topk_bboxes, enc_topk_bboxes_o2m = paddle.split(enc_topk_bboxes, [total_enc_queries - self.num_queries_o2m, self.num_queries_o2m], axis=1) enc_topk_logits, enc_topk_logits_o2m = paddle.split(enc_topk_logits, [total_enc_queries - self.num_queries_o2m, self.num_queries_o2m], axis=1) out_bboxes_o2m = paddle.concat([enc_topk_bboxes_o2m.unsqueeze(0), dec_out_bboxes_o2m]) out_logits_o2m = paddle.concat([enc_topk_logits_o2m.unsqueeze(0), dec_out_logits_o2m]) loss_o2m = self.loss( out_bboxes_o2m, out_logits_o2m, inputs['gt_bbox'], inputs['gt_class'], dn_out_bboxes=None, dn_out_logits=None, dn_meta=None, o2m=self.o2m) for key, value in 
loss_o2m.items(): key = key + '_o2m_branch' loss.update({ key: loss.get(key, paddle.zeros([1])) + value }) split_dec_num = [sum(dn['dn_num_split']) for dn in dn_meta] split_enc_num = [dn['dn_num_split'][1] for dn in dn_meta] dec_out_bboxes = paddle.split(dec_out_bboxes, split_dec_num, axis=2) dec_out_logits = paddle.split(dec_out_logits, split_dec_num, axis=2) enc_topk_bboxes = paddle.split(enc_topk_bboxes, split_enc_num, axis=1) enc_topk_logits = paddle.split(enc_topk_logits, split_enc_num, axis=1) for g_id in range(num_groups): dn_out_bboxes_gid, dec_out_bboxes_gid = paddle.split( dec_out_bboxes[g_id], dn_meta[g_id]['dn_num_split'], axis=2) dn_out_logits_gid, dec_out_logits_gid = paddle.split( dec_out_logits[g_id], dn_meta[g_id]['dn_num_split'], axis=2) out_bboxes_gid = paddle.concat([ enc_topk_bboxes[g_id].unsqueeze(0), dec_out_bboxes_gid]) out_logits_gid = paddle.concat([ enc_topk_logits[g_id].unsqueeze(0), dec_out_logits_gid]) loss_gid = self.loss( out_bboxes_gid, out_logits_gid, inputs['gt_bbox'], inputs['gt_class'], dn_out_bboxes=dn_out_bboxes_gid, dn_out_logits=dn_out_logits_gid, dn_meta=dn_meta[g_id]) # sum loss for key, value in loss_gid.items(): loss.update({ key: loss.get(key, paddle.zeros([1])) + value }) # average across num_groups for key, value in loss.items(): if '_o2m_branch' not in key: loss.update({key: value / num_groups}) return loss else: dn_out_bboxes, dn_out_logits = None, None out_bboxes = paddle.concat( [enc_topk_bboxes.unsqueeze(0), dec_out_bboxes]) out_logits = paddle.concat( [enc_topk_logits.unsqueeze(0), dec_out_logits]) return self.loss( out_bboxes, out_logits, inputs['gt_bbox'], inputs['gt_class'], dn_out_bboxes=dn_out_bboxes, dn_out_logits=dn_out_logits, dn_meta=dn_meta, gt_score=inputs.get('gt_score', None)) else: return (dec_out_bboxes[self.eval_idx], dec_out_logits[self.eval_idx], None) ================================================ FILE: ppdet/modeling/heads/face_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn from ppdet.core.workspace import register from ..layers import AnchorGeneratorSSD from ..cls_utils import _get_class_default_kwargs @register class FaceHead(nn.Layer): """ Head block for Face detection network Args: num_classes (int): Number of output classes. in_channels (int): Number of input channels. anchor_generator(object): instance of anchor generator method. kernel_size (int): kernel size of Conv2D in FaceHead. padding (int): padding of Conv2D in FaceHead. conv_decay (float): weight decay for conv layer weights. loss (object): loss of face detection model.
""" __shared__ = ['num_classes'] __inject__ = ['anchor_generator', 'loss'] def __init__(self, num_classes=80, in_channels=[96, 96], anchor_generator=_get_class_default_kwargs(AnchorGeneratorSSD), kernel_size=3, padding=1, conv_decay=0., loss='SSDLoss'): super(FaceHead, self).__init__() # add background class self.num_classes = num_classes + 1 self.in_channels = in_channels self.anchor_generator = anchor_generator self.loss = loss if isinstance(anchor_generator, dict): self.anchor_generator = AnchorGeneratorSSD(**anchor_generator) self.num_priors = self.anchor_generator.num_priors self.box_convs = [] self.score_convs = [] for i, num_prior in enumerate(self.num_priors): box_conv_name = "boxes{}".format(i) box_conv = self.add_sublayer( box_conv_name, nn.Conv2D( in_channels=self.in_channels[i], out_channels=num_prior * 4, kernel_size=kernel_size, padding=padding)) self.box_convs.append(box_conv) score_conv_name = "scores{}".format(i) score_conv = self.add_sublayer( score_conv_name, nn.Conv2D( in_channels=self.in_channels[i], out_channels=num_prior * self.num_classes, kernel_size=kernel_size, padding=padding)) self.score_convs.append(score_conv) @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } def forward(self, feats, image, gt_bbox=None, gt_class=None): box_preds = [] cls_scores = [] prior_boxes = [] for feat, box_conv, score_conv in zip(feats, self.box_convs, self.score_convs): box_pred = box_conv(feat) box_pred = paddle.transpose(box_pred, [0, 2, 3, 1]) box_pred = paddle.reshape(box_pred, [0, -1, 4]) box_preds.append(box_pred) cls_score = score_conv(feat) cls_score = paddle.transpose(cls_score, [0, 2, 3, 1]) cls_score = paddle.reshape(cls_score, [0, -1, self.num_classes]) cls_scores.append(cls_score) prior_boxes = self.anchor_generator(feats, image) if self.training: return self.get_loss(box_preds, cls_scores, gt_bbox, gt_class, prior_boxes) else: return (box_preds, cls_scores), prior_boxes def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes): return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes) ================================================ FILE: ppdet/modeling/heads/fcos_head.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Normal, Constant from ppdet.core.workspace import register from ppdet.modeling.layers import ConvNormLayer, MultiClassNMS __all__ = ['FCOSFeat', 'FCOSHead', 'FCOSHead_ARSL'] class ScaleReg(nn.Layer): """ Parameter for scaling the regression outputs. 
""" def __init__(self): super(ScaleReg, self).__init__() self.scale_reg = self.create_parameter( shape=[1], attr=ParamAttr(initializer=Constant(value=1.)), dtype="float32") def forward(self, inputs): out = inputs * self.scale_reg return out @register class FCOSFeat(nn.Layer): """ FCOSFeat of FCOS Args: feat_in (int): The channel number of input Tensor. feat_out (int): The channel number of output Tensor. num_convs (int): The convolution number of the FCOSFeat. norm_type (str): Normalization type, 'bn'/'sync_bn'/'gn'. use_dcn (bool): Whether to use dcn in tower or not. """ def __init__(self, feat_in=256, feat_out=256, num_convs=4, norm_type='bn', use_dcn=False): super(FCOSFeat, self).__init__() self.feat_in = feat_in self.feat_out = feat_out self.num_convs = num_convs self.norm_type = norm_type self.cls_subnet_convs = [] self.reg_subnet_convs = [] for i in range(self.num_convs): in_c = feat_in if i == 0 else feat_out cls_conv_name = 'fcos_head_cls_tower_conv_{}'.format(i) cls_conv = self.add_sublayer( cls_conv_name, ConvNormLayer( ch_in=in_c, ch_out=feat_out, filter_size=3, stride=1, norm_type=norm_type, use_dcn=use_dcn, bias_on=True, lr_scale=2.)) self.cls_subnet_convs.append(cls_conv) reg_conv_name = 'fcos_head_reg_tower_conv_{}'.format(i) reg_conv = self.add_sublayer( reg_conv_name, ConvNormLayer( ch_in=in_c, ch_out=feat_out, filter_size=3, stride=1, norm_type=norm_type, use_dcn=use_dcn, bias_on=True, lr_scale=2.)) self.reg_subnet_convs.append(reg_conv) def forward(self, fpn_feat): cls_feat = fpn_feat reg_feat = fpn_feat for i in range(self.num_convs): cls_feat = F.relu(self.cls_subnet_convs[i](cls_feat)) reg_feat = F.relu(self.reg_subnet_convs[i](reg_feat)) return cls_feat, reg_feat @register class FCOSHead(nn.Layer): """ FCOSHead Args: num_classes (int): Number of classes fcos_feat (object): Instance of 'FCOSFeat' fpn_stride (list): The stride of each FPN Layer prior_prob (float): Used to set the bias init for the class prediction layer norm_reg_targets (bool): Normalization the regression target if true centerness_on_reg (bool): The prediction of centerness on regression or clssification branch num_shift (float): Relative offset between the center of the first shift and the top-left corner of img fcos_loss (object): Instance of 'FCOSLoss' nms (object): Instance of 'MultiClassNMS' trt (bool): Whether to use trt in nms of deploy """ __inject__ = ['fcos_feat', 'fcos_loss', 'nms'] __shared__ = ['num_classes', 'trt'] def __init__(self, num_classes=80, fcos_feat='FCOSFeat', fpn_stride=[8, 16, 32, 64, 128], prior_prob=0.01, multiply_strides_reg_targets=False, norm_reg_targets=True, centerness_on_reg=True, num_shift=0.5, sqrt_score=False, fcos_loss='FCOSLoss', nms='MultiClassNMS', trt=False): super(FCOSHead, self).__init__() self.fcos_feat = fcos_feat self.num_classes = num_classes self.fpn_stride = fpn_stride self.prior_prob = prior_prob self.fcos_loss = fcos_loss self.norm_reg_targets = norm_reg_targets self.centerness_on_reg = centerness_on_reg self.multiply_strides_reg_targets = multiply_strides_reg_targets self.num_shift = num_shift self.nms = nms if isinstance(self.nms, MultiClassNMS) and trt: self.nms.trt = trt self.sqrt_score = sqrt_score self.is_teacher = False conv_cls_name = "fcos_head_cls" bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) self.fcos_head_cls = self.add_sublayer( conv_cls_name, nn.Conv2D( in_channels=256, out_channels=self.num_classes, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), 
bias_attr=ParamAttr( initializer=Constant(value=bias_init_value)))) conv_reg_name = "fcos_head_reg" self.fcos_head_reg = self.add_sublayer( conv_reg_name, nn.Conv2D( in_channels=256, out_channels=4, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) conv_centerness_name = "fcos_head_centerness" self.fcos_head_centerness = self.add_sublayer( conv_centerness_name, nn.Conv2D( in_channels=256, out_channels=1, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) self.scales_regs = [] for i in range(len(self.fpn_stride)): lvl = int(math.log(int(self.fpn_stride[i]), 2)) feat_name = 'p{}_feat'.format(lvl) scale_reg = self.add_sublayer(feat_name, ScaleReg()) self.scales_regs.append(scale_reg) def _compute_locations_by_level(self, fpn_stride, feature, num_shift=0.5): """ Compute locations of anchor points of each FPN layer Args: fpn_stride (int): The stride of current FPN feature map feature (Tensor): Tensor of current FPN feature map Return: Anchor points locations of current FPN feature map """ h, w = feature.shape[2], feature.shape[3] shift_x = paddle.arange(0, w * fpn_stride, fpn_stride) shift_y = paddle.arange(0, h * fpn_stride, fpn_stride) shift_x = paddle.unsqueeze(shift_x, axis=0) shift_y = paddle.unsqueeze(shift_y, axis=1) shift_x = paddle.expand(shift_x, shape=[h, w]) shift_y = paddle.expand(shift_y, shape=[h, w]) shift_x = paddle.reshape(shift_x, shape=[-1]) shift_y = paddle.reshape(shift_y, shape=[-1]) location = paddle.stack( [shift_x, shift_y], axis=-1) + float(fpn_stride * num_shift) return location def forward(self, fpn_feats, targets=None): assert len(fpn_feats) == len( self.fpn_stride ), "The size of fpn_feats is not equal to size of fpn_stride" cls_logits_list = [] bboxes_reg_list = [] centerness_list = [] for scale_reg, fpn_stride, fpn_feat in zip(self.scales_regs, self.fpn_stride, fpn_feats): fcos_cls_feat, fcos_reg_feat = self.fcos_feat(fpn_feat) cls_logits = self.fcos_head_cls(fcos_cls_feat) bbox_reg = scale_reg(self.fcos_head_reg(fcos_reg_feat)) if self.centerness_on_reg: centerness = self.fcos_head_centerness(fcos_reg_feat) else: centerness = self.fcos_head_centerness(fcos_cls_feat) if self.norm_reg_targets: bbox_reg = F.relu(bbox_reg) if self.multiply_strides_reg_targets: bbox_reg = bbox_reg * fpn_stride else: if not self.training or targets.get( 'get_data', False) or targets.get('is_teacher', False): bbox_reg = bbox_reg * fpn_stride else: bbox_reg = paddle.exp(bbox_reg) cls_logits_list.append(cls_logits) bboxes_reg_list.append(bbox_reg) centerness_list.append(centerness) if targets is not None: self.is_teacher = targets.get('is_teacher', False) if self.is_teacher: return [cls_logits_list, bboxes_reg_list, centerness_list] if self.training and targets is not None: get_data = targets.get('get_data', False) if get_data: return [cls_logits_list, bboxes_reg_list, centerness_list] losses = {} fcos_head_outs = [cls_logits_list, bboxes_reg_list, centerness_list] losses_fcos = self.get_loss(fcos_head_outs, targets) losses.update(losses_fcos) total_loss = paddle.add_n(list(losses.values())) losses.update({'loss': total_loss}) return losses else: # eval or infer locations_list = [] for fpn_stride, feature in zip(self.fpn_stride, fpn_feats): location = self._compute_locations_by_level(fpn_stride, feature, self.num_shift) locations_list.append(location) fcos_head_outs = [ locations_list, 
cls_logits_list, bboxes_reg_list, centerness_list ] return fcos_head_outs def get_loss(self, fcos_head_outs, targets): cls_logits, bboxes_reg, centerness = fcos_head_outs # get labels,reg_target,centerness tag_labels, tag_bboxes, tag_centerness = [], [], [] for i in range(len(self.fpn_stride)): k_lbl = 'labels{}'.format(i) if k_lbl in targets: tag_labels.append(targets[k_lbl]) k_box = 'reg_target{}'.format(i) if k_box in targets: tag_bboxes.append(targets[k_box]) k_ctn = 'centerness{}'.format(i) if k_ctn in targets: tag_centerness.append(targets[k_ctn]) losses_fcos = self.fcos_loss(cls_logits, bboxes_reg, centerness, tag_labels, tag_bboxes, tag_centerness) return losses_fcos def _post_process_by_level(self, locations, box_cls, box_reg, box_ctn, sqrt_score=False): box_scores = F.sigmoid(box_cls).flatten(2).transpose([0, 2, 1]) box_centerness = F.sigmoid(box_ctn).flatten(2).transpose([0, 2, 1]) pred_scores = box_scores * box_centerness if sqrt_score: pred_scores = paddle.sqrt(pred_scores) box_reg_ch_last = box_reg.flatten(2).transpose([0, 2, 1]) box_reg_decoding = paddle.stack( [ locations[:, 0] - box_reg_ch_last[:, :, 0], locations[:, 1] - box_reg_ch_last[:, :, 1], locations[:, 0] + box_reg_ch_last[:, :, 2], locations[:, 1] + box_reg_ch_last[:, :, 3] ], axis=1) pred_boxes = box_reg_decoding.transpose([0, 2, 1]) return pred_scores, pred_boxes def post_process(self, fcos_head_outs, scale_factor): locations, cls_logits, bboxes_reg, centerness = fcos_head_outs pred_bboxes, pred_scores = [], [] for pts, cls, reg, ctn in zip(locations, cls_logits, bboxes_reg, centerness): scores, boxes = self._post_process_by_level(pts, cls, reg, ctn, self.sqrt_score) pred_scores.append(scores) pred_bboxes.append(boxes) pred_bboxes = paddle.concat(pred_bboxes, axis=1) pred_scores = paddle.concat(pred_scores, axis=1) # scale bbox to origin scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) scale_factor = paddle.concat( [scale_x, scale_y, scale_x, scale_y], axis=-1).reshape([-1, 1, 4]) pred_bboxes /= scale_factor pred_scores = pred_scores.transpose([0, 2, 1]) bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) return bbox_pred, bbox_num @register class FCOSHead_ARSL(FCOSHead): """ FCOSHead of ARSL for semi-supervised detection (SSOD) Args: fcos_feat (object): Instance of 'FCOSFeat' num_classes (int): Number of classes fpn_stride (list): The stride of each FPN Layer prior_prob (float): Used to set the bias init for the class prediction layer fcos_loss (object): Instance of 'FCOSLoss' norm_reg_targets (bool): Normalize the regression targets if true centerness_on_reg (bool): The prediction of centerness on regression or classification branch nms (object): Instance of 'MultiClassNMS' trt (bool): Whether to use trt in nms of deploy """ __inject__ = ['fcos_feat', 'fcos_loss', 'nms'] __shared__ = ['num_classes', 'trt'] def __init__(self, num_classes=80, fcos_feat='FCOSFeat', fpn_stride=[8, 16, 32, 64, 128], prior_prob=0.01, multiply_strides_reg_targets=False, norm_reg_targets=True, centerness_on_reg=True, num_shift=0.5, sqrt_score=False, fcos_loss='FCOSLossMILC', nms='MultiClassNMS', trt=False): super(FCOSHead_ARSL, self).__init__() self.fcos_feat = fcos_feat self.num_classes = num_classes self.fpn_stride = fpn_stride self.prior_prob = prior_prob self.fcos_loss = fcos_loss self.norm_reg_targets = norm_reg_targets self.centerness_on_reg = centerness_on_reg self.multiply_strides_reg_targets = multiply_strides_reg_targets self.num_shift = num_shift self.nms = nms if isinstance(self.nms, MultiClassNMS) and trt: self.nms.trt =
trt self.sqrt_score = sqrt_score conv_cls_name = "fcos_head_cls" bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) self.fcos_head_cls = self.add_sublayer( conv_cls_name, nn.Conv2D( in_channels=256, out_channels=self.num_classes, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr( initializer=Constant(value=bias_init_value)))) conv_reg_name = "fcos_head_reg" self.fcos_head_reg = self.add_sublayer( conv_reg_name, nn.Conv2D( in_channels=256, out_channels=4, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) conv_centerness_name = "fcos_head_centerness" self.fcos_head_centerness = self.add_sublayer( conv_centerness_name, nn.Conv2D( in_channels=256, out_channels=1, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) self.scales_regs = [] for i in range(len(self.fpn_stride)): lvl = int(math.log(int(self.fpn_stride[i]), 2)) feat_name = 'p{}_feat'.format(lvl) scale_reg = self.add_sublayer(feat_name, ScaleReg()) self.scales_regs.append(scale_reg) def forward(self, fpn_feats, targets=None): assert len(fpn_feats) == len( self.fpn_stride ), "The size of fpn_feats is not equal to size of fpn_stride" cls_logits_list = [] bboxes_reg_list = [] centerness_list = [] for scale_reg, fpn_stride, fpn_feat in zip(self.scales_regs, self.fpn_stride, fpn_feats): fcos_cls_feat, fcos_reg_feat = self.fcos_feat(fpn_feat) cls_logits = self.fcos_head_cls(fcos_cls_feat) bbox_reg = scale_reg(self.fcos_head_reg(fcos_reg_feat)) if self.centerness_on_reg: centerness = self.fcos_head_centerness(fcos_reg_feat) else: centerness = self.fcos_head_centerness(fcos_cls_feat) if self.norm_reg_targets: bbox_reg = F.relu(bbox_reg) if not self.training: bbox_reg = bbox_reg * fpn_stride else: bbox_reg = paddle.exp(bbox_reg) cls_logits_list.append(cls_logits) bboxes_reg_list.append(bbox_reg) centerness_list.append(centerness) if not self.training: locations_list = [] for fpn_stride, feature in zip(self.fpn_stride, fpn_feats): location = self._compute_locations_by_level(fpn_stride, feature) locations_list.append(location) return locations_list, cls_logits_list, bboxes_reg_list, centerness_list else: return cls_logits_list, bboxes_reg_list, centerness_list def get_loss(self, fcos_head_outs, tag_labels, tag_bboxes, tag_centerness): cls_logits, bboxes_reg, centerness = fcos_head_outs return self.fcos_loss(cls_logits, bboxes_reg, centerness, tag_labels, tag_bboxes, tag_centerness) ================================================ FILE: ppdet/modeling/heads/fcosr_head.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
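# The FCOS-style heads above decode a box from each grid point and its
# predicted (l, t, r, b) distances: (x1, y1, x2, y2) = (cx - l, cy - t,
# cx + r, cy + b), with points placed on a stride grid shifted by
# stride * num_shift. A minimal NumPy sketch under those assumptions
# (names are illustrative, not part of this repository):
import numpy as np

def toy_fcos_points(h, w, stride, num_shift=0.5):
    # mirrors _compute_locations_by_level above
    xx, yy = np.meshgrid(np.arange(w) * stride, np.arange(h) * stride)
    return np.stack([xx.ravel(), yy.ravel()], axis=-1) + stride * num_shift

def toy_decode_ltrb(points, ltrb):
    # points: [N, 2] centers; ltrb: [N, 4] distances -> [N, 4] x1y1x2y2 boxes
    return np.concatenate([points - ltrb[:, :2], points + ltrb[:, 2:]], axis=1)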
import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from paddle import ParamAttr from paddle.regularizer import L2Decay from .fcos_head import ScaleReg from ..initializer import bias_init_with_prob, constant_, normal_ from ..ops import get_act_fn, anchor_generator from ..rbox_utils import box2corners from ..losses import ProbIoULoss import numpy as np __all__ = ['FCOSRHead'] def trunc_div(a, b): ipt = paddle.divide(a, b) sign_ipt = paddle.sign(ipt) abs_ipt = paddle.abs(ipt) abs_ipt = paddle.floor(abs_ipt) out = paddle.multiply(sign_ipt, abs_ipt) return out def fmod(a, b): return a - trunc_div(a, b) * b def fmod_eval(a, b): return a - a.divide(b).cast(paddle.int32).cast(paddle.float32) * b class ConvBNLayer(nn.Layer): def __init__(self, ch_in, ch_out, filter_size=3, stride=1, groups=1, padding=0, norm_cfg={'name': 'gn', 'num_groups': 32}, act=None): super(ConvBNLayer, self).__init__() self.conv = nn.Conv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=padding, groups=groups, bias_attr=False) norm_type = norm_cfg['name'] if norm_type in ['sync_bn', 'bn']: self.norm = nn.BatchNorm2D( ch_out, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) else: groups = norm_cfg.get('num_groups', 1) self.norm = nn.GroupNorm( num_groups=groups, num_channels=ch_out, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self.act = get_act_fn(act) if act is None or isinstance(act, ( str, dict)) else act def forward(self, x): x = self.conv(x) x = self.norm(x) x = self.act(x) return x @register class FCOSRHead(nn.Layer): """ FCOSR Head, refer to https://arxiv.org/abs/2111.10780 for details """ __shared__ = ['num_classes', 'trt'] __inject__ = ['assigner', 'nms'] def __init__(self, num_classes=15, in_channels=256, feat_channels=256, stacked_convs=4, act='relu', fpn_strides=[4, 8, 16, 32, 64], trt=False, loss_weight={'class': 1.0, 'probiou': 1.0}, norm_cfg={'name': 'gn', 'num_groups': 32}, assigner='FCOSRAssigner', nms='MultiClassNMS'): super(FCOSRHead, self).__init__() self.in_channels = in_channels self.num_classes = num_classes self.fpn_strides = fpn_strides self.stacked_convs = stacked_convs self.loss_weight = loss_weight self.half_pi = paddle.to_tensor( [1.5707963267948966], dtype=paddle.float32) self.probiou_loss = ProbIoULoss(mode='l1') act = get_act_fn( act, trt=trt) if act is None or isinstance(act, (str, dict)) else act self.trt = trt self.loss_weight = loss_weight self.assigner = assigner self.nms = nms # stem self.stem_cls = nn.LayerList() self.stem_reg = nn.LayerList() for i in range(self.stacked_convs): self.stem_cls.append( ConvBNLayer( self.in_channels[i], feat_channels, filter_size=3, stride=1, padding=1, norm_cfg=norm_cfg, act=act)) self.stem_reg.append( ConvBNLayer( self.in_channels[i], feat_channels, filter_size=3, stride=1, padding=1, norm_cfg=norm_cfg, act=act)) self.scales = nn.LayerList( [ScaleReg() for _ in range(len(fpn_strides))]) # prediction self.pred_cls = nn.Conv2D(feat_channels, self.num_classes, 3, padding=1) self.pred_xy = nn.Conv2D(feat_channels, 2, 3, padding=1) self.pred_wh = nn.Conv2D(feat_channels, 2, 3, padding=1) self.pred_angle = nn.Conv2D(feat_channels, 1, 3, padding=1) self._init_weights() def _init_weights(self): for cls_, reg_ in zip(self.stem_cls, self.stem_reg): normal_(cls_.conv.weight, std=0.01) normal_(reg_.conv.weight, std=0.01) bias_cls = bias_init_with_prob(0.01) 
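# bias_init_with_prob applies the focal-loss prior trick: the classification
# bias is initialised to -log((1 - p) / p) with p = 0.01, so initial sigmoid
# scores sit near p and training is not swamped early on by the overwhelming
# number of background locations (the FCOS/GFL heads above compute the same
# quantity inline as bias_init_value).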
normal_(self.pred_cls.weight, std=0.01) constant_(self.pred_cls.bias, bias_cls) normal_(self.pred_xy.weight, std=0.01) normal_(self.pred_wh.weight, std=0.01) normal_(self.pred_angle.weight, std=0.01) @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } def _generate_anchors(self, feats): if self.trt: anchor_points = [] for feat, stride in zip(feats, self.fpn_strides): _, _, h, w = feat.shape anchor, _ = anchor_generator( feat, stride * 4, 1.0, [1.0, 1.0, 1.0, 1.0], [stride, stride], offset=0.5) x1, y1, x2, y2 = paddle.split(anchor, 4, axis=-1) xc = (x1 + x2 + 1) / 2 yc = (y1 + y2 + 1) / 2 anchor_point = paddle.concat( [xc, yc], axis=-1).reshape((1, h * w, 2)) anchor_points.append(anchor_point) anchor_points = paddle.concat(anchor_points, axis=1) return anchor_points, None, None else: anchor_points = [] stride_tensor = [] num_anchors_list = [] for feat, stride in zip(feats, self.fpn_strides): _, _, h, w = feat.shape shift_x = (paddle.arange(end=w) + 0.5) * stride shift_y = (paddle.arange(end=h) + 0.5) * stride shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) anchor_point = paddle.cast( paddle.stack( [shift_x, shift_y], axis=-1), dtype='float32') anchor_points.append(anchor_point.reshape([1, -1, 2])) stride_tensor.append( paddle.full( [1, h * w, 1], stride, dtype='float32')) num_anchors_list.append(h * w) anchor_points = paddle.concat(anchor_points, axis=1) stride_tensor = paddle.concat(stride_tensor, axis=1) return anchor_points, stride_tensor, num_anchors_list def forward(self, feats, target=None): if self.training: return self.forward_train(feats, target) else: return self.forward_eval(feats, target) def forward_train(self, feats, target=None): anchor_points, stride_tensor, num_anchors_list = self._generate_anchors( feats) cls_pred_list, reg_pred_list = [], [] for stride, feat, scale in zip(self.fpn_strides, feats, self.scales): # cls cls_feat = feat for cls_layer in self.stem_cls: cls_feat = cls_layer(cls_feat) cls_pred = F.sigmoid(self.pred_cls(cls_feat)) cls_pred_list.append(cls_pred.flatten(2).transpose((0, 2, 1))) # reg reg_feat = feat for reg_layer in self.stem_reg: reg_feat = reg_layer(reg_feat) reg_xy = scale(self.pred_xy(reg_feat)) * stride reg_wh = F.elu(scale(self.pred_wh(reg_feat)) + 1.) * stride reg_angle = self.pred_angle(reg_feat) reg_angle = fmod(reg_angle, self.half_pi) reg_pred = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1) reg_pred_list.append(reg_pred.flatten(2).transpose((0, 2, 1))) cls_pred_list = paddle.concat(cls_pred_list, axis=1) reg_pred_list = paddle.concat(reg_pred_list, axis=1) return self.get_loss([ cls_pred_list, reg_pred_list, anchor_points, stride_tensor, num_anchors_list ], target) def forward_eval(self, feats, target=None): cls_pred_list, reg_pred_list = [], [] anchor_points, _, _ = self._generate_anchors(feats) for stride, feat, scale in zip(self.fpn_strides, feats, self.scales): b, _, h, w = feat.shape # cls cls_feat = feat for cls_layer in self.stem_cls: cls_feat = cls_layer(cls_feat) cls_pred = F.sigmoid(self.pred_cls(cls_feat)) cls_pred_list.append(cls_pred.reshape([b, self.num_classes, h * w])) # reg reg_feat = feat for reg_layer in self.stem_reg: reg_feat = reg_layer(reg_feat) reg_xy = scale(self.pred_xy(reg_feat)) * stride reg_wh = F.elu(scale(self.pred_wh(reg_feat)) + 1.) 
* stride reg_angle = self.pred_angle(reg_feat) reg_angle = fmod_eval(reg_angle, self.half_pi) reg_pred = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1) reg_pred = reg_pred.reshape([b, 5, h * w]).transpose((0, 2, 1)) reg_pred_list.append(reg_pred) cls_pred_list = paddle.concat(cls_pred_list, axis=2) reg_pred_list = paddle.concat(reg_pred_list, axis=1) reg_pred_list = self._bbox_decode(anchor_points, reg_pred_list) return cls_pred_list, reg_pred_list def _bbox_decode(self, points, reg_pred_list): xy, wha = paddle.split(reg_pred_list, [2, 3], axis=-1) xy = xy + points return paddle.concat([xy, wha], axis=-1) def _box2corners(self, pred_bboxes): """ convert (x, y, w, h, angle) to (x1, y1, x2, y2, x3, y3, x4, y4) Args: pred_bboxes (Tensor): [B, N, 5] Returns: polys (Tensor): [B, N, 8] """ x, y, w, h, angle = paddle.split(pred_bboxes, 5, axis=-1) cos_a_half = paddle.cos(angle) * 0.5 sin_a_half = paddle.sin(angle) * 0.5 w_x = cos_a_half * w w_y = sin_a_half * w h_x = -sin_a_half * h h_y = cos_a_half * h return paddle.concat( [ x + w_x + h_x, y + w_y + h_y, x - w_x + h_x, y - w_y + h_y, x - w_x - h_x, y - w_y - h_y, x + w_x - h_x, y + w_y - h_y ], axis=-1) def get_loss(self, head_outs, gt_meta): cls_pred_list, reg_pred_list, anchor_points, stride_tensor, num_anchors_list = head_outs gt_labels = gt_meta['gt_class'] gt_bboxes = gt_meta['gt_bbox'] gt_rboxes = gt_meta['gt_rbox'] pad_gt_mask = gt_meta['pad_gt_mask'] # decode pred_rboxes = self._bbox_decode(anchor_points, reg_pred_list) # label assignment assigned_labels, assigned_rboxes, assigned_scores = \ self.assigner( anchor_points, stride_tensor, num_anchors_list, gt_labels, gt_bboxes, gt_rboxes, pad_gt_mask, self.num_classes, pred_rboxes ) # reg_loss mask_positive = (assigned_labels != self.num_classes) num_pos = mask_positive.sum().item() if num_pos > 0: bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 5]) pred_rboxes_pos = paddle.masked_select(pred_rboxes, bbox_mask).reshape([-1, 5]) assigned_rboxes_pos = paddle.masked_select( assigned_rboxes, bbox_mask).reshape([-1, 5]) bbox_weight = paddle.masked_select( assigned_scores.sum(-1), mask_positive).reshape([-1]) avg_factor = bbox_weight.sum() loss_probiou = self.probiou_loss(pred_rboxes_pos, assigned_rboxes_pos) loss_probiou = paddle.sum(loss_probiou * bbox_weight) / avg_factor else: loss_probiou = pred_rboxes.sum() * 0. 
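# When a batch has no positive assignments, the ProbIoU loss is written as
# `pred_rboxes.sum() * 0.` rather than a plain constant: the zero stays
# connected to the network outputs, so every parameter still receives a
# (zero) gradient and gradient synchronization stays consistent across
# ranks in distributed training.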
avg_factor = max(num_pos, 1.0) # cls_loss loss_cls = self._qfocal_loss( cls_pred_list, assigned_scores, reduction='sum') loss_cls = loss_cls / avg_factor loss = self.loss_weight['class'] * loss_cls + \ self.loss_weight['probiou'] * loss_probiou out_dict = { 'loss': loss, 'loss_probiou': loss_probiou, 'loss_cls': loss_cls } return out_dict @staticmethod def _qfocal_loss(score, label, gamma=2.0, reduction='sum'): weight = (score - label).pow(gamma) loss = F.binary_cross_entropy( score, label, weight=weight, reduction=reduction) return loss def post_process(self, head_outs, scale_factor): pred_scores, pred_rboxes = head_outs # [B, N, 5] -> [B, N, 4, 2] -> [B, N, 8] pred_rboxes = self._box2corners(pred_rboxes) # scale bbox to origin scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) scale_factor = paddle.concat( [ scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x, scale_y ], axis=-1).reshape([-1, 1, 8]) pred_rboxes /= scale_factor bbox_pred, bbox_num, before_nms_indexes = self.nms(pred_rboxes, pred_scores) return bbox_pred, bbox_num, before_nms_indexes ================================================ FILE: ppdet/modeling/heads/gfl_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The code is based on: # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/gfl_head.py from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Normal, Constant from ppdet.core.workspace import register from ppdet.modeling.bbox_utils import distance2bbox, bbox2distance, batch_distance2bbox from ppdet.data.transform.atss_assigner import bbox_overlaps __all__ = ['GFLHead', 'LDGFLHead'] class ScaleReg(nn.Layer): """ Parameter for scaling the regression outputs. """ def __init__(self): super(ScaleReg, self).__init__() self.scale_reg = self.create_parameter( shape=[1], attr=ParamAttr(initializer=Constant(value=1.)), dtype="float32") def forward(self, inputs): out = inputs * self.scale_reg return out class Integral(nn.Layer): """A fixed layer for calculating integral result from distribution. This layer calculates the target location by :math: `sum{P(y_i) * y_i}`, P(y_i) denotes the softmax vector that represents the discrete distribution y_i denotes the discrete set, usually {0, 1, 2, ..., reg_max} Args: reg_max (int): The maximal value of the discrete set. Default: 16. You may want to reset it according to your new dataset or related settings. """ def __init__(self, reg_max=16): super(Integral, self).__init__() self.reg_max = reg_max self.register_buffer('project', paddle.linspace(0, self.reg_max, self.reg_max + 1)) def forward(self, x): """Forward feature from the regression head to get integral result of bounding box location. 
Args: x (Tensor): Features of the regression head, shape (N, 4*(n+1)), n is self.reg_max. Returns: x (Tensor): Integral result of box locations, i.e., distance offsets from the box center in four directions, shape (N, 4). """ x = F.softmax(x.reshape([-1, self.reg_max + 1]), axis=1) x = F.linear(x, self.project) if self.training: x = x.reshape([-1, 4]) return x @register class DGQP(nn.Layer): """Distribution-Guided Quality Predictor of GFocal head Args: reg_topk (int): top-k statistics of distribution to guide LQE reg_channels (int): hidden layer unit to generate LQE add_mean (bool): Whether to calculate the mean of top-k statistics """ def __init__(self, reg_topk=4, reg_channels=64, add_mean=True): super(DGQP, self).__init__() self.reg_topk = reg_topk self.reg_channels = reg_channels self.add_mean = add_mean self.total_dim = reg_topk if add_mean: self.total_dim += 1 self.reg_conv1 = self.add_sublayer( 'dgqp_reg_conv1', nn.Conv2D( in_channels=4 * self.total_dim, out_channels=self.reg_channels, kernel_size=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) self.reg_conv2 = self.add_sublayer( 'dgqp_reg_conv2', nn.Conv2D( in_channels=self.reg_channels, out_channels=1, kernel_size=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) def forward(self, x): """Predict localization quality from statistics of the regression distribution. Args: x (Tensor): Regression distribution logits, shape (N, 4*(n+1), H, W), n is self.reg_max. Returns: y (Tensor): Quality score in [0, 1] for each location, shape (N, 1, H, W). """ N, _, H, W = x.shape[:] prob = F.softmax(x.reshape([N, 4, -1, H, W]), axis=2) prob_topk, _ = prob.topk(self.reg_topk, axis=2) if self.add_mean: stat = paddle.concat( [prob_topk, prob_topk.mean( axis=2, keepdim=True)], axis=2) else: stat = prob_topk y = F.relu(self.reg_conv1(stat.reshape([N, 4 * self.total_dim, H, W]))) y = F.sigmoid(self.reg_conv2(y)) return y @register class GFLHead(nn.Layer): """ GFLHead Args: conv_feat (object): Instance of 'FCOSFeat' num_classes (int): Number of classes fpn_stride (list): The stride of each FPN Layer prior_prob (float): Used to set the bias init for the class prediction layer loss_class (object): Instance of QualityFocalLoss. loss_dfl (object): Instance of DistributionFocalLoss. loss_bbox (object): Instance of bbox loss. reg_max: Max value of integral set :math: `{0, ..., reg_max}` in QFL setting. Default: 16.
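(With reg_max = 16 the regression branch predicts a 17-bin discrete distribution for each side of the box; the Integral layer above converts it to a continuous offset as sum_i softmax(p)_i * i.)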
""" __inject__ = [ 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', 'nms' ] __shared__ = ['num_classes'] def __init__(self, conv_feat='FCOSFeat', dgqp_module=None, num_classes=80, fpn_stride=[8, 16, 32, 64, 128], prior_prob=0.01, loss_class='QualityFocalLoss', loss_dfl='DistributionFocalLoss', loss_bbox='GIoULoss', reg_max=16, feat_in_chan=256, nms=None, nms_pre=1000, cell_offset=0): super(GFLHead, self).__init__() self.conv_feat = conv_feat self.dgqp_module = dgqp_module self.num_classes = num_classes self.fpn_stride = fpn_stride self.prior_prob = prior_prob self.loss_qfl = loss_class self.loss_dfl = loss_dfl self.loss_bbox = loss_bbox self.reg_max = reg_max self.feat_in_chan = feat_in_chan self.nms = nms self.nms_pre = nms_pre self.cell_offset = cell_offset self.use_sigmoid = self.loss_qfl.use_sigmoid if self.use_sigmoid: self.cls_out_channels = self.num_classes else: self.cls_out_channels = self.num_classes + 1 conv_cls_name = "gfl_head_cls" bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) self.gfl_head_cls = self.add_sublayer( conv_cls_name, nn.Conv2D( in_channels=self.feat_in_chan, out_channels=self.cls_out_channels, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr( initializer=Constant(value=bias_init_value)))) conv_reg_name = "gfl_head_reg" self.gfl_head_reg = self.add_sublayer( conv_reg_name, nn.Conv2D( in_channels=self.feat_in_chan, out_channels=4 * (self.reg_max + 1), kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) self.scales_regs = [] for i in range(len(self.fpn_stride)): lvl = int(math.log(int(self.fpn_stride[i]), 2)) feat_name = 'p{}_feat'.format(lvl) scale_reg = self.add_sublayer(feat_name, ScaleReg()) self.scales_regs.append(scale_reg) self.distribution_project = Integral(self.reg_max) def forward(self, fpn_feats): assert len(fpn_feats) == len( self.fpn_stride ), "The size of fpn_feats is not equal to size of fpn_stride" cls_logits_list = [] bboxes_reg_list = [] for stride, scale_reg, fpn_feat in zip(self.fpn_stride, self.scales_regs, fpn_feats): conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat) cls_score = self.gfl_head_cls(conv_cls_feat) bbox_pred = scale_reg(self.gfl_head_reg(conv_reg_feat)) if self.dgqp_module: quality_score = self.dgqp_module(bbox_pred) cls_score = F.sigmoid(cls_score) * quality_score if not self.training: cls_score = F.sigmoid(cls_score.transpose([0, 2, 3, 1])) bbox_pred = bbox_pred.transpose([0, 2, 3, 1]) b, cell_h, cell_w, _ = cls_score.shape y, x = self.get_single_level_center_point( [cell_h, cell_w], stride, cell_offset=self.cell_offset) center_points = paddle.stack([x, y], axis=-1) cls_score = cls_score.reshape([b, -1, self.cls_out_channels]) bbox_pred = self.distribution_project(bbox_pred) * stride bbox_pred = bbox_pred.reshape([-1, cell_h * cell_w, 4]) # NOTE: If keep_ratio=False and image shape value that # multiples of 32, distance2bbox not set max_shapes parameter # to speed up model prediction. If need to set max_shapes, # please use inputs['im_shape']. bbox_pred = batch_distance2bbox( center_points, bbox_pred, max_shapes=None) cls_logits_list.append(cls_score) bboxes_reg_list.append(bbox_pred) return (cls_logits_list, bboxes_reg_list) def _images_to_levels(self, target, num_level_anchors): """ Convert targets by image to targets by feature level. 
""" level_targets = [] start = 0 for n in num_level_anchors: end = start + n level_targets.append(target[:, start:end].squeeze(0)) start = end return level_targets def _grid_cells_to_center(self, grid_cells): """ Get center location of each gird cell Args: grid_cells: grid cells of a feature map Returns: center points """ cells_cx = (grid_cells[:, 2] + grid_cells[:, 0]) / 2 cells_cy = (grid_cells[:, 3] + grid_cells[:, 1]) / 2 return paddle.stack([cells_cx, cells_cy], axis=-1) def get_loss(self, gfl_head_outs, gt_meta): cls_logits, bboxes_reg = gfl_head_outs num_level_anchors = [ featmap.shape[-2] * featmap.shape[-1] for featmap in cls_logits ] grid_cells_list = self._images_to_levels(gt_meta['grid_cells'], num_level_anchors) labels_list = self._images_to_levels(gt_meta['labels'], num_level_anchors) label_weights_list = self._images_to_levels(gt_meta['label_weights'], num_level_anchors) bbox_targets_list = self._images_to_levels(gt_meta['bbox_targets'], num_level_anchors) num_total_pos = sum(gt_meta['pos_num']) try: paddle.distributed.all_reduce(num_total_pos) num_total_pos = paddle.clip( num_total_pos / paddle.distributed.get_world_size(), min=1) except: num_total_pos = max(num_total_pos, 1) loss_bbox_list, loss_dfl_list, loss_qfl_list, avg_factor = [], [], [], [] for cls_score, bbox_pred, grid_cells, labels, label_weights, bbox_targets, stride in zip( cls_logits, bboxes_reg, grid_cells_list, labels_list, label_weights_list, bbox_targets_list, self.fpn_stride): grid_cells = grid_cells.reshape([-1, 4]) cls_score = cls_score.transpose([0, 2, 3, 1]).reshape( [-1, self.cls_out_channels]) bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( [-1, 4 * (self.reg_max + 1)]) bbox_targets = bbox_targets.reshape([-1, 4]) labels = labels.reshape([-1]) label_weights = label_weights.reshape([-1]) bg_class_ind = self.num_classes pos_inds = paddle.nonzero( paddle.logical_and((labels >= 0), (labels < bg_class_ind)), as_tuple=False).squeeze(1) score = np.zeros(labels.shape) if len(pos_inds) > 0: pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0) pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0) pos_grid_cells = paddle.gather(grid_cells, pos_inds, axis=0) pos_grid_cell_centers = self._grid_cells_to_center( pos_grid_cells) / stride weight_targets = F.sigmoid(cls_score.detach()) weight_targets = paddle.gather( weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) pos_decode_bbox_pred = distance2bbox(pos_grid_cell_centers, pos_bbox_pred_corners) pos_decode_bbox_targets = pos_bbox_targets / stride bbox_iou = bbox_overlaps( pos_decode_bbox_pred.detach().numpy(), pos_decode_bbox_targets.detach().numpy(), is_aligned=True) score[pos_inds.numpy()] = bbox_iou pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1]) target_corners = bbox2distance(pos_grid_cell_centers, pos_decode_bbox_targets, self.reg_max).reshape([-1]) # regression loss loss_bbox = paddle.sum( self.loss_bbox(pos_decode_bbox_pred, pos_decode_bbox_targets) * weight_targets) # dfl loss loss_dfl = self.loss_dfl( pred_corners, target_corners, weight=weight_targets.expand([-1, 4]).reshape([-1]), avg_factor=4.0) else: loss_bbox = bbox_pred.sum() * 0 loss_dfl = bbox_pred.sum() * 0 weight_targets = paddle.to_tensor([0], dtype='float32') # qfl loss score = paddle.to_tensor(score) loss_qfl = self.loss_qfl( cls_score, (labels, score), weight=label_weights, avg_factor=num_total_pos) loss_bbox_list.append(loss_bbox) loss_dfl_list.append(loss_dfl) 
loss_qfl_list.append(loss_qfl) avg_factor.append(weight_targets.sum()) avg_factor = sum(avg_factor) try: paddle.distributed.all_reduce(avg_factor) avg_factor = paddle.clip( avg_factor / paddle.distributed.get_world_size(), min=1) except: avg_factor = max(avg_factor.item(), 1) if avg_factor <= 0: loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) loss_bbox = paddle.to_tensor( 0, dtype='float32', stop_gradient=False) loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) else: losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list)) losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list)) loss_qfl = sum(loss_qfl_list) loss_bbox = sum(losses_bbox) loss_dfl = sum(losses_dfl) loss_states = dict( loss_qfl=loss_qfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl) return loss_states def get_single_level_center_point(self, featmap_size, stride, cell_offset=0): """ Generate pixel centers of a single stage feature map. Args: featmap_size: height and width of the feature map stride: down sample stride of the feature map Returns: y and x of the center points """ h, w = featmap_size x_range = (paddle.arange(w, dtype='float32') + cell_offset) * stride y_range = (paddle.arange(h, dtype='float32') + cell_offset) * stride y, x = paddle.meshgrid(y_range, x_range) y = y.flatten() x = x.flatten() return y, x def post_process(self, gfl_head_outs, im_shape, scale_factor): cls_scores, bboxes_reg = gfl_head_outs bboxes = paddle.concat(bboxes_reg, axis=1) # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale] im_scale = scale_factor.flip([1]).tile([1, 2]).unsqueeze(1) bboxes /= im_scale mlvl_scores = paddle.concat(cls_scores, axis=1) mlvl_scores = mlvl_scores.transpose([0, 2, 1]) bbox_pred, bbox_num, _ = self.nms(bboxes, mlvl_scores) return bbox_pred, bbox_num @register class LDGFLHead(GFLHead): """ GFLHead for LD (localization distillation) Args: conv_feat (object): Instance of 'FCOSFeat' num_classes (int): Number of classes fpn_stride (list): The stride of each FPN Layer prior_prob (float): Used to set the bias init for the class prediction layer loss_class (object): Instance of QualityFocalLoss. loss_dfl (object): Instance of DistributionFocalLoss. loss_bbox (object): Instance of bbox loss. reg_max: Max value of integral set :math: `{0, ..., reg_max}` in QFL setting. Default: 16.
""" __inject__ = [ 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', 'loss_ld', 'loss_ld_vlr', 'loss_kd', 'nms' ] __shared__ = ['num_classes'] def __init__(self, conv_feat='FCOSFeat', dgqp_module=None, num_classes=80, fpn_stride=[8, 16, 32, 64, 128], prior_prob=0.01, loss_class='QualityFocalLoss', loss_dfl='DistributionFocalLoss', loss_bbox='GIoULoss', loss_ld='KnowledgeDistillationKLDivLoss', loss_ld_vlr='KnowledgeDistillationKLDivLoss', loss_kd='KnowledgeDistillationKLDivLoss', reg_max=16, feat_in_chan=256, nms=None, nms_pre=1000, cell_offset=0): super(LDGFLHead, self).__init__( conv_feat=conv_feat, dgqp_module=dgqp_module, num_classes=num_classes, fpn_stride=fpn_stride, prior_prob=prior_prob, loss_class=loss_class, loss_dfl=loss_dfl, loss_bbox=loss_bbox, reg_max=reg_max, feat_in_chan=feat_in_chan, nms=nms, nms_pre=nms_pre, cell_offset=cell_offset) self.loss_ld = loss_ld self.loss_kd = loss_kd self.loss_ld_vlr = loss_ld_vlr def forward(self, fpn_feats): assert len(fpn_feats) == len( self.fpn_stride ), "The size of fpn_feats is not equal to size of fpn_stride" cls_logits_list = [] bboxes_reg_list = [] for stride, scale_reg, fpn_feat in zip(self.fpn_stride, self.scales_regs, fpn_feats): conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat) cls_score = self.gfl_head_cls(conv_cls_feat) bbox_pred = scale_reg(self.gfl_head_reg(conv_reg_feat)) if self.dgqp_module: quality_score = self.dgqp_module(bbox_pred) cls_score = F.sigmoid(cls_score) * quality_score if not self.training: cls_score = F.sigmoid(cls_score.transpose([0, 2, 3, 1])) bbox_pred = bbox_pred.transpose([0, 2, 3, 1]) b, cell_h, cell_w, _ = cls_score.shape y, x = self.get_single_level_center_point( [cell_h, cell_w], stride, cell_offset=self.cell_offset) center_points = paddle.stack([x, y], axis=-1) cls_score = cls_score.reshape([b, -1, self.cls_out_channels]) bbox_pred = self.distribution_project(bbox_pred) * stride bbox_pred = bbox_pred.reshape([b, cell_h * cell_w, 4]) # NOTE: If keep_ratio=False and image shape value that # multiples of 32, distance2bbox not set max_shapes parameter # to speed up model prediction. If need to set max_shapes, # please use inputs['im_shape']. bbox_pred = batch_distance2bbox( center_points, bbox_pred, max_shapes=None) cls_logits_list.append(cls_score) bboxes_reg_list.append(bbox_pred) return (cls_logits_list, bboxes_reg_list) def get_loss(self, gfl_head_outs, gt_meta, soft_label_list, soft_targets_list): cls_logits, bboxes_reg = gfl_head_outs num_level_anchors = [ featmap.shape[-2] * featmap.shape[-1] for featmap in cls_logits ] grid_cells_list = self._images_to_levels(gt_meta['grid_cells'], num_level_anchors) labels_list = self._images_to_levels(gt_meta['labels'], num_level_anchors) label_weights_list = self._images_to_levels(gt_meta['label_weights'], num_level_anchors) bbox_targets_list = self._images_to_levels(gt_meta['bbox_targets'], num_level_anchors) # vlr regions vlr_regions_list = self._images_to_levels(gt_meta['vlr_regions'], num_level_anchors) num_total_pos = sum(gt_meta['pos_num']) try: paddle.distributed.all_reduce(num_total_pos) num_total_pos = paddle.clip( num_total_pos / paddle.distributed.get_world_size(), min=1.) 
except: num_total_pos = max(num_total_pos, 1) loss_bbox_list, loss_dfl_list, loss_qfl_list, loss_ld_list, avg_factor = [], [], [], [], [] loss_ld_vlr_list, loss_kd_list = [], [] for cls_score, bbox_pred, grid_cells, labels, label_weights, bbox_targets, stride, soft_targets,\ soft_label, vlr_region in zip( cls_logits, bboxes_reg, grid_cells_list, labels_list, label_weights_list, bbox_targets_list, self.fpn_stride, soft_targets_list, soft_label_list, vlr_regions_list): grid_cells = grid_cells.reshape([-1, 4]) cls_score = cls_score.transpose([0, 2, 3, 1]).reshape( [-1, self.cls_out_channels]) bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( [-1, 4 * (self.reg_max + 1)]) soft_targets = soft_targets.transpose([0, 2, 3, 1]).reshape( [-1, 4 * (self.reg_max + 1)]) soft_label = soft_label.transpose([0, 2, 3, 1]).reshape( [-1, self.cls_out_channels]) # feture im # teacher_x = teacher_x.transpose([0, 2, 3, 1]).reshape([-1, 256]) # x = x.transpose([0, 2, 3, 1]).reshape([-1, 256]) bbox_targets = bbox_targets.reshape([-1, 4]) labels = labels.reshape([-1]) label_weights = label_weights.reshape([-1]) vlr_region = vlr_region.reshape([-1]) bg_class_ind = self.num_classes pos_inds = paddle.nonzero( paddle.logical_and((labels >= 0), (labels < bg_class_ind)), as_tuple=False).squeeze(1) score = np.zeros(labels.shape) remain_inds = (vlr_region > 0).nonzero() if len(pos_inds) > 0: pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0) pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0) pos_grid_cells = paddle.gather(grid_cells, pos_inds, axis=0) pos_grid_cell_centers = self._grid_cells_to_center( pos_grid_cells) / stride weight_targets = F.sigmoid(cls_score.detach()) weight_targets = paddle.gather( weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) pos_decode_bbox_pred = distance2bbox(pos_grid_cell_centers, pos_bbox_pred_corners) pos_decode_bbox_targets = pos_bbox_targets / stride bbox_iou = bbox_overlaps( pos_decode_bbox_pred.detach().numpy(), pos_decode_bbox_targets.detach().numpy(), is_aligned=True) score[pos_inds.numpy()] = bbox_iou pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1]) pos_soft_targets = paddle.gather(soft_targets, pos_inds, axis=0) soft_corners = pos_soft_targets.reshape([-1, self.reg_max + 1]) target_corners = bbox2distance(pos_grid_cell_centers, pos_decode_bbox_targets, self.reg_max).reshape([-1]) # regression loss loss_bbox = paddle.sum( self.loss_bbox(pos_decode_bbox_pred, pos_decode_bbox_targets) * weight_targets) # dfl loss loss_dfl = self.loss_dfl( pred_corners, target_corners, weight=weight_targets.expand([-1, 4]).reshape([-1]), avg_factor=4.0) # ld loss loss_ld = self.loss_ld( pred_corners, soft_corners, weight=weight_targets.expand([-1, 4]).reshape([-1]), avg_factor=4.0) loss_kd = self.loss_kd( paddle.gather( cls_score, pos_inds, axis=0), paddle.gather( soft_label, pos_inds, axis=0), weight=paddle.gather( label_weights, pos_inds, axis=0), avg_factor=pos_inds.shape[0]) else: loss_bbox = bbox_pred.sum() * 0 loss_dfl = bbox_pred.sum() * 0 loss_ld = bbox_pred.sum() * 0 loss_kd = bbox_pred.sum() * 0 weight_targets = paddle.to_tensor([0], dtype='float32') if len(remain_inds) > 0: neg_pred_corners = bbox_pred[remain_inds].reshape( [-1, self.reg_max + 1]) neg_soft_corners = soft_targets[remain_inds].reshape( [-1, self.reg_max + 1]) remain_targets = vlr_region[remain_inds] loss_ld_vlr = self.loss_ld_vlr( neg_pred_corners, neg_soft_corners, weight=remain_targets.expand([-1, 
4]).reshape([-1]), avg_factor=16.0) else: loss_ld_vlr = bbox_pred.sum() * 0 # qfl loss score = paddle.to_tensor(score) loss_qfl = self.loss_qfl( cls_score, (labels, score), weight=label_weights, avg_factor=num_total_pos) loss_bbox_list.append(loss_bbox) loss_dfl_list.append(loss_dfl) loss_qfl_list.append(loss_qfl) loss_ld_list.append(loss_ld) loss_ld_vlr_list.append(loss_ld_vlr) loss_kd_list.append(loss_kd) avg_factor.append(weight_targets.sum()) avg_factor = sum(avg_factor) # + 1e-6 try: paddle.distributed.all_reduce(avg_factor) avg_factor = paddle.clip( avg_factor / paddle.distributed.get_world_size(), min=1) except: avg_factor = max(avg_factor.item(), 1) if avg_factor <= 0: loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) loss_bbox = paddle.to_tensor( 0, dtype='float32', stop_gradient=False) loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) loss_ld = paddle.to_tensor(0, dtype='float32', stop_gradient=False) loss_ld_vlr = paddle.to_tensor( 0, dtype='float32', stop_gradient=False) loss_kd = paddle.to_tensor(0, dtype='float32', stop_gradient=False) else: losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list)) losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list)) loss_qfl = sum(loss_qfl_list) loss_bbox = sum(losses_bbox) loss_dfl = sum(losses_dfl) loss_ld = sum(loss_ld_list) loss_ld_vlr = sum(loss_ld_vlr_list) loss_kd = sum(loss_kd_list) loss_states = dict( loss_qfl=loss_qfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl, loss_ld=loss_ld, loss_ld_vlr=loss_ld_vlr, loss_kd=loss_kd) return loss_states ================================================ FILE: ppdet/modeling/heads/keypoint_hrhrnet_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn from ppdet.core.workspace import register from .. 
import layers as L from ..backbones.hrnet import BasicBlock @register class HrHRNetHead(nn.Layer): __inject__ = ['loss'] def __init__(self, num_joints, loss='HrHRNetLoss', swahr=False, width=32): """ Head for HigherHRNet network Args: num_joints (int): number of keypoints loss (object): HrHRNetLoss instance swahr (bool): whether to use SWAHR (scale- and weight-adaptive heatmap regression) width (int): hrnet channel width """ super(HrHRNetHead, self).__init__() self.loss = loss self.num_joints = num_joints num_featout1 = num_joints * 2 num_featout2 = num_joints self.swahr = swahr self.conv1 = L.Conv2d(width, num_featout1, 1, 1, 0, bias=True) self.conv2 = L.Conv2d(width, num_featout2, 1, 1, 0, bias=True) self.deconv = nn.Sequential( L.ConvTranspose2d( num_featout1 + width, width, 4, 2, 1, 0, bias=False), L.BatchNorm2d(width), L.ReLU()) self.blocks = nn.Sequential(*(BasicBlock( num_channels=width, num_filters=width, has_se=False, freeze_norm=False, name='HrHRNetHead_{}'.format(i)) for i in range(4))) self.interpolate = L.Upsample(2, mode='bilinear') self.concat = L.Concat(dim=1) if swahr: self.scalelayer0 = nn.Sequential( L.Conv2d( width, num_joints, 1, 1, 0, bias=True), L.BatchNorm2d(num_joints), L.ReLU(), L.Conv2d( num_joints, num_joints, 9, 1, 4, groups=num_joints, bias=True)) self.scalelayer1 = nn.Sequential( L.Conv2d( width, num_joints, 1, 1, 0, bias=True), L.BatchNorm2d(num_joints), L.ReLU(), L.Conv2d( num_joints, num_joints, 9, 1, 4, groups=num_joints, bias=True)) def forward(self, feats, targets=None): x1 = feats[0] xo1 = self.conv1(x1) x2 = self.blocks(self.deconv(self.concat((x1, xo1)))) xo2 = self.conv2(x2) num_joints = self.num_joints if self.training: heatmap1, tagmap = paddle.split(xo1, 2, axis=1) if self.swahr: so1 = self.scalelayer0(x1) so2 = self.scalelayer1(x2) hrhrnet_outputs = ([heatmap1, so1], [xo2, so2], tagmap) return self.loss(hrhrnet_outputs, targets) else: hrhrnet_outputs = (heatmap1, xo2, tagmap) return self.loss(hrhrnet_outputs, targets) # averaged heatmap, upsampled tagmap upsampled = self.interpolate(xo1) avg = (upsampled[:, :num_joints] + xo2[:, :num_joints]) / 2 return avg, upsampled[:, num_joints:] ================================================ FILE: ppdet/modeling/heads/mask_head.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import KaimingNormal from ppdet.core.workspace import register, create from ppdet.modeling.layers import ConvNormLayer from .roi_extractor import RoIAlign from ..cls_utils import _get_class_default_kwargs @register class MaskFeat(nn.Layer): """ Feature extraction in Mask head Args: in_channel (int): Input channels out_channel (int): Output channels num_convs (int): The number of conv layers, default 4 norm_type (string | None): Norm type, bn, gn, sync_bn are available, default None """ def __init__(self, in_channel=256, out_channel=256, num_convs=4, norm_type=None): super(MaskFeat, self).__init__() self.num_convs = num_convs self.in_channel = in_channel self.out_channel = out_channel self.norm_type = norm_type fan_conv = out_channel * 3 * 3 fan_deconv = out_channel * 2 * 2 mask_conv = nn.Sequential() if norm_type == 'gn': for i in range(self.num_convs): conv_name = 'mask_inter_feat_{}'.format(i + 1) mask_conv.add_sublayer( conv_name, ConvNormLayer( ch_in=in_channel if i == 0 else out_channel, ch_out=out_channel, filter_size=3, stride=1, norm_type=self.norm_type, initializer=KaimingNormal(fan_in=fan_conv), skip_quant=True)) mask_conv.add_sublayer(conv_name + 'act', nn.ReLU()) else: for i in range(self.num_convs): conv_name = 'mask_inter_feat_{}'.format(i + 1) conv = nn.Conv2D( in_channels=in_channel if i == 0 else out_channel, out_channels=out_channel, kernel_size=3, padding=1, weight_attr=paddle.ParamAttr( initializer=KaimingNormal(fan_in=fan_conv))) conv.skip_quant = True mask_conv.add_sublayer(conv_name, conv) mask_conv.add_sublayer(conv_name + 'act', nn.ReLU()) mask_conv.add_sublayer( 'conv5_mask', nn.Conv2DTranspose( in_channels=self.out_channel if num_convs > 0 else self.in_channel, out_channels=self.out_channel, kernel_size=2, stride=2, weight_attr=paddle.ParamAttr( initializer=KaimingNormal(fan_in=fan_deconv)))) mask_conv.add_sublayer('conv5_mask' + 'act', nn.ReLU()) self.upsample = mask_conv @classmethod def from_config(cls, cfg, input_shape): if isinstance(input_shape, (list, tuple)): input_shape = input_shape[0] return {'in_channel': input_shape.channels, } def out_channels(self): return self.out_channel def forward(self, feats): return self.upsample(feats) @register class MaskHead(nn.Layer): __shared__ = ['num_classes', 'export_onnx'] __inject__ = ['mask_assigner'] """ RCNN mask head Args: head (nn.Layer): Extract feature in mask head roi_extractor (object): The module of RoI Extractor mask_assigner (object): The module of Mask Assigner, label and sample the mask num_classes (int): The number of classes share_bbox_feat (bool): Whether to share the feature from bbox head, default false """ def __init__(self, head, roi_extractor=_get_class_default_kwargs(RoIAlign), mask_assigner='MaskAssigner', num_classes=80, share_bbox_feat=False, export_onnx=False): super(MaskHead, self).__init__() self.num_classes = num_classes self.export_onnx = export_onnx self.roi_extractor = roi_extractor if isinstance(roi_extractor, dict): self.roi_extractor = RoIAlign(**roi_extractor) self.head = head self.in_channels = head.out_channels() self.mask_assigner = mask_assigner self.share_bbox_feat = share_bbox_feat self.bbox_head = None self.mask_fcn_logits = nn.Conv2D( in_channels=self.in_channels, out_channels=self.num_classes, kernel_size=1, weight_attr=paddle.ParamAttr(initializer=KaimingNormal( fan_in=self.num_classes))) self.mask_fcn_logits.skip_quant = True @classmethod def from_config(cls, cfg, 
input_shape): roi_pooler = cfg['roi_extractor'] assert isinstance(roi_pooler, dict) kwargs = RoIAlign.from_config(cfg, input_shape) roi_pooler.update(kwargs) kwargs = {'input_shape': input_shape} head = create(cfg['head'], **kwargs) return { 'roi_extractor': roi_pooler, 'head': head, } def get_loss(self, mask_logits, mask_label, mask_target, mask_weight): mask_label = F.one_hot(mask_label, self.num_classes).unsqueeze([2, 3]) mask_label = paddle.expand_as(mask_label, mask_logits) mask_label.stop_gradient = True mask_pred = paddle.gather_nd(mask_logits, paddle.nonzero(mask_label)) shape = mask_logits.shape mask_pred = paddle.reshape(mask_pred, [shape[0], shape[2], shape[3]]) mask_target = mask_target.cast('float32') mask_weight = mask_weight.unsqueeze([1, 2]) loss_mask = F.binary_cross_entropy_with_logits( mask_pred, mask_target, weight=mask_weight, reduction="mean") return loss_mask def forward_train(self, body_feats, rois, rois_num, inputs, targets, bbox_feat): """ body_feats (list[Tensor]): Multi-level backbone features rois (list[Tensor]): Proposals for each batch with shape [N, 4] rois_num (Tensor): The number of proposals for each batch inputs (dict): ground truth info """ tgt_labels, _, tgt_gt_inds = targets rois, rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights = self.mask_assigner( rois, tgt_labels, tgt_gt_inds, inputs) if self.share_bbox_feat: rois_feat = paddle.gather(bbox_feat, mask_index) else: rois_feat = self.roi_extractor(body_feats, rois, rois_num) mask_feat = self.head(rois_feat) mask_logits = self.mask_fcn_logits(mask_feat) loss_mask = self.get_loss(mask_logits, tgt_classes, tgt_masks, tgt_weights) return {'loss_mask': loss_mask} def forward_test(self, body_feats, rois, rois_num, scale_factor, feat_func=None): """ body_feats (list[Tensor]): Multi-level backbone features rois (Tensor): Prediction from bbox head with shape [N, 6] rois_num (Tensor): The number of prediction for each batch scale_factor (Tensor): The scale factor from origin size to input size """ if not self.export_onnx and rois.shape[0] == 0: mask_out = paddle.full([1, 1, 1], -1) else: bbox = [rois[:, 2:]] labels = rois[:, 0].cast('int32') rois_feat = self.roi_extractor(body_feats, bbox, rois_num) if self.share_bbox_feat: assert feat_func is not None rois_feat = feat_func(rois_feat) mask_feat = self.head(rois_feat) mask_logit = self.mask_fcn_logits(mask_feat) if self.num_classes == 1: mask_out = F.sigmoid(mask_logit)[:, 0, :, :] else: num_masks = mask_logit.shape[0] index = paddle.arange(num_masks).cast('int32') mask_out = mask_logit[index, labels] mask_out_shape = mask_out.shape mask_out = paddle.reshape(mask_out, index.shape + [mask_out_shape[-2]] + [mask_out_shape[-1]]) mask_out = F.sigmoid(mask_out) return mask_out def forward(self, body_feats, rois, rois_num, inputs, targets=None, bbox_feat=None, feat_func=None): if self.training: return self.forward_train(body_feats, rois, rois_num, inputs, targets, bbox_feat) else: im_scale = inputs['scale_factor'] return self.forward_test(body_feats, rois, rois_num, im_scale, feat_func) ================================================ FILE: ppdet/modeling/heads/petr_head.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ this code is base on https://github.com/hikvision-research/opera/blob/main/opera/models/dense_heads/petr_head.py """ import copy import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register import paddle.distributed as dist from ..transformers.petr_transformer import inverse_sigmoid, masked_fill from ..initializer import constant_, normal_ __all__ = ["PETRHead"] from functools import partial def bias_init_with_prob(prior_prob: float) -> float: """initialize conv/fc bias value according to a given probability value.""" bias_init = float(-np.log((1 - prior_prob) / prior_prob)) return bias_init def multi_apply(func, *args, **kwargs): """Apply function to a list of arguments. Note: This function applies the ``func`` to multiple inputs and map the multiple outputs of the ``func`` into different list. Each list contains the same type of outputs corresponding to different inputs. Args: func (Function): A function that will be applied to a list of arguments Returns: tuple(list): A tuple containing multiple list, each list contains \ a kind of returned results by the function """ pfunc = partial(func, **kwargs) if kwargs else func map_results = map(pfunc, *args) res = tuple(map(list, zip(*map_results))) return res def reduce_mean(tensor): """"Obtain the mean of tensor on different GPUs.""" if not (dist.get_world_size() and dist.is_initialized()): return tensor tensor = tensor.clone() dist.all_reduce( tensor.divide( paddle.to_tensor( dist.get_world_size(), dtype='float32')), op=dist.ReduceOp.SUM) return tensor def gaussian_radius(det_size, min_overlap=0.7): """calculate gaussian radius according to object size. """ height, width = det_size a1 = 1 b1 = (height + width) c1 = width * height * (1 - min_overlap) / (1 + min_overlap) sq1 = paddle.sqrt(b1**2 - 4 * a1 * c1) r1 = (b1 + sq1) / 2 a2 = 4 b2 = 2 * (height + width) c2 = (1 - min_overlap) * width * height sq2 = paddle.sqrt(b2**2 - 4 * a2 * c2) r2 = (b2 + sq2) / 2 a3 = 4 * min_overlap b3 = -2 * min_overlap * (height + width) c3 = (min_overlap - 1) * width * height sq3 = paddle.sqrt(b3**2 - 4 * a3 * c3) r3 = (b3 + sq3) / 2 return min(r1, r2, r3) def gaussian2D(shape, sigma=1): m, n = [(ss - 1.) / 2. 
for ss in shape] y = paddle.arange(-m, m + 1, dtype="float32")[:, None] x = paddle.arange(-n, n + 1, dtype="float32")[None, :] # y, x = np.ogrid[-m:m + 1, -n:n + 1] h = paddle.exp(-(x * x + y * y) / (2 * sigma * sigma)) h[h < np.finfo(np.float32).eps * h.max()] = 0 return h def draw_umich_gaussian(heatmap, center, radius, k=1): diameter = 2 * radius + 1 gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6) gaussian = paddle.to_tensor(gaussian, dtype=heatmap.dtype) x, y = int(center[0]), int(center[1]) radius = int(radius) height, width = heatmap.shape[0:2] left, right = min(x, radius), min(width - x, radius + 1) top, bottom = min(y, radius), min(height - y, radius + 1) masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: radius + right] # assert masked_gaussian.equal(1).float().sum() == 1 if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: heatmap[y - top:y + bottom, x - left:x + right] = paddle.maximum( masked_heatmap, masked_gaussian * k) return heatmap @register class PETRHead(nn.Layer): """Head of `End-to-End Multi-Person Pose Estimation with Transformers`. Args: num_classes (int): Number of categories excluding the background. in_channels (int): Number of channels in the input feature map. num_query (int): Number of query in Transformer. num_kpt_fcs (int, optional): Number of fully-connected layers used in `FFN`, which is then used for the keypoint regression head. Default 2. transformer (obj:`mmcv.ConfigDict`|dict): ConfigDict is used for building the Encoder and Decoder. Default: None. sync_cls_avg_factor (bool): Whether to sync the avg_factor of all ranks. Default to False. positional_encoding (obj:`mmcv.ConfigDict`|dict): Config for position encoding. loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the classification loss. Default `CrossEntropyLoss`. loss_kpt (obj:`mmcv.ConfigDict`|dict): Config of the regression loss. Default `L1Loss`. loss_oks (obj:`mmcv.ConfigDict`|dict): Config of the regression oks loss. Default `OKSLoss`. loss_hm (obj:`mmcv.ConfigDict`|dict): Config of the regression heatmap loss. Default `NegLoss`. as_two_stage (bool) : Whether to generate the proposal from the outputs of encoder. with_kpt_refine (bool): Whether to refine the reference points in the decoder. Defaults to True. test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of transformer head. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None. """ __inject__ = [ "transformer", "positional_encoding", "assigner", "sampler", "loss_cls", "loss_kpt", "loss_oks", "loss_hm", "loss_kpt_rpn", "loss_kpt_refine", "loss_oks_refine" ] def __init__(self, num_classes, in_channels, num_query=100, num_kpt_fcs=2, num_keypoints=17, transformer=None, sync_cls_avg_factor=True, positional_encoding='SinePositionalEncoding', loss_cls='FocalLoss', loss_kpt='L1Loss', loss_oks='OKSLoss', loss_hm='CenterFocalLoss', with_kpt_refine=True, assigner='PoseHungarianAssigner', sampler='PseudoSampler', loss_kpt_rpn='L1Loss', loss_kpt_refine='L1Loss', loss_oks_refine='opera.OKSLoss', test_cfg=dict(max_per_img=100), init_cfg=None, **kwargs): # NOTE here use `AnchorFreeHead` instead of `TransformerHead`, # since it brings inconvenience when the initialization of # `AnchorFreeHead` is called. 
super().__init__() self.bg_cls_weight = 0 self.sync_cls_avg_factor = sync_cls_avg_factor self.assigner = assigner self.sampler = sampler self.num_query = num_query self.num_classes = num_classes self.in_channels = in_channels self.num_kpt_fcs = num_kpt_fcs self.test_cfg = test_cfg self.fp16_enabled = False self.as_two_stage = transformer.as_two_stage self.with_kpt_refine = with_kpt_refine self.num_keypoints = num_keypoints self.loss_cls = loss_cls self.loss_kpt = loss_kpt self.loss_kpt_rpn = loss_kpt_rpn self.loss_kpt_refine = loss_kpt_refine self.loss_oks = loss_oks self.loss_oks_refine = loss_oks_refine self.loss_hm = loss_hm if self.loss_cls.use_sigmoid: self.cls_out_channels = num_classes else: self.cls_out_channels = num_classes + 1 self.positional_encoding = positional_encoding self.transformer = transformer self.embed_dims = self.transformer.embed_dims # assert 'num_feats' in positional_encoding num_feats = positional_encoding.num_pos_feats assert num_feats * 2 == self.embed_dims, 'embed_dims should' \ f' be exactly 2 times of num_feats. Found {self.embed_dims}' \ f' and {num_feats}.' self._init_layers() self.init_weights() def _init_layers(self): """Initialize classification branch and keypoint branch of head.""" fc_cls = nn.Linear(self.embed_dims, self.cls_out_channels) kpt_branch = [] kpt_branch.append(nn.Linear(self.embed_dims, 512)) kpt_branch.append(nn.ReLU()) for _ in range(self.num_kpt_fcs): kpt_branch.append(nn.Linear(512, 512)) kpt_branch.append(nn.ReLU()) kpt_branch.append(nn.Linear(512, 2 * self.num_keypoints)) kpt_branch = nn.Sequential(*kpt_branch) def _get_clones(module, N): return nn.LayerList([copy.deepcopy(module) for i in range(N)]) # last kpt_branch is used to generate proposal from # encode feature map when as_two_stage is True. num_pred = (self.transformer.decoder.num_layers + 1) if \ self.as_two_stage else self.transformer.decoder.num_layers if self.with_kpt_refine: self.cls_branches = _get_clones(fc_cls, num_pred) self.kpt_branches = _get_clones(kpt_branch, num_pred) else: self.cls_branches = nn.LayerList([fc_cls for _ in range(num_pred)]) self.kpt_branches = nn.LayerList( [kpt_branch for _ in range(num_pred)]) self.query_embedding = nn.Embedding(self.num_query, self.embed_dims * 2) refine_kpt_branch = [] for _ in range(self.num_kpt_fcs): refine_kpt_branch.append( nn.Linear(self.embed_dims, self.embed_dims)) refine_kpt_branch.append(nn.ReLU()) refine_kpt_branch.append(nn.Linear(self.embed_dims, 2)) refine_kpt_branch = nn.Sequential(*refine_kpt_branch) if self.with_kpt_refine: num_pred = self.transformer.refine_decoder.num_layers self.refine_kpt_branches = _get_clones(refine_kpt_branch, num_pred) self.fc_hm = nn.Linear(self.embed_dims, self.num_keypoints) def init_weights(self): """Initialize weights of the PETR head.""" self.transformer.init_weights() if self.loss_cls.use_sigmoid: bias_init = bias_init_with_prob(0.01) for m in self.cls_branches: constant_(m.bias, bias_init) for m in self.kpt_branches: constant_(m[-1].bias, 0) # initialization of keypoint refinement branch if self.with_kpt_refine: for m in self.refine_kpt_branches: constant_(m[-1].bias, 0) # initialize bias for heatmap prediction bias_init = bias_init_with_prob(0.1) normal_(self.fc_hm.weight, std=0.01) constant_(self.fc_hm.bias, bias_init) def forward(self, mlvl_feats, img_metas): """Forward function. Args: mlvl_feats (tuple[Tensor]): Features from the upstream network, each is a 4D-tensor with shape (N, C, H, W). img_metas (list[dict]): List of image information. 
Returns: outputs_classes (Tensor): Outputs from the classification head, shape [nb_dec, bs, num_query, cls_out_channels]. Note cls_out_channels should include background. outputs_kpts (Tensor): Sigmoid outputs from the regression head with normalized coordinate format (x_{i}, y_{i}). Shape [nb_dec, bs, num_query, K*2]. enc_outputs_class (Tensor): The score of each point on encode feature map, has shape (N, h*w, num_classes). Only returned when as_two_stage is True, otherwise `None` is returned. enc_outputs_kpt (Tensor): The proposals generated from the encode feature map, has shape (N, h*w, K*2). Only returned when as_two_stage is True, otherwise `None` is returned. """ batch_size = mlvl_feats[0].shape[0] input_img_h, input_img_w = img_metas[0]['batch_input_shape'] img_masks = paddle.zeros( (batch_size, input_img_h, input_img_w), dtype=mlvl_feats[0].dtype) for img_id in range(batch_size): img_h, img_w, _ = img_metas[img_id]['img_shape'] img_masks[img_id, :img_h, :img_w] = 1 mlvl_masks = [] mlvl_positional_encodings = [] for feat in mlvl_feats: mlvl_masks.append( F.interpolate( img_masks[None], size=feat.shape[-2:]).squeeze(0)) mlvl_positional_encodings.append( self.positional_encoding(mlvl_masks[-1]).transpose( [0, 3, 1, 2])) query_embeds = self.query_embedding.weight hs, init_reference, inter_references, \ enc_outputs_class, enc_outputs_kpt, hm_proto, memory = \ self.transformer( mlvl_feats, mlvl_masks, query_embeds, mlvl_positional_encodings, kpt_branches=self.kpt_branches \ if self.with_kpt_refine else None, # noqa:E501 cls_branches=self.cls_branches \ if self.as_two_stage else None # noqa:E501 ) outputs_classes = [] outputs_kpts = [] for lvl in range(hs.shape[0]): if lvl == 0: reference = init_reference else: reference = inter_references[lvl - 1] reference = inverse_sigmoid(reference) outputs_class = self.cls_branches[lvl](hs[lvl]) tmp_kpt = self.kpt_branches[lvl](hs[lvl]) assert reference.shape[-1] == self.num_keypoints * 2 tmp_kpt += reference outputs_kpt = F.sigmoid(tmp_kpt) outputs_classes.append(outputs_class) outputs_kpts.append(outputs_kpt) outputs_classes = paddle.stack(outputs_classes) outputs_kpts = paddle.stack(outputs_kpts) if hm_proto is not None: # get heatmap prediction (training phase) hm_memory, hm_mask = hm_proto hm_pred = self.fc_hm(hm_memory) hm_proto = (hm_pred.transpose((0, 3, 1, 2)), hm_mask) if self.as_two_stage: return outputs_classes, outputs_kpts, \ enc_outputs_class, F.sigmoid(enc_outputs_kpt), \ hm_proto, memory, mlvl_masks else: raise RuntimeError('only "as_two_stage=True" is supported.') def forward_refine(self, memory, mlvl_masks, refine_targets, losses, img_metas): """Forward function. Args: mlvl_masks (tuple[Tensor]): The key_padding_mask from different levels used for encoder and decoder, each is a 3D-tensor with shape (bs, H, W). losses (dict[str, Tensor]): A dictionary of loss components. img_metas (list[dict]): List of image information. Returns: dict[str, Tensor]: A dictionary of loss components.
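Example:
    A toy illustration of how the flat positive query indices computed
    below map to their image index (num_query assumed to be 300)::

        import paddle
        pos_inds = paddle.to_tensor([5, 310])   # flat indices over bs * num_query
        pos_img_inds = pos_inds // 300          # -> [0, 1]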
""" kpt_preds, kpt_targets, area_targets, kpt_weights = refine_targets pos_inds = kpt_weights.sum(-1) > 0 if not pos_inds.any(): pos_kpt_preds = paddle.zeros_like(kpt_preds[:1]) pos_img_inds = paddle.zeros([1], dtype="int64") else: pos_kpt_preds = kpt_preds[pos_inds] pos_img_inds = (pos_inds.nonzero() / self.num_query).squeeze(1).astype("int64") hs, init_reference, inter_references = self.transformer.forward_refine( mlvl_masks, memory, pos_kpt_preds.detach(), pos_img_inds, kpt_branches=self.refine_kpt_branches if self.with_kpt_refine else None, # noqa:E501 ) outputs_kpts = [] for lvl in range(hs.shape[0]): if lvl == 0: reference = init_reference else: reference = inter_references[lvl - 1] reference = inverse_sigmoid(reference) tmp_kpt = self.refine_kpt_branches[lvl](hs[lvl]) assert reference.shape[-1] == 2 tmp_kpt += reference outputs_kpt = F.sigmoid(tmp_kpt) outputs_kpts.append(outputs_kpt) outputs_kpts = paddle.stack(outputs_kpts) if not self.training: return outputs_kpts num_valid_kpt = paddle.clip( reduce_mean(kpt_weights.sum()), min=1).item() num_total_pos = paddle.to_tensor( [outputs_kpts.shape[1]], dtype=kpt_weights.dtype) num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item() if not pos_inds.any(): for i, kpt_refine_preds in enumerate(outputs_kpts): loss_kpt = loss_oks = kpt_refine_preds.sum() * 0 losses[f'd{i}.loss_kpt_refine'] = loss_kpt losses[f'd{i}.loss_oks_refine'] = loss_oks continue return losses batch_size = mlvl_masks[0].shape[0] factors = [] for img_id in range(batch_size): img_h, img_w, _ = img_metas[img_id]['img_shape'] factor = paddle.to_tensor( [img_w, img_h, img_w, img_h], dtype="float32").squeeze(-1).unsqueeze(0).tile( (self.num_query, 1)) factors.append(factor) factors = paddle.concat(factors, 0) factors = factors[pos_inds][:, :2].tile((1, kpt_preds.shape[-1] // 2)) pos_kpt_weights = kpt_weights[pos_inds] pos_kpt_targets = kpt_targets[pos_inds] pos_kpt_targets_scaled = pos_kpt_targets * factors pos_areas = area_targets[pos_inds] pos_valid = kpt_weights[pos_inds][:, 0::2] for i, kpt_refine_preds in enumerate(outputs_kpts): if not pos_inds.any(): print("refine kpt and oks skip") loss_kpt = loss_oks = kpt_refine_preds.sum() * 0 losses[f'd{i}.loss_kpt_refine'] = loss_kpt losses[f'd{i}.loss_oks_refine'] = loss_oks continue # kpt L1 Loss pos_refine_preds = kpt_refine_preds.reshape( (kpt_refine_preds.shape[0], -1)) loss_kpt = self.loss_kpt_refine( pos_refine_preds, pos_kpt_targets, pos_kpt_weights, avg_factor=num_valid_kpt) losses[f'd{i}.loss_kpt_refine'] = loss_kpt # kpt oks loss pos_refine_preds_scaled = pos_refine_preds * factors assert (pos_areas > 0).all() loss_oks = self.loss_oks_refine( pos_refine_preds_scaled, pos_kpt_targets_scaled, pos_valid, pos_areas, avg_factor=num_total_pos) losses[f'd{i}.loss_oks_refine'] = loss_oks return losses # over-write because img_metas are needed as inputs for bbox_head. def forward_train(self, x, img_metas, gt_bboxes, gt_labels=None, gt_keypoints=None, gt_areas=None, gt_bboxes_ignore=None, proposal_cfg=None, **kwargs): """Forward function for training mode. Args: x (list[Tensor]): Features from backbone. img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. gt_bboxes (list[Tensor]): Ground truth bboxes of the image, shape (num_gts, 4). gt_labels (list[Tensor]): Ground truth labels of each box, shape (num_gts,). gt_keypoints (list[Tensor]): Ground truth keypoints of the image, shape (num_gts, K*3). gt_areas (list[Tensor]): Ground truth mask areas of each box, shape (num_gts,). 
gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be ignored, shape (num_ignored_gts, 4). proposal_cfg (mmcv.Config): Test / postprocessing configuration, if None, test_cfg would be used. Returns: dict[str, Tensor]: A dictionary of loss components. """ assert proposal_cfg is None, '"proposal_cfg" must be None' outs = self(x, img_metas) memory, mlvl_masks = outs[-2:] outs = outs[:-2] if gt_labels is None: loss_inputs = outs + (gt_bboxes, gt_keypoints, gt_areas, img_metas) else: loss_inputs = outs + (gt_bboxes, gt_labels, gt_keypoints, gt_areas, img_metas) losses_and_targets = self.loss( *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) # losses = losses_and_targets losses, refine_targets = losses_and_targets # get pose refinement loss losses = self.forward_refine(memory, mlvl_masks, refine_targets, losses, img_metas) return losses def loss(self, all_cls_scores, all_kpt_preds, enc_cls_scores, enc_kpt_preds, enc_hm_proto, gt_bboxes_list, gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas, gt_bboxes_ignore=None): """Loss function. Args: all_cls_scores (Tensor): Classification score of all decoder layers, has shape [nb_dec, bs, num_query, cls_out_channels]. all_kpt_preds (Tensor): Sigmoid regression outputs of all decode layers. Each is a 4D-tensor with normalized coordinate format (x_{i}, y_{i}) and shape [nb_dec, bs, num_query, K*2]. enc_cls_scores (Tensor): Classification scores of points on encode feature map, has shape (N, h*w, num_classes). Only be passed when as_two_stage is True, otherwise is None. enc_kpt_preds (Tensor): Regression results of each points on the encode feature map, has shape (N, h*w, K*2). Only be passed when as_two_stage is True, otherwise is None. gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels_list (list[Tensor]): Ground truth class indices for each image with shape (num_gts, ). gt_keypoints_list (list[Tensor]): Ground truth keypoints for each image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, p^{K}_y, p^{K}_v] format. gt_areas_list (list[Tensor]): Ground truth mask areas for each image with shape (num_gts, ). img_metas (list[dict]): List of image meta information. gt_bboxes_ignore (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: dict[str, Tensor]: A dictionary of loss components. """ assert gt_bboxes_ignore is None, \ f'{self.__class__.__name__} only supports ' \ f'for gt_bboxes_ignore setting to None.' num_dec_layers = len(all_cls_scores) all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] all_gt_keypoints_list = [ gt_keypoints_list for _ in range(num_dec_layers) ] all_gt_areas_list = [gt_areas_list for _ in range(num_dec_layers)] img_metas_list = [img_metas for _ in range(num_dec_layers)] losses_cls, losses_kpt, losses_oks, kpt_preds_list, kpt_targets_list, \ area_targets_list, kpt_weights_list = multi_apply( self.loss_single, all_cls_scores, all_kpt_preds, all_gt_labels_list, all_gt_keypoints_list, all_gt_areas_list, img_metas_list) loss_dict = dict() # loss of proposal generated from encode feature map. 
if enc_cls_scores is not None: binary_labels_list = [ paddle.zeros_like(gt_labels_list[i]) for i in range(len(img_metas)) ] enc_loss_cls, enc_losses_kpt = \ self.loss_single_rpn( enc_cls_scores, enc_kpt_preds, binary_labels_list, gt_keypoints_list, gt_areas_list, img_metas) loss_dict['enc_loss_cls'] = enc_loss_cls loss_dict['enc_loss_kpt'] = enc_losses_kpt # loss from the last decoder layer loss_dict['loss_cls'] = losses_cls[-1] loss_dict['loss_kpt'] = losses_kpt[-1] loss_dict['loss_oks'] = losses_oks[-1] # loss from other decoder layers num_dec_layer = 0 for loss_cls_i, loss_kpt_i, loss_oks_i in zip( losses_cls[:-1], losses_kpt[:-1], losses_oks[:-1]): loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i loss_dict[f'd{num_dec_layer}.loss_kpt'] = loss_kpt_i loss_dict[f'd{num_dec_layer}.loss_oks'] = loss_oks_i num_dec_layer += 1 # losses of heatmap generated from P3 feature map hm_pred, hm_mask = enc_hm_proto loss_hm = self.loss_heatmap(hm_pred, hm_mask, gt_keypoints_list, gt_labels_list, gt_bboxes_list) loss_dict['loss_hm'] = loss_hm return loss_dict, (kpt_preds_list[-1], kpt_targets_list[-1], area_targets_list[-1], kpt_weights_list[-1]) def loss_heatmap(self, hm_pred, hm_mask, gt_keypoints, gt_labels, gt_bboxes): assert hm_pred.shape[-2:] == hm_mask.shape[-2:] num_img, _, h, w = hm_pred.shape # placeholder of heatmap target (Gaussian distribution) hm_target = paddle.zeros(hm_pred.shape, hm_pred.dtype) for i, (gt_label, gt_bbox, gt_keypoint ) in enumerate(zip(gt_labels, gt_bboxes, gt_keypoints)): if gt_label.shape[0] == 0: continue gt_keypoint = gt_keypoint.reshape((gt_keypoint.shape[0], -1, 3)).clone() gt_keypoint[..., :2] /= 8 assert gt_keypoint[..., 0].max() <= w + 0.5 # new coordinate system assert gt_keypoint[..., 1].max() <= h + 0.5 # new coordinate system gt_bbox /= 8 gt_w = gt_bbox[:, 2] - gt_bbox[:, 0] gt_h = gt_bbox[:, 3] - gt_bbox[:, 1] for j in range(gt_label.shape[0]): # get heatmap radius kp_radius = paddle.clip( paddle.floor( gaussian_radius( (gt_h[j], gt_w[j]), min_overlap=0.9)), min=0, max=3) for k in range(self.num_keypoints): if gt_keypoint[j, k, 2] > 0: gt_kp = gt_keypoint[j, k, :2] gt_kp_int = paddle.floor(gt_kp) hm_target[i, k] = draw_umich_gaussian( hm_target[i, k], gt_kp_int, kp_radius) # compute heatmap loss hm_pred = paddle.clip( F.sigmoid(hm_pred), min=1e-4, max=1 - 1e-4) # refer to CenterNet loss_hm = self.loss_hm( hm_pred, hm_target.detach(), mask=~hm_mask.astype("bool").unsqueeze(1)) return loss_hm def loss_single(self, cls_scores, kpt_preds, gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas): """Loss function for outputs from a single decoder layer of a single feature level. Args: cls_scores (Tensor): Box score logits from a single decoder layer for all images. Shape [bs, num_query, cls_out_channels]. kpt_preds (Tensor): Sigmoid outputs from a single decoder layer for all images, with normalized coordinate (x_{i}, y_{i}) and shape [bs, num_query, K*2]. gt_labels_list (list[Tensor]): Ground truth class indices for each image with shape (num_gts, ). gt_keypoints_list (list[Tensor]): Ground truth keypoints for each image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, p^{K}_y, p^{K}_v] format. gt_areas_list (list[Tensor]): Ground truth mask areas for each image with shape (num_gts, ). img_metas (list[dict]): List of image meta information. Returns: dict[str, Tensor]: A dictionary of loss components for outputs from a single decoder layer. 
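Example:
    A shape sketch of the rescale factors built below, which map normalized
    (x, y) predictions back to pixel coordinates (all sizes assumed)::

        import paddle
        K, img_w, img_h = 17, 800, 600
        kpt = paddle.rand([100, K * 2])      # normalized (x, y) pairs
        factor = paddle.to_tensor([img_w, img_h], dtype='float32').tile([K])
        kpt_pix = kpt * factor.unsqueeze(0)  # pixel coordinates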
""" num_imgs = cls_scores.shape[0] cls_scores_list = [cls_scores[i] for i in range(num_imgs)] kpt_preds_list = [kpt_preds[i] for i in range(num_imgs)] cls_reg_targets = self.get_targets(cls_scores_list, kpt_preds_list, gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas) (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list, area_targets_list, num_total_pos, num_total_neg) = cls_reg_targets labels = paddle.concat(labels_list, 0) label_weights = paddle.concat(label_weights_list, 0) kpt_targets = paddle.concat(kpt_targets_list, 0) kpt_weights = paddle.concat(kpt_weights_list, 0) area_targets = paddle.concat(area_targets_list, 0) # classification loss cls_scores = cls_scores.reshape((-1, self.cls_out_channels)) # construct weighted avg_factor to match with the official DETR repo cls_avg_factor = num_total_pos * 1.0 + \ num_total_neg * self.bg_cls_weight if self.sync_cls_avg_factor: cls_avg_factor = reduce_mean( paddle.to_tensor( [cls_avg_factor], dtype=cls_scores.dtype)) cls_avg_factor = max(cls_avg_factor, 1) loss_cls = self.loss_cls( cls_scores, labels, label_weights, avg_factor=cls_avg_factor) # Compute the average number of gt keypoints accross all gpus, for # normalization purposes num_total_pos = paddle.to_tensor([num_total_pos], dtype=loss_cls.dtype) num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item() # construct factors used for rescale keypoints factors = [] for img_meta, kpt_pred in zip(img_metas, kpt_preds): img_h, img_w, _ = img_meta['img_shape'] factor = paddle.to_tensor( [img_w, img_h, img_w, img_h], dtype=kpt_pred.dtype).squeeze().unsqueeze(0).tile( (kpt_pred.shape[0], 1)) factors.append(factor) factors = paddle.concat(factors, 0) # keypoint regression loss kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1])) num_valid_kpt = paddle.clip( reduce_mean(kpt_weights.sum()), min=1).item() # assert num_valid_kpt == (kpt_targets>0).sum().item() loss_kpt = self.loss_kpt( kpt_preds, kpt_targets.detach(), kpt_weights.detach(), avg_factor=num_valid_kpt) # keypoint oks loss pos_inds = kpt_weights.sum(-1) > 0 if not pos_inds.any(): loss_oks = kpt_preds.sum() * 0 else: factors = factors[pos_inds][:, :2].tile(( (1, kpt_preds.shape[-1] // 2))) pos_kpt_preds = kpt_preds[pos_inds] * factors pos_kpt_targets = kpt_targets[pos_inds] * factors pos_areas = area_targets[pos_inds] pos_valid = kpt_weights[pos_inds][..., 0::2] assert (pos_areas > 0).all() loss_oks = self.loss_oks( pos_kpt_preds, pos_kpt_targets, pos_valid, pos_areas, avg_factor=num_total_pos) return loss_cls, loss_kpt, loss_oks, kpt_preds, kpt_targets, \ area_targets, kpt_weights def get_targets(self, cls_scores_list, kpt_preds_list, gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas): """Compute regression and classification targets for a batch image. Outputs from a single decoder layer of a single feature level are used. Args: cls_scores_list (list[Tensor]): Box score logits from a single decoder layer for each image with shape [num_query, cls_out_channels]. kpt_preds_list (list[Tensor]): Sigmoid outputs from a single decoder layer for each image, with normalized coordinate (x_{i}, y_{i}) and shape [num_query, K*2]. gt_labels_list (list[Tensor]): Ground truth class indices for each image with shape (num_gts, ). gt_keypoints_list (list[Tensor]): Ground truth keypoints for each image with shape (num_gts, K*3). gt_areas_list (list[Tensor]): Ground truth mask areas for each image with shape (num_gts, ). img_metas (list[dict]): List of image meta information. 
Returns: tuple: a tuple containing the following targets. - labels_list (list[Tensor]): Labels for all images. - label_weights_list (list[Tensor]): Label weights for all images. - kpt_targets_list (list[Tensor]): Keypoint targets for all images. - kpt_weights_list (list[Tensor]): Keypoint weights for all images. - area_targets_list (list[Tensor]): area targets for all images. - num_total_pos (int): Number of positive samples in all images. - num_total_neg (int): Number of negative samples in all images. """ (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list, area_targets_list, pos_inds_list, neg_inds_list) = multi_apply( self._get_target_single, cls_scores_list, kpt_preds_list, gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas) num_total_pos = sum((inds.numel() for inds in pos_inds_list)) num_total_neg = sum((inds.numel() for inds in neg_inds_list)) return (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list, area_targets_list, num_total_pos, num_total_neg) def _get_target_single(self, cls_score, kpt_pred, gt_labels, gt_keypoints, gt_areas, img_meta): """Compute regression and classification targets for one image. Outputs from a single decoder layer of a single feature level are used. Args: cls_score (Tensor): Box score logits from a single decoder layer for one image. Shape [num_query, cls_out_channels]. kpt_pred (Tensor): Sigmoid outputs from a single decoder layer for one image, with normalized coordinate (x_{i}, y_{i}) and shape [num_query, K*2]. gt_labels (Tensor): Ground truth class indices for one image with shape (num_gts, ). gt_keypoints (Tensor): Ground truth keypoints for one image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, ..., \ p^{K}_x, p^{K}_y, p^{K}_v] format. gt_areas (Tensor): Ground truth mask areas for one image with shape (num_gts, ). img_meta (dict): Meta information for one image. Returns: tuple[Tensor]: a tuple containing the following for one image. - labels (Tensor): Labels of each image. - label_weights (Tensor): Label weights of each image. - kpt_targets (Tensor): Keypoint targets of each image. - kpt_weights (Tensor): Keypoint weights of each image. - area_targets (Tensor): Area targets of each image. - pos_inds (Tensor): Sampled positive indices for each image. - neg_inds (Tensor): Sampled negative indices for each image. 
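Example:
    A toy sketch of the label targets assembled below: every query starts
    as background (= num_classes) and only matched queries are overwritten
    (indices assumed)::

        import paddle
        num_query, num_classes = 4, 1
        labels = paddle.full([num_query], num_classes, dtype='int64')
        labels[paddle.to_tensor([2])] = 0   # matched query gets its gt class
        # labels -> [1, 1, 0, 1]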
""" num_bboxes = kpt_pred.shape[0] # assigner and sampler assign_result = self.assigner.assign(cls_score, kpt_pred, gt_labels, gt_keypoints, gt_areas, img_meta) sampling_result = self.sampler.sample(assign_result, kpt_pred, gt_keypoints) pos_inds = sampling_result.pos_inds neg_inds = sampling_result.neg_inds # label targets labels = paddle.full((num_bboxes, ), self.num_classes, dtype="int64") label_weights = paddle.ones((num_bboxes, ), dtype=gt_labels.dtype) kpt_targets = paddle.zeros_like(kpt_pred) kpt_weights = paddle.zeros_like(kpt_pred) area_targets = paddle.zeros((kpt_pred.shape[0], ), dtype=kpt_pred.dtype) if pos_inds.size == 0: return (labels, label_weights, kpt_targets, kpt_weights, area_targets, pos_inds, neg_inds) labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds][ ..., 0].astype("int64") img_h, img_w, _ = img_meta['img_shape'] # keypoint targets pos_gt_kpts = gt_keypoints[sampling_result.pos_assigned_gt_inds] pos_gt_kpts = pos_gt_kpts.reshape( (len(sampling_result.pos_assigned_gt_inds), -1, 3)) valid_idx = pos_gt_kpts[:, :, 2] > 0 pos_kpt_weights = kpt_weights[pos_inds].reshape( (pos_gt_kpts.shape[0], kpt_weights.shape[-1] // 2, 2)) # pos_kpt_weights[valid_idx][...] = 1.0 pos_kpt_weights = masked_fill(pos_kpt_weights, valid_idx.unsqueeze(-1), 1.0) kpt_weights[pos_inds] = pos_kpt_weights.reshape( (pos_kpt_weights.shape[0], kpt_pred.shape[-1])) factor = paddle.to_tensor( [img_w, img_h], dtype=kpt_pred.dtype).squeeze().unsqueeze(0) pos_gt_kpts_normalized = pos_gt_kpts[..., :2] pos_gt_kpts_normalized[..., 0] = pos_gt_kpts_normalized[..., 0] / \ factor[:, 0:1] pos_gt_kpts_normalized[..., 1] = pos_gt_kpts_normalized[..., 1] / \ factor[:, 1:2] kpt_targets[pos_inds] = pos_gt_kpts_normalized.reshape( (pos_gt_kpts.shape[0], kpt_pred.shape[-1])) pos_gt_areas = gt_areas[sampling_result.pos_assigned_gt_inds][..., 0] area_targets[pos_inds] = pos_gt_areas return (labels, label_weights, kpt_targets, kpt_weights, area_targets, pos_inds, neg_inds) def loss_single_rpn(self, cls_scores, kpt_preds, gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas): """Loss function for outputs from a single decoder layer of a single feature level. Args: cls_scores (Tensor): Box score logits from a single decoder layer for all images. Shape [bs, num_query, cls_out_channels]. kpt_preds (Tensor): Sigmoid outputs from a single decoder layer for all images, with normalized coordinate (x_{i}, y_{i}) and shape [bs, num_query, K*2]. gt_labels_list (list[Tensor]): Ground truth class indices for each image with shape (num_gts, ). gt_keypoints_list (list[Tensor]): Ground truth keypoints for each image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, p^{K}_y, p^{K}_v] format. gt_areas_list (list[Tensor]): Ground truth mask areas for each image with shape (num_gts, ). img_metas (list[dict]): List of image meta information. Returns: dict[str, Tensor]: A dictionary of loss components for outputs from a single decoder layer. 
""" num_imgs = cls_scores.shape[0] cls_scores_list = [cls_scores[i] for i in range(num_imgs)] kpt_preds_list = [kpt_preds[i] for i in range(num_imgs)] cls_reg_targets = self.get_targets(cls_scores_list, kpt_preds_list, gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas) (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list, area_targets_list, num_total_pos, num_total_neg) = cls_reg_targets labels = paddle.concat(labels_list, 0) label_weights = paddle.concat(label_weights_list, 0) kpt_targets = paddle.concat(kpt_targets_list, 0) kpt_weights = paddle.concat(kpt_weights_list, 0) # classification loss cls_scores = cls_scores.reshape((-1, self.cls_out_channels)) # construct weighted avg_factor to match with the official DETR repo cls_avg_factor = num_total_pos * 1.0 + \ num_total_neg * self.bg_cls_weight if self.sync_cls_avg_factor: cls_avg_factor = reduce_mean( paddle.to_tensor( [cls_avg_factor], dtype=cls_scores.dtype)) cls_avg_factor = max(cls_avg_factor, 1) cls_avg_factor = max(cls_avg_factor, 1) loss_cls = self.loss_cls( cls_scores, labels, label_weights, avg_factor=cls_avg_factor) # Compute the average number of gt keypoints accross all gpus, for # normalization purposes # num_total_pos = loss_cls.to_tensor([num_total_pos]) # num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item() # keypoint regression loss kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1])) num_valid_kpt = paddle.clip( reduce_mean(kpt_weights.sum()), min=1).item() # assert num_valid_kpt == (kpt_targets>0).sum().item() loss_kpt = self.loss_kpt_rpn( kpt_preds, kpt_targets, kpt_weights, avg_factor=num_valid_kpt) return loss_cls, loss_kpt def get_bboxes(self, all_cls_scores, all_kpt_preds, enc_cls_scores, enc_kpt_preds, hm_proto, memory, mlvl_masks, img_metas, rescale=False): """Transform network outputs for a batch into bbox predictions. Args: all_cls_scores (Tensor): Classification score of all decoder layers, has shape [nb_dec, bs, num_query, cls_out_channels]. all_kpt_preds (Tensor): Sigmoid regression outputs of all decode layers. Each is a 4D-tensor with normalized coordinate format (x_{i}, y_{i}) and shape [nb_dec, bs, num_query, K*2]. enc_cls_scores (Tensor): Classification scores of points on encode feature map, has shape (N, h*w, num_classes). Only be passed when as_two_stage is True, otherwise is None. enc_kpt_preds (Tensor): Regression results of each points on the encode feature map, has shape (N, h*w, K*2). Only be passed when as_two_stage is True, otherwise is None. img_metas (list[dict]): Meta information of each image. rescale (bool, optional): If True, return boxes in original image space. Defalut False. Returns: list[list[Tensor, Tensor]]: Each item in result_list is 3-tuple. The first item is an (n, 5) tensor, where the first 4 columns are bounding box positions (tl_x, tl_y, br_x, br_y) and the 5-th column is a score between 0 and 1. The second item is a (n,) tensor where each item is the predicted class label of the corresponding box. The third item is an (n, K, 3) tensor with [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, p^{K}_y, p^{K}_v] format. 
""" cls_scores = all_cls_scores[-1] kpt_preds = all_kpt_preds[-1] result_list = [] for img_id in range(len(img_metas)): cls_score = cls_scores[img_id] kpt_pred = kpt_preds[img_id] img_shape = img_metas[img_id]['img_shape'] scale_factor = img_metas[img_id]['scale_factor'] # TODO: only support single image test # memory_i = memory[:, img_id, :] # mlvl_mask = mlvl_masks[img_id] proposals = self._get_bboxes_single(cls_score, kpt_pred, img_shape, scale_factor, memory, mlvl_masks, rescale) result_list.append(proposals) return result_list def _get_bboxes_single(self, cls_score, kpt_pred, img_shape, scale_factor, memory, mlvl_masks, rescale=False): """Transform outputs from the last decoder layer into bbox predictions for each image. Args: cls_score (Tensor): Box score logits from the last decoder layer for each image. Shape [num_query, cls_out_channels]. kpt_pred (Tensor): Sigmoid outputs from the last decoder layer for each image, with coordinate format (x_{i}, y_{i}) and shape [num_query, K*2]. img_shape (tuple[int]): Shape of input image, (height, width, 3). scale_factor (ndarray, optional): Scale factor of the image arange as (w_scale, h_scale, w_scale, h_scale). rescale (bool, optional): If True, return boxes in original image space. Default False. Returns: tuple[Tensor]: Results of detected bboxes and labels. - det_bboxes: Predicted bboxes with shape [num_query, 5], where the first 4 columns are bounding box positions (tl_x, tl_y, br_x, br_y) and the 5-th column are scores between 0 and 1. - det_labels: Predicted labels of the corresponding box with shape [num_query]. - det_kpts: Predicted keypoints with shape [num_query, K, 3]. """ assert len(cls_score) == len(kpt_pred) max_per_img = self.test_cfg.get('max_per_img', self.num_query) # exclude background if self.loss_cls.use_sigmoid: cls_score = F.sigmoid(cls_score) scores, indexs = cls_score.reshape([-1]).topk(max_per_img) det_labels = indexs % self.num_classes bbox_index = indexs // self.num_classes kpt_pred = kpt_pred[bbox_index] else: scores, det_labels = F.softmax(cls_score, axis=-1)[..., :-1].max(-1) scores, bbox_index = scores.topk(max_per_img) kpt_pred = kpt_pred[bbox_index] det_labels = det_labels[bbox_index] # ----- results after pose decoder ----- # det_kpts = kpt_pred.reshape((kpt_pred.shape[0], -1, 2)) # ----- results after joint decoder (default) ----- # import time # start = time.time() refine_targets = (kpt_pred, None, None, paddle.ones_like(kpt_pred)) refine_outputs = self.forward_refine(memory, mlvl_masks, refine_targets, None, None) # end = time.time() # print(f'refine time: {end - start:.6f}') det_kpts = refine_outputs[-1] det_kpts[..., 0] = det_kpts[..., 0] * img_shape[1] det_kpts[..., 1] = det_kpts[..., 1] * img_shape[0] det_kpts[..., 0].clip_(min=0, max=img_shape[1]) det_kpts[..., 1].clip_(min=0, max=img_shape[0]) if rescale: det_kpts /= paddle.to_tensor( scale_factor[:2], dtype=det_kpts.dtype).unsqueeze(0).unsqueeze(0) # use circumscribed rectangle box of keypoints as det bboxes x1 = det_kpts[..., 0].min(axis=1, keepdim=True) y1 = det_kpts[..., 1].min(axis=1, keepdim=True) x2 = det_kpts[..., 0].max(axis=1, keepdim=True) y2 = det_kpts[..., 1].max(axis=1, keepdim=True) det_bboxes = paddle.concat([x1, y1, x2, y2], axis=1) det_bboxes = paddle.concat((det_bboxes, scores.unsqueeze(1)), -1) det_kpts = paddle.concat( (det_kpts, paddle.ones( det_kpts[..., :1].shape, dtype=det_kpts.dtype)), axis=2) return det_bboxes, det_labels, det_kpts def simple_test(self, feats, img_metas, rescale=False): """Test det bboxes without 
test-time augmentation. Args: feats (tuple[paddle.Tensor]): Multi-level features from the upstream network, each is a 4D-tensor. img_metas (list[dict]): List of image information. rescale (bool, optional): Whether to rescale the results. Defaults to False. Returns: list[tuple[Tensor, Tensor, Tensor]]: Each item in result_list is a 3-tuple. The first item is ``bboxes`` with shape (n, 5), where 5 represent (tl_x, tl_y, br_x, br_y, score). The second item is ``labels`` with shape (n,). The third item is ``kpts`` with shape (n, K, 3), in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, p^{K}_y, p^{K}_v] format. """ # forward of this head requires img_metas outs = self.forward(feats, img_metas) results_list = self.get_bboxes(*outs, img_metas, rescale=rescale) return results_list def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes): return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes) ================================================ FILE: ppdet/modeling/heads/pico_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Normal, Constant from ppdet.modeling.ops import get_static_shape from ..initializer import normal_ from ..assigners.utils import generate_anchors_for_grid_cell from ..bbox_utils import bbox_center, batch_distance2bbox, bbox2distance from ppdet.core.workspace import register from ppdet.modeling.layers import ConvNormLayer from .simota_head import OTAVFLHead from .gfl_head import Integral, GFLHead from ppdet.modeling.necks.csp_pan import DPModule eps = 1e-9 __all__ = ['PicoHead', 'PicoHeadV2', 'PicoFeat'] # global average pooling via mean, used on NPU devices where avg pooling kernels are slow def npu_avg_pool2d(feat, w, h): batch_size, channels, _, _ = feat.shape feat_flat = paddle.reshape(feat, [batch_size, channels, -1]) feat_mean = paddle.mean(feat_flat, axis=2) feat_mean = paddle.reshape( feat_mean, [batch_size, channels, w, h]) return feat_mean class PicoSE(nn.Layer): def __init__(self, feat_channels): super(PicoSE, self).__init__() self.fc = nn.Conv2D(feat_channels, feat_channels, 1) self.conv = ConvNormLayer(feat_channels, feat_channels, 1, 1) self._init_weights() def _init_weights(self): normal_(self.fc.weight, std=0.001) def forward(self, feat, avg_feat): weight = F.sigmoid(self.fc(avg_feat)) out = self.conv(feat * weight) return out @register class PicoFeat(nn.Layer): """ PicoFeat of PicoDet Args: feat_in (int): The channel number of input Tensor. feat_out (int): The channel number of output Tensor. num_fpn_stride (int): The number of FPN stride levels (feature maps). num_convs (int): The number of depthwise-separable conv blocks per branch. norm_type (str): Normalization type, 'bn'/'sync_bn'/'gn'. share_cls_reg (bool): Whether to share the cls and reg output. act (str): The activation function used in each layer. use_se (bool): Whether to use se module.
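    Example (editor's sketch, not part of the original docstring; the
    feat_in=96 and 80x80 feature size are assumed, matching PicoDet configs
    rather than the constructor defaults):

        >>> import paddle
        >>> pico_feat = PicoFeat(feat_in=96, feat_out=96, num_fpn_stride=3)
        >>> fpn_feat = paddle.randn([1, 96, 80, 80])   # one FPN level
        >>> cls_feat, reg_feat = pico_feat(fpn_feat, stage_idx=0)
        >>> cls_feat.shape  # [1, 96, 80, 80]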
""" def __init__(self, feat_in=256, feat_out=96, num_fpn_stride=3, num_convs=2, norm_type='bn', share_cls_reg=False, act='hard_swish', use_se=False): super(PicoFeat, self).__init__() self.num_convs = num_convs self.norm_type = norm_type self.share_cls_reg = share_cls_reg self.act = act self.use_se = use_se self.cls_convs = [] self.reg_convs = [] if paddle.device.get_device().startswith("npu"): self.device = "npu" else: self.device = None if use_se: assert share_cls_reg == True, \ 'In the case of using se, share_cls_reg must be set to True' self.se = nn.LayerList() for stage_idx in range(num_fpn_stride): cls_subnet_convs = [] reg_subnet_convs = [] for i in range(self.num_convs): in_c = feat_in if i == 0 else feat_out cls_conv_dw = self.add_sublayer( 'cls_conv_dw{}.{}'.format(stage_idx, i), ConvNormLayer( ch_in=in_c, ch_out=feat_out, filter_size=5, stride=1, groups=feat_out, norm_type=norm_type, bias_on=False, lr_scale=2.)) cls_subnet_convs.append(cls_conv_dw) cls_conv_pw = self.add_sublayer( 'cls_conv_pw{}.{}'.format(stage_idx, i), ConvNormLayer( ch_in=in_c, ch_out=feat_out, filter_size=1, stride=1, norm_type=norm_type, bias_on=False, lr_scale=2.)) cls_subnet_convs.append(cls_conv_pw) if not self.share_cls_reg: reg_conv_dw = self.add_sublayer( 'reg_conv_dw{}.{}'.format(stage_idx, i), ConvNormLayer( ch_in=in_c, ch_out=feat_out, filter_size=5, stride=1, groups=feat_out, norm_type=norm_type, bias_on=False, lr_scale=2.)) reg_subnet_convs.append(reg_conv_dw) reg_conv_pw = self.add_sublayer( 'reg_conv_pw{}.{}'.format(stage_idx, i), ConvNormLayer( ch_in=in_c, ch_out=feat_out, filter_size=1, stride=1, norm_type=norm_type, bias_on=False, lr_scale=2.)) reg_subnet_convs.append(reg_conv_pw) self.cls_convs.append(cls_subnet_convs) self.reg_convs.append(reg_subnet_convs) if use_se: self.se.append(PicoSE(feat_out)) def act_func(self, x): if self.act == "leaky_relu": x = F.leaky_relu(x) elif self.act == "hard_swish": x = F.hardswish(x) elif self.act == "relu6": x = F.relu6(x) return x def forward(self, fpn_feat, stage_idx): assert stage_idx < len(self.cls_convs) cls_feat = fpn_feat reg_feat = fpn_feat for i in range(len(self.cls_convs[stage_idx])): cls_feat = self.act_func(self.cls_convs[stage_idx][i](cls_feat)) reg_feat = cls_feat if not self.share_cls_reg: reg_feat = self.act_func(self.reg_convs[stage_idx][i](reg_feat)) if self.use_se: if self.device == "npu": avg_feat = npu_avg_pool2d(cls_feat, 1, 1) else: avg_feat = F.adaptive_avg_pool2d(cls_feat, (1, 1)) se_feat = self.act_func(self.se[stage_idx](cls_feat, avg_feat)) return cls_feat, se_feat return cls_feat, reg_feat @register class PicoHead(OTAVFLHead): """ PicoHead Args: conv_feat (object): Instance of 'PicoFeat' num_classes (int): Number of classes fpn_stride (list): The stride of each FPN Layer prior_prob (float): Used to set the bias init for the class prediction layer loss_class (object): Instance of VariFocalLoss. loss_dfl (object): Instance of DistributionFocalLoss. loss_bbox (object): Instance of bbox loss. assigner (object): Instance of label assigner. reg_max: Max value of integral set :math: `{0, ..., reg_max}` n QFL setting. Default: 7. 
""" __inject__ = [ 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', 'assigner', 'nms' ] __shared__ = ['num_classes', 'eval_size'] def __init__(self, conv_feat='PicoFeat', dgqp_module=None, num_classes=80, fpn_stride=[8, 16, 32], prior_prob=0.01, loss_class='VariFocalLoss', loss_dfl='DistributionFocalLoss', loss_bbox='GIoULoss', assigner='SimOTAAssigner', reg_max=16, feat_in_chan=96, nms=None, nms_pre=1000, cell_offset=0, eval_size=None): super(PicoHead, self).__init__( conv_feat=conv_feat, dgqp_module=dgqp_module, num_classes=num_classes, fpn_stride=fpn_stride, prior_prob=prior_prob, loss_class=loss_class, loss_dfl=loss_dfl, loss_bbox=loss_bbox, assigner=assigner, reg_max=reg_max, feat_in_chan=feat_in_chan, nms=nms, nms_pre=nms_pre, cell_offset=cell_offset) self.conv_feat = conv_feat self.num_classes = num_classes self.fpn_stride = fpn_stride self.prior_prob = prior_prob self.loss_vfl = loss_class self.loss_dfl = loss_dfl self.loss_bbox = loss_bbox self.assigner = assigner self.reg_max = reg_max self.feat_in_chan = feat_in_chan self.nms = nms self.nms_pre = nms_pre self.cell_offset = cell_offset self.eval_size = eval_size self.device = paddle.device.get_device() self.use_sigmoid = self.loss_vfl.use_sigmoid if self.use_sigmoid: self.cls_out_channels = self.num_classes else: self.cls_out_channels = self.num_classes + 1 bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) # Clear the super class initialization self.gfl_head_cls = None self.gfl_head_reg = None self.scales_regs = None self.head_cls_list = [] self.head_reg_list = [] for i in range(len(fpn_stride)): head_cls = self.add_sublayer( "head_cls" + str(i), nn.Conv2D( in_channels=self.feat_in_chan, out_channels=self.cls_out_channels + 4 * (self.reg_max + 1) if self.conv_feat.share_cls_reg else self.cls_out_channels, kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr( initializer=Constant(value=bias_init_value)))) self.head_cls_list.append(head_cls) if not self.conv_feat.share_cls_reg: head_reg = self.add_sublayer( "head_reg" + str(i), nn.Conv2D( in_channels=self.feat_in_chan, out_channels=4 * (self.reg_max + 1), kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) self.head_reg_list.append(head_reg) # initialize the anchor points if self.eval_size: self.anchor_points, self.stride_tensor = self._generate_anchors() def forward(self, fpn_feats, export_post_process=True): assert len(fpn_feats) == len( self.fpn_stride ), "The size of fpn_feats is not equal to size of fpn_stride" if self.training: return self.forward_train(fpn_feats) else: return self.forward_eval( fpn_feats, export_post_process=export_post_process) def forward_train(self, fpn_feats): cls_logits_list, bboxes_reg_list = [], [] for i, fpn_feat in enumerate(fpn_feats): conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat, i) if self.conv_feat.share_cls_reg: cls_logits = self.head_cls_list[i](conv_cls_feat) cls_score, bbox_pred = paddle.split( cls_logits, [self.cls_out_channels, 4 * (self.reg_max + 1)], axis=1) else: cls_score = self.head_cls_list[i](conv_cls_feat) bbox_pred = self.head_reg_list[i](conv_reg_feat) if self.dgqp_module: quality_score = self.dgqp_module(bbox_pred) cls_score = F.sigmoid(cls_score) * quality_score cls_logits_list.append(cls_score) bboxes_reg_list.append(bbox_pred) return (cls_logits_list, bboxes_reg_list) def forward_eval(self, fpn_feats, 
export_post_process=True): if self.eval_size: anchor_points, stride_tensor = self.anchor_points, self.stride_tensor else: anchor_points, stride_tensor = self._generate_anchors(fpn_feats) cls_logits_list, bboxes_reg_list = [], [] for i, fpn_feat in enumerate(fpn_feats): conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat, i) if self.conv_feat.share_cls_reg: cls_logits = self.head_cls_list[i](conv_cls_feat) cls_score, bbox_pred = paddle.split( cls_logits, [self.cls_out_channels, 4 * (self.reg_max + 1)], axis=1) else: cls_score = self.head_cls_list[i](conv_cls_feat) bbox_pred = self.head_reg_list[i](conv_reg_feat) if self.dgqp_module: quality_score = self.dgqp_module(bbox_pred) cls_score = F.sigmoid(cls_score) * quality_score if not export_post_process: # Now only supports batch size = 1 in deploy # TODO(ygh): support batch size > 1 cls_score_out = F.sigmoid(cls_score).reshape( [1, self.cls_out_channels, -1]).transpose([0, 2, 1]) bbox_pred = bbox_pred.reshape([1, (self.reg_max + 1) * 4, -1]).transpose([0, 2, 1]) else: _, _, h, w = fpn_feat.shape l = h * w cls_score_out = F.sigmoid( cls_score.reshape([-1, self.cls_out_channels, l])) bbox_pred = bbox_pred.transpose([0, 2, 3, 1]) bbox_pred = self.distribution_project(bbox_pred) bbox_pred = bbox_pred.reshape([-1, l, 4]) cls_logits_list.append(cls_score_out) bboxes_reg_list.append(bbox_pred) if export_post_process: cls_logits_list = paddle.concat(cls_logits_list, axis=-1) bboxes_reg_list = paddle.concat(bboxes_reg_list, axis=1) bboxes_reg_list = batch_distance2bbox(anchor_points, bboxes_reg_list) bboxes_reg_list *= stride_tensor return (cls_logits_list, bboxes_reg_list) def _generate_anchors(self, feats=None): # just use in eval time anchor_points = [] stride_tensor = [] for i, stride in enumerate(self.fpn_stride): if feats is not None: _, _, h, w = feats[i].shape else: h = math.ceil(self.eval_size[0] / stride) w = math.ceil(self.eval_size[1] / stride) shift_x = paddle.arange(end=w) + self.cell_offset shift_y = paddle.arange(end=h) + self.cell_offset shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) anchor_point = paddle.cast( paddle.stack( [shift_x, shift_y], axis=-1), dtype='float32') anchor_points.append(anchor_point.reshape([-1, 2])) stride_tensor.append( paddle.full( [h * w, 1], stride, dtype='float32')) anchor_points = paddle.concat(anchor_points) stride_tensor = paddle.concat(stride_tensor) return anchor_points, stride_tensor def post_process(self, head_outs, scale_factor, export_nms=True, nms_cpu=False): pred_scores, pred_bboxes = head_outs if not export_nms: return pred_bboxes, pred_scores else: # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale] scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) scale_factor = paddle.concat( [scale_x, scale_y, scale_x, scale_y], axis=-1).reshape([-1, 1, 4]) # scale bbox to origin image size. pred_bboxes /= scale_factor if nms_cpu: paddle.set_device("cpu") bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) paddle.set_device(self.device) else: bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) return bbox_pred, bbox_num @register class PicoHeadV2(GFLHead): """ PicoHeadV2 Args: conv_feat (object): Instance of 'PicoFeat' num_classes (int): Number of classes fpn_stride (list): The stride of each FPN Layer prior_prob (float): Used to set the bias init for the class prediction layer loss_class (object): Instance of VariFocalLoss. loss_dfl (object): Instance of DistributionFocalLoss. loss_bbox (object): Instance of bbox loss. 
assigner (object): Instance of label assigner. reg_max: Max value of integral set :math: `{0, ..., reg_max}` in QFL setting. Default: 16. """ __inject__ = [ 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', 'static_assigner', 'assigner', 'nms' ] __shared__ = ['num_classes', 'eval_size'] def __init__(self, conv_feat='PicoFeatV2', dgqp_module=None, num_classes=80, fpn_stride=[8, 16, 32], prior_prob=0.01, use_align_head=True, loss_class='VariFocalLoss', loss_dfl='DistributionFocalLoss', loss_bbox='GIoULoss', static_assigner_epoch=60, static_assigner='ATSSAssigner', assigner='TaskAlignedAssigner', reg_max=16, feat_in_chan=96, nms=None, nms_pre=1000, cell_offset=0, act='hard_swish', grid_cell_scale=5.0, eval_size=None): super(PicoHeadV2, self).__init__( conv_feat=conv_feat, dgqp_module=dgqp_module, num_classes=num_classes, fpn_stride=fpn_stride, prior_prob=prior_prob, loss_class=loss_class, loss_dfl=loss_dfl, loss_bbox=loss_bbox, reg_max=reg_max, feat_in_chan=feat_in_chan, nms=nms, nms_pre=nms_pre, cell_offset=cell_offset, ) self.conv_feat = conv_feat self.num_classes = num_classes self.fpn_stride = fpn_stride self.prior_prob = prior_prob self.loss_vfl = loss_class self.loss_dfl = loss_dfl self.loss_bbox = loss_bbox self.static_assigner_epoch = static_assigner_epoch self.static_assigner = static_assigner self.assigner = assigner self.reg_max = reg_max self.feat_in_chan = feat_in_chan self.nms = nms self.nms_pre = nms_pre self.cell_offset = cell_offset self.act = act self.grid_cell_scale = grid_cell_scale self.use_align_head = use_align_head self.cls_out_channels = self.num_classes self.eval_size = eval_size bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) # Clear the super class initialization self.gfl_head_cls = None self.gfl_head_reg = None self.scales_regs = None self.head_cls_list = nn.LayerList() self.head_reg_list = nn.LayerList() self.cls_align = nn.LayerList() for i in range(len(fpn_stride)): head_cls = self.add_sublayer( "head_cls" + str(i), nn.Conv2D( in_channels=self.feat_in_chan, out_channels=self.cls_out_channels, kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr( initializer=Constant(value=bias_init_value)))) self.head_cls_list.append(head_cls) head_reg = self.add_sublayer( "head_reg" + str(i), nn.Conv2D( in_channels=self.feat_in_chan, out_channels=4 * (self.reg_max + 1), kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) self.head_reg_list.append(head_reg) if self.use_align_head: self.cls_align.append( DPModule( self.feat_in_chan, 1, 5, act=self.act, use_act_in_out=False)) # initialize the anchor points if self.eval_size: self.anchor_points, self.stride_tensor = self._generate_anchors() def forward(self, fpn_feats, export_post_process=True): assert len(fpn_feats) == len( self.fpn_stride ), "The size of fpn_feats is not equal to size of fpn_stride" if self.training: return self.forward_train(fpn_feats) else: return self.forward_eval( fpn_feats, export_post_process=export_post_process) def forward_train(self, fpn_feats): cls_score_list, reg_list, box_list = [], [], [] for i, (fpn_feat, stride) in enumerate(zip(fpn_feats, self.fpn_stride)): b, _, h, w = get_static_shape(fpn_feat) # task decomposition conv_cls_feat, se_feat = self.conv_feat(fpn_feat, i) cls_logit = self.head_cls_list[i](se_feat) reg_pred = self.head_reg_list[i](se_feat) # cls prediction and alignment if
self.use_align_head: cls_prob = F.sigmoid(self.cls_align[i](conv_cls_feat)) cls_score = (F.sigmoid(cls_logit) * cls_prob + eps).sqrt() else: cls_score = F.sigmoid(cls_logit) cls_score_out = cls_score.transpose([0, 2, 3, 1]) bbox_pred = reg_pred.transpose([0, 2, 3, 1]) b, cell_h, cell_w, _ = cls_score_out.shape y, x = self.get_single_level_center_point( [cell_h, cell_w], stride, cell_offset=self.cell_offset) center_points = paddle.stack([x, y], axis=-1) cls_score_out = cls_score_out.reshape( [b, -1, self.cls_out_channels]) bbox_pred = self.distribution_project(bbox_pred) * stride bbox_pred = bbox_pred.reshape([b, cell_h * cell_w, 4]) bbox_pred = batch_distance2bbox( center_points, bbox_pred, max_shapes=None) cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) reg_list.append(reg_pred.flatten(2).transpose([0, 2, 1])) box_list.append(bbox_pred / stride) cls_score_list = paddle.concat(cls_score_list, axis=1) box_list = paddle.concat(box_list, axis=1) reg_list = paddle.concat(reg_list, axis=1) return cls_score_list, reg_list, box_list, fpn_feats def forward_eval(self, fpn_feats, export_post_process=True): if self.eval_size: anchor_points, stride_tensor = self.anchor_points, self.stride_tensor else: anchor_points, stride_tensor = self._generate_anchors(fpn_feats) cls_score_list, box_list = [], [] for i, (fpn_feat, stride) in enumerate(zip(fpn_feats, self.fpn_stride)): _, _, h, w = fpn_feat.shape # task decomposition conv_cls_feat, se_feat = self.conv_feat(fpn_feat, i) cls_logit = self.head_cls_list[i](se_feat) reg_pred = self.head_reg_list[i](se_feat) # cls prediction and alignment if self.use_align_head: cls_prob = F.sigmoid(self.cls_align[i](conv_cls_feat)) cls_score = (F.sigmoid(cls_logit) * cls_prob + eps).sqrt() else: cls_score = F.sigmoid(cls_logit) if not export_post_process: # Now only supports batch size = 1 in deploy cls_score_list.append( cls_score.reshape([1, self.cls_out_channels, -1]).transpose( [0, 2, 1])) box_list.append( reg_pred.reshape([1, (self.reg_max + 1) * 4, -1]).transpose( [0, 2, 1])) else: l = h * w cls_score_out = cls_score.reshape( [-1, self.cls_out_channels, l]) bbox_pred = reg_pred.transpose([0, 2, 3, 1]) bbox_pred = self.distribution_project(bbox_pred) bbox_pred = bbox_pred.reshape([-1, l, 4]) cls_score_list.append(cls_score_out) box_list.append(bbox_pred) if export_post_process: cls_score_list = paddle.concat(cls_score_list, axis=-1) box_list = paddle.concat(box_list, axis=1) box_list = batch_distance2bbox(anchor_points, box_list) box_list *= stride_tensor return cls_score_list, box_list def get_loss(self, head_outs, gt_meta): pred_scores, pred_regs, pred_bboxes, fpn_feats = head_outs gt_labels = gt_meta['gt_class'] gt_bboxes = gt_meta['gt_bbox'] gt_scores = gt_meta['gt_score'] if 'gt_score' in gt_meta else None num_imgs = gt_meta['im_id'].shape[0] pad_gt_mask = gt_meta['pad_gt_mask'] anchors, _, num_anchors_list, stride_tensor_list = generate_anchors_for_grid_cell( fpn_feats, self.fpn_stride, self.grid_cell_scale, self.cell_offset) centers = bbox_center(anchors) # label assignment if gt_meta['epoch_id'] < self.static_assigner_epoch: assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner( anchors, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes, gt_scores=gt_scores, pred_bboxes=pred_bboxes.detach() * stride_tensor_list) else: assigned_labels, assigned_bboxes, assigned_scores = self.assigner( pred_scores.detach(), pred_bboxes.detach() * stride_tensor_list, centers, num_anchors_list, gt_labels, gt_bboxes, 
pad_gt_mask, bg_index=self.num_classes, gt_scores=gt_scores) assigned_bboxes /= stride_tensor_list centers_shape = centers.shape flatten_centers = centers.expand( [num_imgs, centers_shape[0], centers_shape[1]]).reshape([-1, 2]) flatten_strides = stride_tensor_list.expand( [num_imgs, centers_shape[0], 1]).reshape([-1, 1]) flatten_cls_preds = pred_scores.reshape([-1, self.num_classes]) flatten_regs = pred_regs.reshape([-1, 4 * (self.reg_max + 1)]) flatten_bboxes = pred_bboxes.reshape([-1, 4]) flatten_bbox_targets = assigned_bboxes.reshape([-1, 4]) flatten_labels = assigned_labels.reshape([-1]) flatten_assigned_scores = assigned_scores.reshape( [-1, self.num_classes]) pos_inds = paddle.nonzero( paddle.logical_and((flatten_labels >= 0), (flatten_labels < self.num_classes)), as_tuple=False).squeeze(1) num_total_pos = len(pos_inds) if num_total_pos > 0: pos_bbox_targets = paddle.gather( flatten_bbox_targets, pos_inds, axis=0) pos_decode_bbox_pred = paddle.gather( flatten_bboxes, pos_inds, axis=0) pos_reg = paddle.gather(flatten_regs, pos_inds, axis=0) pos_strides = paddle.gather(flatten_strides, pos_inds, axis=0) pos_centers = paddle.gather( flatten_centers, pos_inds, axis=0) / pos_strides weight_targets = flatten_assigned_scores.detach() weight_targets = paddle.gather( weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) pred_corners = pos_reg.reshape([-1, self.reg_max + 1]) target_corners = bbox2distance(pos_centers, pos_bbox_targets, self.reg_max).reshape([-1]) # regression loss loss_bbox = paddle.sum( self.loss_bbox(pos_decode_bbox_pred, pos_bbox_targets) * weight_targets) # dfl loss loss_dfl = self.loss_dfl( pred_corners, target_corners, weight=weight_targets.expand([-1, 4]).reshape([-1]), avg_factor=4.0) else: loss_bbox = paddle.zeros([]) loss_dfl = paddle.zeros([]) avg_factor = flatten_assigned_scores.sum() if paddle.distributed.get_world_size() > 1: paddle.distributed.all_reduce(avg_factor) avg_factor = paddle.clip( avg_factor / paddle.distributed.get_world_size(), min=1) loss_vfl = self.loss_vfl( flatten_cls_preds, flatten_assigned_scores, avg_factor=avg_factor) loss_bbox = loss_bbox / avg_factor loss_dfl = loss_dfl / avg_factor loss_states = dict( loss_vfl=loss_vfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl) return loss_states def _generate_anchors(self, feats=None): # just use in eval time anchor_points = [] stride_tensor = [] for i, stride in enumerate(self.fpn_stride): if feats is not None: _, _, h, w = feats[i].shape else: h = math.ceil(self.eval_size[0] / stride) w = math.ceil(self.eval_size[1] / stride) shift_x = paddle.arange(end=w) + self.cell_offset shift_y = paddle.arange(end=h) + self.cell_offset shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) anchor_point = paddle.cast( paddle.stack( [shift_x, shift_y], axis=-1), dtype='float32') anchor_points.append(anchor_point.reshape([-1, 2])) stride_tensor.append( paddle.full( [h * w, 1], stride, dtype='float32')) anchor_points = paddle.concat(anchor_points) stride_tensor = paddle.concat(stride_tensor) return anchor_points, stride_tensor def post_process(self, head_outs, scale_factor, export_nms=True, nms_cpu=False): pred_scores, pred_bboxes = head_outs if not export_nms: return pred_bboxes, pred_scores else: # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale] scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) scale_factor = paddle.concat( [scale_x, scale_y, scale_x, scale_y], axis=-1).reshape([-1, 1, 4]) # scale bbox to origin image size. 
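        # Editor's note (illustrative, not from the source): `scale_factor`
        # arrives per image as [h_scale, w_scale]; the split/concat above
        # rearranges it to [w_scale, h_scale, w_scale, h_scale] so the
        # division below maps (x1, y1, x2, y2) back component-wise. E.g.
        # with h_scale=0.5 and w_scale=0.25, a predicted box
        # [100., 50., 200., 150.] becomes [400., 100., 800., 300.] in the
        # original image.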
pred_bboxes /= scale_factor bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) return bbox_pred, bbox_num ================================================ FILE: ppdet/modeling/heads/ppyoloe_contrast_head.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..initializer import bias_init_with_prob, constant_ from ..assigners.utils import generate_anchors_for_grid_cell from ppdet.modeling.heads.ppyoloe_head import PPYOLOEHead __all__ = ['PPYOLOEContrastHead'] @register class PPYOLOEContrastHead(PPYOLOEHead): __shared__ = [ 'num_classes', 'eval_size', 'trt', 'exclude_nms', 'exclude_post_process', 'use_shared_conv', 'for_distill' ] __inject__ = ['static_assigner', 'assigner', 'nms', 'contrast_loss'] def __init__(self, in_channels=[1024, 512, 256], num_classes=80, act='swish', fpn_strides=(32, 16, 8), grid_cell_scale=5.0, grid_cell_offset=0.5, reg_max=16, reg_range=None, static_assigner_epoch=4, use_varifocal_loss=True, static_assigner='ATSSAssigner', assigner='TaskAlignedAssigner', contrast_loss='SupContrast', nms='MultiClassNMS', eval_size=None, loss_weight={ 'class': 1.0, 'iou': 2.5, 'dfl': 0.5, }, trt=False, attn_conv='convbn', exclude_nms=False, exclude_post_process=False, use_shared_conv=True, for_distill=False): super().__init__(in_channels, num_classes, act, fpn_strides, grid_cell_scale, grid_cell_offset, reg_max, reg_range, static_assigner_epoch, use_varifocal_loss, static_assigner, assigner, nms, eval_size, loss_weight, trt, attn_conv, exclude_nms, exclude_post_process, use_shared_conv, for_distill) assert len(in_channels) > 0, "len(in_channels) should > 0" self.contrast_loss = contrast_loss self.contrast_encoder = nn.LayerList() for in_c in self.in_channels: self.contrast_encoder.append(nn.Conv2D(in_c, 128, 3, padding=1)) self._init_contrast_encoder() def _init_contrast_encoder(self): bias_en = bias_init_with_prob(0.01) for en_ in self.contrast_encoder: constant_(en_.weight) constant_(en_.bias, bias_en) def forward_train(self, feats, targets, aux_pred=None): anchors, anchor_points, num_anchors_list, stride_tensor = \ generate_anchors_for_grid_cell( feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset) cls_score_list, reg_distri_list = [], [] contrast_encoder_list = [] for i, feat in enumerate(feats): avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat) reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) contrast_logit = self.contrast_encoder[i](self.stem_cls[i]( feat, avg_feat) + feat) contrast_encoder_list.append( contrast_logit.flatten(2).transpose([0, 2, 1])) # cls and reg cls_score = F.sigmoid(cls_logit) cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1])) cls_score_list = 
paddle.concat(cls_score_list, axis=1) reg_distri_list = paddle.concat(reg_distri_list, axis=1) contrast_encoder_list = paddle.concat(contrast_encoder_list, axis=1) return self.get_loss([ cls_score_list, reg_distri_list, contrast_encoder_list, anchors, anchor_points, num_anchors_list, stride_tensor ], targets) def get_loss(self, head_outs, gt_meta): pred_scores, pred_distri, pred_contrast_encoder, anchors,\ anchor_points, num_anchors_list, stride_tensor = head_outs anchor_points_s = anchor_points / stride_tensor pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri) gt_labels = gt_meta['gt_class'] gt_bboxes = gt_meta['gt_bbox'] pad_gt_mask = gt_meta['pad_gt_mask'] # label assignment if gt_meta['epoch_id'] < self.static_assigner_epoch: assigned_labels, assigned_bboxes, assigned_scores = \ self.static_assigner( anchors, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes, pred_bboxes=pred_bboxes.detach() * stride_tensor) alpha_l = 0.25 else: if self.sm_use: assigned_labels, assigned_bboxes, assigned_scores = \ self.assigner( pred_scores.detach(), pred_bboxes.detach() * stride_tensor, anchor_points, stride_tensor, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes) else: assigned_labels, assigned_bboxes, assigned_scores = \ self.assigner( pred_scores.detach(), pred_bboxes.detach() * stride_tensor, anchor_points, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes) alpha_l = -1 # rescale bbox assigned_bboxes /= stride_tensor # cls loss if self.use_varifocal_loss: one_hot_label = F.one_hot(assigned_labels, self.num_classes + 1)[..., :-1] loss_cls = self._varifocal_loss(pred_scores, assigned_scores, one_hot_label) else: loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l) assigned_scores_sum = assigned_scores.sum() if paddle.distributed.get_world_size() > 1: paddle.distributed.all_reduce(assigned_scores_sum) assigned_scores_sum /= paddle.distributed.get_world_size() assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.) loss_cls /= assigned_scores_sum loss_l1, loss_iou, loss_dfl = \ self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s, assigned_labels, assigned_bboxes, assigned_scores, assigned_scores_sum) # contrast loss loss_contrast = self.contrast_loss(pred_contrast_encoder.reshape([-1, pred_contrast_encoder.shape[-1]]), \ assigned_labels.reshape([-1]), assigned_scores.max(-1).reshape([-1])) loss = self.loss_weight['class'] * loss_cls + \ self.loss_weight['iou'] * loss_iou + \ self.loss_weight['dfl'] * loss_dfl + \ self.loss_weight['contrast'] * loss_contrast out_dict = { 'loss': loss, 'loss_cls': loss_cls, 'loss_iou': loss_iou, 'loss_dfl': loss_dfl, 'loss_l1': loss_l1, 'loss_contrast': loss_contrast } return out_dict ================================================ FILE: ppdet/modeling/heads/ppyoloe_head.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
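# Editor's note on PPYOLOEContrastHead.get_loss above (observation by the
# editor, not from the source): it reads self.loss_weight['contrast'],
# while the default loss_weight dict only defines 'class', 'iou' and 'dfl',
# so a config is expected to supply the extra key, e.g. (the 0.2 value is a
# placeholder, not taken from the repo's configs):
#
#   PPYOLOEContrastHead:
#     loss_weight: {class: 1.0, iou: 2.5, dfl: 0.5, contrast: 0.2}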
import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from paddle import ParamAttr from paddle.nn.initializer import KaimingNormal from paddle.nn.initializer import Normal, Constant from ..bbox_utils import batch_distance2bbox from ..losses import GIoULoss from ..initializer import bias_init_with_prob, constant_, normal_ from ..assigners.utils import generate_anchors_for_grid_cell from ppdet.modeling.backbones.cspresnet import ConvBNLayer, RepVggBlock from ppdet.modeling.ops import get_static_shape, get_act_fn from ppdet.modeling.layers import MultiClassNMS __all__ = ['PPYOLOEHead', 'SimpleConvHead'] class ESEAttn(nn.Layer): def __init__(self, feat_channels, act='swish', attn_conv='convbn'): super(ESEAttn, self).__init__() self.fc = nn.Conv2D(feat_channels, feat_channels, 1) if attn_conv == 'convbn': self.conv = ConvBNLayer(feat_channels, feat_channels, 1, act=act) elif attn_conv == 'repvgg': self.conv = RepVggBlock(feat_channels, feat_channels, act=act) else: self.conv = None self._init_weights() def _init_weights(self): normal_(self.fc.weight, std=0.001) def forward(self, feat, avg_feat): weight = F.sigmoid(self.fc(avg_feat)) if self.conv: return self.conv(feat * weight) else: return feat * weight @register class PPYOLOEHead(nn.Layer): __shared__ = [ 'num_classes', 'eval_size', 'trt', 'exclude_nms', 'exclude_post_process', 'use_shared_conv', 'for_distill' ] __inject__ = ['static_assigner', 'assigner', 'nms'] def __init__(self, in_channels=[1024, 512, 256], num_classes=80, act='swish', fpn_strides=(32, 16, 8), grid_cell_scale=5.0, grid_cell_offset=0.5, reg_max=16, reg_range=None, static_assigner_epoch=4, use_varifocal_loss=True, static_assigner='ATSSAssigner', assigner='TaskAlignedAssigner', nms='MultiClassNMS', eval_size=None, loss_weight={ 'class': 1.0, 'iou': 2.5, 'dfl': 0.5, }, trt=False, attn_conv='convbn', exclude_nms=False, exclude_post_process=False, use_shared_conv=True, for_distill=False): super(PPYOLOEHead, self).__init__() assert len(in_channels) > 0, "len(in_channels) should > 0" self.in_channels = in_channels self.num_classes = num_classes self.fpn_strides = fpn_strides self.grid_cell_scale = grid_cell_scale self.grid_cell_offset = grid_cell_offset if reg_range: self.sm_use = True self.reg_range = reg_range else: self.sm_use = False self.reg_range = (0, reg_max + 1) self.reg_channels = self.reg_range[1] - self.reg_range[0] self.iou_loss = GIoULoss() self.loss_weight = loss_weight self.use_varifocal_loss = use_varifocal_loss self.eval_size = eval_size self.static_assigner_epoch = static_assigner_epoch self.static_assigner = static_assigner self.assigner = assigner self.nms = nms if isinstance(self.nms, MultiClassNMS) and trt: self.nms.trt = trt self.exclude_nms = exclude_nms self.exclude_post_process = exclude_post_process self.use_shared_conv = use_shared_conv self.for_distill = for_distill self.is_teacher = False # stem self.stem_cls = nn.LayerList() self.stem_reg = nn.LayerList() act = get_act_fn( act, trt=trt) if act is None or isinstance(act, (str, dict)) else act for in_c in self.in_channels: self.stem_cls.append(ESEAttn(in_c, act=act, attn_conv=attn_conv)) self.stem_reg.append(ESEAttn(in_c, act=act, attn_conv=attn_conv)) # pred head self.pred_cls = nn.LayerList() self.pred_reg = nn.LayerList() for in_c in self.in_channels: self.pred_cls.append( nn.Conv2D( in_c, self.num_classes, 3, padding=1)) self.pred_reg.append( nn.Conv2D( in_c, 4 * self.reg_channels, 3, padding=1)) # projection conv self.proj_conv = 
nn.Conv2D(self.reg_channels, 1, 1, bias_attr=False) self.proj_conv.skip_quant = True self._init_weights() if self.for_distill: self.distill_pairs = {} @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } def _init_weights(self): bias_cls = bias_init_with_prob(0.01) for cls_, reg_ in zip(self.pred_cls, self.pred_reg): constant_(cls_.weight) constant_(cls_.bias, bias_cls) constant_(reg_.weight) constant_(reg_.bias, 1.0) proj = paddle.linspace(self.reg_range[0], self.reg_range[1] - 1, self.reg_channels).reshape( [1, self.reg_channels, 1, 1]) self.proj_conv.weight.set_value(proj) self.proj_conv.weight.stop_gradient = True if self.eval_size: anchor_points, stride_tensor = self._generate_anchors() self.anchor_points = anchor_points self.stride_tensor = stride_tensor def m_avg_pool2d(self, feat, w, h): batch_size, channels, _, _ = feat.shape feat_flat = paddle.reshape(feat, [batch_size, channels, -1]) feat_mean = paddle.mean(feat_flat, axis=2) feat_mean = paddle.reshape( feat_mean, [batch_size, channels, w, h]) return feat_mean def forward_train(self, feats, targets, aux_pred=None): anchors, anchor_points, num_anchors_list, stride_tensor = \ generate_anchors_for_grid_cell( feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset) cls_score_list, reg_distri_list = [], [] for i, feat in enumerate(feats): if (paddle.get_device()[:3]=='npu'): avg_feat = self.m_avg_pool2d(feat, 1, 1) else: avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat) reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) # cls and reg cls_score = F.sigmoid(cls_logit) cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1])) cls_score_list = paddle.concat(cls_score_list, axis=1) reg_distri_list = paddle.concat(reg_distri_list, axis=1) if targets.get('is_teacher', False): pred_deltas, pred_dfls = self._bbox_decode_fake(reg_distri_list) return cls_score_list, pred_deltas * stride_tensor, pred_dfls if targets.get('get_data', False): pred_deltas, pred_dfls = self._bbox_decode_fake(reg_distri_list) return cls_score_list, pred_deltas * stride_tensor, pred_dfls return self.get_loss([ cls_score_list, reg_distri_list, anchors, anchor_points, num_anchors_list, stride_tensor ], targets, aux_pred) def _generate_anchors(self, feats=None, dtype='float32'): # just use in eval time anchor_points = [] stride_tensor = [] for i, stride in enumerate(self.fpn_strides): if feats is not None: _, _, h, w = feats[i].shape else: h = int(self.eval_size[0] / stride) w = int(self.eval_size[1] / stride) shift_x = paddle.arange(end=w) + self.grid_cell_offset shift_y = paddle.arange(end=h) + self.grid_cell_offset shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) anchor_point = paddle.cast( paddle.stack( [shift_x, shift_y], axis=-1), dtype=dtype) anchor_points.append(anchor_point.reshape([-1, 2])) stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype)) anchor_points = paddle.concat(anchor_points) stride_tensor = paddle.concat(stride_tensor) return anchor_points, stride_tensor def forward_eval(self, feats): if self.eval_size: anchor_points, stride_tensor = self.anchor_points, self.stride_tensor else: anchor_points, stride_tensor = self._generate_anchors(feats) cls_score_list, reg_dist_list = [], [] for i, feat in enumerate(feats): _, _, h, w = feat.shape l = h * w if (paddle.device.get_device()[:3]=='npu'): avg_feat = 
self.m_avg_pool2d(feat, 1, 1) else: avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat) reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) reg_dist = reg_dist.reshape( [-1, 4, self.reg_channels, l]).transpose([0, 2, 3, 1]) if self.use_shared_conv: reg_dist = self.proj_conv(F.softmax( reg_dist, axis=1)).squeeze(1) else: reg_dist = F.softmax(reg_dist, axis=1) # cls and reg cls_score = F.sigmoid(cls_logit) cls_score_list.append(cls_score.reshape([-1, self.num_classes, l])) reg_dist_list.append(reg_dist) cls_score_list = paddle.concat(cls_score_list, axis=-1) if self.use_shared_conv: reg_dist_list = paddle.concat(reg_dist_list, axis=1) else: reg_dist_list = paddle.concat(reg_dist_list, axis=2) reg_dist_list = self.proj_conv(reg_dist_list).squeeze(1) return cls_score_list, reg_dist_list, anchor_points, stride_tensor def forward(self, feats, targets=None, aux_pred=None): assert len(feats) == len(self.fpn_strides), \ "The size of feats is not equal to size of fpn_strides" if self.training: return self.forward_train(feats, targets, aux_pred) else: if targets is not None: # only for semi-det self.is_teacher = targets.get('is_teacher', False) if self.is_teacher: return self.forward_train(feats, targets, aux_pred=None) else: return self.forward_eval(feats) return self.forward_eval(feats) @staticmethod def _focal_loss(score, label, alpha=0.25, gamma=2.0): weight = (score - label).pow(gamma) if alpha > 0: alpha_t = alpha * label + (1 - alpha) * (1 - label) weight *= alpha_t loss = F.binary_cross_entropy( score, label, weight=weight, reduction='sum') return loss @staticmethod def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0): weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label loss = F.binary_cross_entropy( pred_score, gt_score, weight=weight, reduction='sum') return loss def _bbox_decode(self, anchor_points, pred_dist): _, l, _ = get_static_shape(pred_dist) pred_dist = F.softmax(pred_dist.reshape([-1, l, 4, self.reg_channels])) pred_dist = self.proj_conv(pred_dist.transpose([0, 3, 1, 2])).squeeze(1) return batch_distance2bbox(anchor_points, pred_dist) def _bbox_decode_fake(self, pred_dist): _, l, _ = get_static_shape(pred_dist) pred_dist_dfl = F.softmax( pred_dist.reshape([-1, l, 4, self.reg_channels])) pred_dist = self.proj_conv(pred_dist_dfl.transpose([0, 3, 1, 2 ])).squeeze(1) return pred_dist, pred_dist_dfl def _bbox2distance(self, points, bbox): x1y1, x2y2 = paddle.split(bbox, 2, -1) lt = points - x1y1 rb = x2y2 - points return paddle.concat([lt, rb], -1).clip(self.reg_range[0], self.reg_range[1] - 1 - 0.01) def _df_loss(self, pred_dist, target, lower_bound=0): target_left = paddle.cast(target.floor(), 'int64') target_right = target_left + 1 weight_left = target_right.astype('float32') - target weight_right = 1 - weight_left loss_left = F.cross_entropy( pred_dist, target_left - lower_bound, reduction='none') * weight_left loss_right = F.cross_entropy( pred_dist, target_right - lower_bound, reduction='none') * weight_right return (loss_left + loss_right).mean(-1, keepdim=True) def _bbox_loss(self, pred_dist, pred_bboxes, anchor_points, assigned_labels, assigned_bboxes, assigned_scores, assigned_scores_sum): # select positive samples mask mask_positive = (assigned_labels != self.num_classes) if self.for_distill: # only used for LD main_kd distill self.distill_pairs['mask_positive_select'] = mask_positive num_pos = mask_positive.sum() # pos/neg loss if num_pos > 0: # l1 + iou 
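            # Editor's note (illustrative shapes, assuming batch size B and
            # A total anchors): mask_positive is [B, A]; tiling it to
            # [B, A, 4] lets paddle.masked_select pull the positive boxes
            # out of pred_bboxes [B, A, 4] as a flat [-1, 4] tensor, and the
            # same trick with reg_channels * 4 channels gathers the matching
            # DFL logits below.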
bbox_mask = mask_positive.astype('int32').unsqueeze(-1).tile( [1, 1, 4]).astype('bool') pred_bboxes_pos = paddle.masked_select(pred_bboxes, bbox_mask).reshape([-1, 4]) assigned_bboxes_pos = paddle.masked_select( assigned_bboxes, bbox_mask).reshape([-1, 4]) bbox_weight = paddle.masked_select( assigned_scores.sum(-1), mask_positive).unsqueeze(-1) loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos) loss_iou = self.iou_loss(pred_bboxes_pos, assigned_bboxes_pos) * bbox_weight loss_iou = loss_iou.sum() / assigned_scores_sum dist_mask = mask_positive.unsqueeze(-1).astype('int32').tile( [1, 1, self.reg_channels * 4]).astype('bool') pred_dist_pos = paddle.masked_select( pred_dist, dist_mask).reshape([-1, 4, self.reg_channels]) assigned_ltrb = self._bbox2distance(anchor_points, assigned_bboxes) assigned_ltrb_pos = paddle.masked_select( assigned_ltrb, bbox_mask).reshape([-1, 4]) loss_dfl = self._df_loss(pred_dist_pos, assigned_ltrb_pos, self.reg_range[0]) * bbox_weight loss_dfl = loss_dfl.sum() / assigned_scores_sum if self.for_distill: self.distill_pairs['pred_bboxes_pos'] = pred_bboxes_pos self.distill_pairs['pred_dist_pos'] = pred_dist_pos self.distill_pairs['bbox_weight'] = bbox_weight else: loss_l1 = paddle.zeros([]) loss_iou = paddle.zeros([]) loss_dfl = pred_dist.sum() * 0. return loss_l1, loss_iou, loss_dfl def get_loss(self, head_outs, gt_meta, aux_pred=None): pred_scores, pred_distri, anchors,\ anchor_points, num_anchors_list, stride_tensor = head_outs anchor_points_s = anchor_points / stride_tensor pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri) if aux_pred is not None: pred_scores_aux = aux_pred[0] pred_bboxes_aux = self._bbox_decode(anchor_points_s, aux_pred[1]) if 'origin_gt_class' in gt_meta: gt_labels = gt_meta['origin_gt_class'] gt_bboxes = gt_meta['origin_gt_bbox'] pad_gt_mask = gt_meta['pad_origin_gt_mask'] else: gt_labels = gt_meta['gt_class'] gt_bboxes = gt_meta['gt_bbox'] pad_gt_mask = gt_meta['pad_gt_mask'] # label assignment if gt_meta['epoch_id'] < self.static_assigner_epoch: assigned_labels, assigned_bboxes, assigned_scores = \ self.static_assigner( anchors, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes, pred_bboxes=pred_bboxes.detach() * stride_tensor) alpha_l = 0.25 else: if self.sm_use: # only used in smalldet of PPYOLOE-SOD model assigned_labels, assigned_bboxes, assigned_scores = \ self.assigner( pred_scores.detach(), pred_bboxes.detach() * stride_tensor, anchor_points, stride_tensor, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes) else: if aux_pred is None: if not hasattr(self, "assigned_labels"): assigned_labels, assigned_bboxes, assigned_scores = \ self.assigner( pred_scores.detach(), pred_bboxes.detach() * stride_tensor, anchor_points, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes) if self.for_distill: self.assigned_labels = assigned_labels self.assigned_bboxes = assigned_bboxes self.assigned_scores = assigned_scores else: # only used in distill assigned_labels = self.assigned_labels assigned_bboxes = self.assigned_bboxes assigned_scores = self.assigned_scores else: assigned_labels, assigned_bboxes, assigned_scores = \ self.assigner( pred_scores_aux.detach(), pred_bboxes_aux.detach() * stride_tensor, anchor_points, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes) alpha_l = -1 # rescale bbox assigned_bboxes /= stride_tensor assign_out_dict = self.get_loss_from_assign( pred_scores, pred_distri, pred_bboxes, anchor_points_s, 
assigned_labels, assigned_bboxes, assigned_scores, alpha_l) if aux_pred is not None: assign_out_dict_aux = self.get_loss_from_assign( aux_pred[0], aux_pred[1], pred_bboxes_aux, anchor_points_s, assigned_labels, assigned_bboxes, assigned_scores, alpha_l) loss = {} for key in assign_out_dict.keys(): loss[key] = assign_out_dict[key] + assign_out_dict_aux[key] else: loss = assign_out_dict return loss def get_loss_from_assign(self, pred_scores, pred_distri, pred_bboxes, anchor_points_s, assigned_labels, assigned_bboxes, assigned_scores, alpha_l): # cls loss if self.use_varifocal_loss: one_hot_label = F.one_hot(assigned_labels, self.num_classes + 1)[..., :-1] loss_cls = self._varifocal_loss(pred_scores, assigned_scores, one_hot_label) else: loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l) assigned_scores_sum = assigned_scores.sum() if paddle.distributed.get_world_size() > 1: paddle.distributed.all_reduce(assigned_scores_sum) assigned_scores_sum /= paddle.distributed.get_world_size() assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.) loss_cls /= assigned_scores_sum if self.for_distill: self.distill_pairs['pred_cls_scores'] = pred_scores self.distill_pairs['pos_num'] = assigned_scores_sum self.distill_pairs['assigned_scores'] = assigned_scores one_hot_label = F.one_hot(assigned_labels, self.num_classes + 1)[..., :-1] self.distill_pairs['target_labels'] = one_hot_label loss_l1, loss_iou, loss_dfl = \ self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s, assigned_labels, assigned_bboxes, assigned_scores, assigned_scores_sum) loss = self.loss_weight['class'] * loss_cls + \ self.loss_weight['iou'] * loss_iou + \ self.loss_weight['dfl'] * loss_dfl out_dict = { 'loss': loss, 'loss_cls': loss_cls, 'loss_iou': loss_iou, 'loss_dfl': loss_dfl, 'loss_l1': loss_l1, } return out_dict def post_process(self, head_outs, scale_factor): pred_scores, pred_dist, anchor_points, stride_tensor = head_outs pred_bboxes = batch_distance2bbox(anchor_points, pred_dist) pred_bboxes *= stride_tensor if self.exclude_post_process: return paddle.concat( [pred_bboxes, pred_scores.transpose([0, 2, 1])], axis=-1), None, None else: # scale bbox to origin scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) scale_factor = paddle.concat( [scale_x, scale_y, scale_x, scale_y], axis=-1).reshape([-1, 1, 4]) pred_bboxes /= scale_factor if self.exclude_nms: # `exclude_nms=True` just use in benchmark return pred_bboxes, pred_scores, None else: bbox_pred, bbox_num, nms_keep_idx = self.nms(pred_bboxes, pred_scores) return bbox_pred, bbox_num, nms_keep_idx def get_activation(name="LeakyReLU"): if name == "silu": module = nn.Silu() elif name == "relu": module = nn.ReLU() elif name in ["LeakyReLU", 'leakyrelu', 'lrelu']: module = nn.LeakyReLU(0.1) elif name is None: module = nn.Identity() else: raise AttributeError("Unsupported act type: {}".format(name)) return module class ConvNormLayer(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, norm_type='gn', activation="LeakyReLU"): super(ConvNormLayer, self).__init__() assert norm_type in ['bn', 'sync_bn', 'syncbn', 'gn', None] self.conv = nn.Conv2D( in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias_attr=False, weight_attr=ParamAttr(initializer=KaimingNormal())) if norm_type in ['bn', 'sync_bn', 'syncbn']: self.norm = nn.BatchNorm2D(out_channels) elif norm_type == 'gn': self.norm = nn.GroupNorm(num_groups=32, num_channels=out_channels) 
else: self.norm = None self.act = get_activation(activation) def forward(self, x): y = self.conv(x) if self.norm is not None: y = self.norm(y) y = self.act(y) return y class ScaleReg(nn.Layer): """ Parameter for scaling the regression outputs. """ def __init__(self, scale=1.0): super(ScaleReg, self).__init__() scale = paddle.to_tensor(scale) self.scale = self.create_parameter( shape=[1], dtype='float32', default_initializer=nn.initializer.Assign(scale)) def forward(self, x): return x * self.scale @register class SimpleConvHead(nn.Layer): __shared__ = ['num_classes'] def __init__(self, num_classes=80, feat_in=288, feat_out=288, num_convs=1, fpn_strides=[32, 16, 8, 4], norm_type='gn', act='LeakyReLU', prior_prob=0.01, reg_max=16): super(SimpleConvHead, self).__init__() self.num_classes = num_classes self.feat_in = feat_in self.feat_out = feat_out self.num_convs = num_convs self.fpn_strides = fpn_strides self.reg_max = reg_max self.cls_convs = nn.LayerList() self.reg_convs = nn.LayerList() for i in range(self.num_convs): in_c = feat_in if i == 0 else feat_out self.cls_convs.append( ConvNormLayer( in_c, feat_out, 3, stride=1, padding=1, norm_type=norm_type, activation=act)) self.reg_convs.append( ConvNormLayer( in_c, feat_out, 3, stride=1, padding=1, norm_type=norm_type, activation=act)) bias_cls = bias_init_with_prob(prior_prob) self.gfl_cls = nn.Conv2D( feat_out, self.num_classes, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=bias_cls))) self.gfl_reg = nn.Conv2D( feat_out, 4 * (self.reg_max + 1), kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0))) self.scales = nn.LayerList() for i in range(len(self.fpn_strides)): self.scales.append(ScaleReg(1.0)) def forward(self, feats): cls_scores = [] bbox_preds = [] for x, scale in zip(feats, self.scales): cls_feat = x reg_feat = x for cls_conv in self.cls_convs: cls_feat = cls_conv(cls_feat) for reg_conv in self.reg_convs: reg_feat = reg_conv(reg_feat) cls_score = self.gfl_cls(cls_feat) cls_score = F.sigmoid(cls_score) cls_score = cls_score.flatten(2).transpose([0, 2, 1]) cls_scores.append(cls_score) bbox_pred = scale(self.gfl_reg(reg_feat)) bbox_pred = bbox_pred.flatten(2).transpose([0, 2, 1]) bbox_preds.append(bbox_pred) cls_scores = paddle.concat(cls_scores, axis=1) bbox_preds = paddle.concat(bbox_preds, axis=1) return cls_scores, bbox_preds ================================================ FILE: ppdet/modeling/heads/ppyoloe_ins_head.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
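# Editor's sketch for ScaleReg defined in ppyoloe_head.py above (not from
# the source): it is a single learnable scalar applied per FPN level, so
# regression magnitudes can adapt to each stride independently, e.g.:
#
#   import paddle
#   scale = ScaleReg(scale=1.0)
#   out = scale(paddle.ones([2, 68, 10, 10]))  # multiplies by the parameter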
import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling.backbones.csp_darknet import BaseConv from ppdet.modeling.layers import MultiClassNMS from ppdet.modeling.ops import get_static_shape, get_act_fn from .ppyoloe_head import ESEAttn from ..assigners.utils import generate_anchors_for_grid_cell from ..bbox_utils import batch_distance2bbox from ..initializer import bias_init_with_prob, constant_ from ..losses import GIoULoss __all__ = ['PPYOLOEInsHead'] def custom_binary_cross_entropy_with_logits(x, y): max_val = paddle.maximum(-x, paddle.to_tensor(0.0)) loss = (1 - y) * x + max_val + paddle.log( paddle.exp(-max_val) + paddle.exp(-x - max_val)) return loss class MaskProto(nn.Layer): # YOLOv8 mask Proto module for instance segmentation models def __init__(self, ch_in, num_protos=256, num_masks=32, act='silu'): super().__init__() self.conv1 = BaseConv(ch_in, num_protos, 3, 1, act=act) self.upsample = nn.Conv2DTranspose(num_protos, num_protos, 2, 2, 0, bias_attr=True) self.conv2 = BaseConv(num_protos, num_protos, 3, 1, act=act) self.conv3 = BaseConv(num_protos, num_masks, 1, 1, act=act) def forward(self, x): return self.conv3(self.conv2(self.upsample(self.conv1(x)))) def xyxy2xywh(x): """ Convert bounding box coordinates from (x1, y1, x2, y2) format to (x, y, width, height) format where (x1, y1) is the top-left corner and (x2, y2) is the bottom-right corner. """ assert x.shape[ -1] == 4, f'input shape last dimension expected 4 but input shape is {x.shape}' y = paddle.empty_like(x) if isinstance( x, paddle.Tensor) else np.empty_like(x) # faster than clone/copy y[..., 0] = (x[..., 0] + x[..., 2]) / 2 # x center y[..., 1] = (x[..., 1] + x[..., 3]) / 2 # y center y[..., 2] = x[..., 2] - x[..., 0] # width y[..., 3] = x[..., 3] - x[..., 1] # height return y def crop_mask(masks, boxes): """ It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box. Args: masks (paddle.Tensor): [n, h, w] tensor of masks boxes (paddle.Tensor): [n, 4] tensor of bbox coordinates in relative point form Returns: (paddle.Tensor): The masks cropped to their bounding boxes. """ _, h, w = masks.shape x1, y1, x2, y2 = paddle.chunk(boxes[:, :, None], 4, axis=1) r = paddle.arange(w, dtype=x1.dtype)[None, None, :] c = paddle.arange(h, dtype=y1.dtype)[None, :, None] if "npu" in paddle.device.get_all_custom_device_type(): # bool tensor broadcast multiply is extremely slow on npu, so we cast it to float32. m_dtype = masks.dtype return masks * ((r >= x1).cast(m_dtype) * (r < x2).cast(m_dtype) * (c >= y1).cast(m_dtype) * (c < y2).cast(m_dtype)) else: return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)).astype(masks.dtype) def process_mask_upsample(protos, masks_in, bboxes, shape): """ It takes the output of the mask head, and applies the mask to the bounding boxes. This produces masks of higher quality but is slower. Args: protos (paddle.Tensor): [mask_dim, mask_h, mask_w] masks_in (paddle.Tensor): [n, mask_dim], n is number of masks after nms bboxes (paddle.Tensor): [n, 4], n is number of masks after nms shape (tuple): the size of the input image (h,w) Returns: (paddle.Tensor): The upsampled masks.
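    Example (editor's note, illustrative shapes only; mask_dim=32 and a
    640x640 input are assumed):

        >>> protos = paddle.randn([32, 160, 160])
        >>> masks_in = paddle.randn([5, 32])       # 5 detections kept after NMS
        >>> bboxes = paddle.to_tensor([[0., 0., 320., 320.]] * 5)
        >>> masks = process_mask_upsample(protos, masks_in, bboxes, (640, 640))
        >>> masks.shape  # [5, 640, 640]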
""" c, mh, mw = protos.shape # CHW masks = F.sigmoid(masks_in @ protos.reshape([c, -1])).reshape([-1, mh, mw]) masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0] # CHW masks = crop_mask(masks, bboxes) # CHW return masks @register class PPYOLOEInsHead(nn.Layer): __shared__ = [ 'num_classes', 'eval_size', 'trt', 'exclude_nms', 'exclude_post_process', 'use_shared_conv', 'for_distill', 'width_mult' ] __inject__ = ['static_assigner', 'assigner', 'nms'] def __init__(self, in_channels=[1024, 512, 256], num_classes=80, act='swish', fpn_strides=(32, 16, 8), grid_cell_scale=5.0, grid_cell_offset=0.5, reg_max=16, reg_range=None, static_assigner_epoch=4, use_varifocal_loss=True, static_assigner='ATSSAssigner', assigner='TaskAlignedAssigner', nms='MultiClassNMS', eval_size=None, loss_weight={ 'class': 1.0, 'iou': 2.5, 'dfl': 0.5, }, trt=False, attn_conv='convbn', exclude_nms=False, exclude_post_process=False, use_shared_conv=True, mask_thr_binary=0.5, num_masks=32, num_protos=256, width_mult=1.0, for_distill=False): super(PPYOLOEInsHead, self).__init__() assert len(in_channels) > 0, "len(in_channels) should > 0" self.mask_thr_binary = mask_thr_binary self.num_masks = num_masks self.num_protos = int(num_protos * width_mult) self.in_channels = in_channels self.num_classes = num_classes self.fpn_strides = fpn_strides self.grid_cell_scale = grid_cell_scale self.grid_cell_offset = grid_cell_offset if reg_range: self.sm_use = True self.reg_range = reg_range else: self.sm_use = False self.reg_range = (0, reg_max + 1) self.reg_channels = self.reg_range[1] - self.reg_range[0] self.iou_loss = GIoULoss() self.loss_weight = loss_weight self.use_varifocal_loss = use_varifocal_loss self.eval_size = eval_size self.static_assigner_epoch = static_assigner_epoch self.static_assigner = static_assigner self.assigner = assigner self.nms = nms if isinstance(self.nms, MultiClassNMS) and trt: self.nms.trt = trt self.exclude_nms = exclude_nms self.exclude_post_process = exclude_post_process self.use_shared_conv = use_shared_conv self.for_distill = for_distill self.is_teacher = False # stem self.stem_cls = nn.LayerList() self.stem_reg = nn.LayerList() self.stem_ins = nn.LayerList() act = get_act_fn( act, trt=trt) if act is None or isinstance(act, (str, dict)) else act for in_c in self.in_channels: self.stem_cls.append(ESEAttn(in_c, act=act, attn_conv=attn_conv)) self.stem_reg.append(ESEAttn(in_c, act=act, attn_conv=attn_conv)) self.stem_ins.append(ESEAttn(in_c, act=act, attn_conv=attn_conv)) # pred head self.pred_cls = nn.LayerList() self.pred_reg = nn.LayerList() self.pred_ins = nn.LayerList() for in_c in self.in_channels: self.pred_cls.append( nn.Conv2D(in_c, self.num_classes, 3, padding=1)) self.pred_reg.append( nn.Conv2D(in_c, 4 * self.reg_channels, 3, padding=1)) self.pred_ins.append(nn.Conv2D(in_c, self.num_masks, 3, padding=1)) # projection conv self.proj_conv = nn.Conv2D(self.reg_channels, 1, 1, bias_attr=False) self.proj_conv.skip_quant = True self._init_weights() self.proto = MaskProto(in_channels[-1], self.num_protos, self.num_masks, act=act) if self.for_distill: self.distill_pairs = {} @classmethod def from_config(cls, cfg, input_shape): return { 'in_channels': [i.channels for i in input_shape], } def _init_weights(self): bias_cls = bias_init_with_prob(0.01) for cls_, reg_ in zip(self.pred_cls, self.pred_reg): constant_(cls_.weight) constant_(cls_.bias, bias_cls) constant_(reg_.weight) constant_(reg_.bias, 1.0) proj = paddle.linspace(self.reg_range[0], self.reg_range[1] - 1, 
self.reg_channels).reshape( [1, self.reg_channels, 1, 1]) self.proj_conv.weight.set_value(proj) self.proj_conv.weight.stop_gradient = True if self.eval_size: anchor_points, stride_tensor = self._generate_anchors() self.anchor_points = anchor_points self.stride_tensor = stride_tensor def forward_train(self, feats, targets): anchors, anchor_points, num_anchors_list, stride_tensor = \ generate_anchors_for_grid_cell( feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset) cls_score_list, reg_distri_list = [], [] mask_feat = self.proto(feats[-1]) mask_coeff_list = [] for i, feat in enumerate(feats): _, _, h, w = feat.shape l = h * w if "npu" in paddle.device.get_all_custom_device_type( ): # backward in avgpool is extremely slow in npu kernel, replace it with mean avg_feat = feat.mean(axis=[2, 3], keepdim=True) else: avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat) reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) msk_coeff = self.pred_ins[i](self.stem_ins[i](feat, avg_feat) + feat) # cls and reg cls_score = F.sigmoid(cls_logit) cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) mask_coeff_list.append(msk_coeff.flatten(2).transpose([0, 2, 1])) ### reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1])) cls_score_list = paddle.concat(cls_score_list, axis=1) mask_coeff_list = paddle.concat(mask_coeff_list, axis=1) reg_distri_list = paddle.concat(reg_distri_list, axis=1) return self.get_loss([ cls_score_list, reg_distri_list, mask_coeff_list, mask_feat, anchors, anchor_points, num_anchors_list, stride_tensor ], targets) def _generate_anchors(self, feats=None, dtype='float32'): # just use in eval time anchor_points = [] stride_tensor = [] for i, stride in enumerate(self.fpn_strides): if feats is not None: _, _, h, w = feats[i].shape else: h = int(self.eval_size[0] / stride) w = int(self.eval_size[1] / stride) shift_x = paddle.arange(end=w) + self.grid_cell_offset shift_y = paddle.arange(end=h) + self.grid_cell_offset shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) anchor_point = paddle.cast(paddle.stack([shift_x, shift_y], axis=-1), dtype=dtype) anchor_points.append(anchor_point.reshape([-1, 2])) stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype)) anchor_points = paddle.concat(anchor_points) stride_tensor = paddle.concat(stride_tensor) return anchor_points, stride_tensor def forward_eval(self, feats): mask_proto = self.proto(feats[-1]) if self.eval_size: anchor_points, stride_tensor = self.anchor_points, self.stride_tensor else: anchor_points, stride_tensor = self._generate_anchors(feats) cls_score_list, reg_dist_list, pred_mask_list = [], [], [] feats_shapes = [] for i, feat in enumerate(feats): _, _, h, w = feat.shape l = h * w feats_shapes.append(l) if "npu" in paddle.device.get_all_custom_device_type(): # backward in avgpool is extremely slow in npu kernel, replace it with mean avg_feat = feat.mean(axis=[2, 3], keepdim=True) else: avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat) reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) mask_coeff = self.pred_ins[i](self.stem_ins[i](feat, avg_feat) + feat) pred_mask_list.append(mask_coeff.reshape([-1, self.num_masks, l])) reg_dist = reg_dist.reshape([-1, 4, self.reg_channels, l]).transpose([0, 2, 3, 1]) if self.use_shared_conv: reg_dist = self.proj_conv(F.softmax(reg_dist, axis=1)).squeeze(1) else: reg_dist = F.softmax(reg_dist, axis=1) 
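            # Note: `self.proj_conv` is a frozen 1x1 conv whose weight is the
            # bin-value vector linspace(reg_range[0], reg_range[1] - 1) set in
            # `_init_weights`, so applying it after the softmax over the
            # `reg_channels` bins computes the expectation of the predicted
            # distance distribution (DFL-style decoding), one scalar per side.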
# cls and reg cls_score = F.sigmoid(cls_logit) cls_score_list.append(cls_score.reshape([-1, self.num_classes, l])) reg_dist_list.append(reg_dist) cls_score_list = paddle.concat(cls_score_list, axis=-1) pred_mask_list = paddle.concat(pred_mask_list, axis=-1) if self.use_shared_conv: reg_dist_list = paddle.concat(reg_dist_list, axis=1) else: reg_dist_list = paddle.concat(reg_dist_list, axis=2) reg_dist_list = self.proj_conv(reg_dist_list).squeeze(1) return cls_score_list, reg_dist_list, pred_mask_list, mask_proto, anchor_points, stride_tensor def forward(self, feats, targets=None): assert len(feats) == len(self.fpn_strides), \ "The size of feats is not equal to size of fpn_strides" if self.training: return self.forward_train(feats, targets) else: return self.forward_eval(feats) @staticmethod def _focal_loss(score, label, alpha=0.25, gamma=2.0): weight = (score - label).pow(gamma) if alpha > 0: alpha_t = alpha * label + (1 - alpha) * (1 - label) weight *= alpha_t loss = F.binary_cross_entropy(score, label, weight=weight, reduction='sum') return loss @staticmethod def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0): weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label loss = F.binary_cross_entropy(pred_score, gt_score, weight=weight, reduction='sum') return loss def _bbox_decode(self, anchor_points, pred_dist): _, l, _ = get_static_shape(pred_dist) pred_dist = F.softmax(pred_dist.reshape([-1, l, 4, self.reg_channels])) pred_dist = self.proj_conv(pred_dist.transpose([0, 3, 1, 2])).squeeze(1) return batch_distance2bbox(anchor_points, pred_dist) def _bbox_decode_fake(self, pred_dist): _, l, _ = get_static_shape(pred_dist) pred_dist_dfl = F.softmax( pred_dist.reshape([-1, l, 4, self.reg_channels])) pred_dist = self.proj_conv(pred_dist_dfl.transpose([0, 3, 1, 2])).squeeze(1) return pred_dist, pred_dist_dfl def _bbox2distance(self, points, bbox): x1y1, x2y2 = paddle.split(bbox, 2, -1) lt = points - x1y1 rb = x2y2 - points if "npu" in paddle.device.get_all_custom_device_type( ): # npu clip kernel causes nan grad, replace it with maximum & minimum. 
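            # (Distances are clipped into [reg_range[0], reg_range[1] - 1) so
            # every regression target falls inside the discrete DFL bin range.)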
            out = paddle.concat([lt, rb], -1)
            out = paddle.maximum(
                out, paddle.to_tensor(self.reg_range[0], dtype=out.dtype))
            out = paddle.minimum(
                out,
                paddle.to_tensor(
                    self.reg_range[1] - 1 - 0.01, dtype=out.dtype))
            return out
        else:
            return paddle.concat([lt, rb], -1).clip(
                self.reg_range[0], self.reg_range[1] - 1 - 0.01)

    def _df_loss(self, pred_dist, target, lower_bound=0):
        target_left = paddle.cast(target.floor(), 'int64')
        target_right = target_left + 1
        weight_left = target_right.astype('float32') - target
        weight_right = 1 - weight_left
        loss_left = F.cross_entropy(
            pred_dist, target_left - lower_bound,
            reduction='none') * weight_left
        loss_right = F.cross_entropy(
            pred_dist, target_right - lower_bound,
            reduction='none') * weight_right
        return (loss_left + loss_right).mean(-1, keepdim=True)

    def get_loss(self, head_outs, gt_meta):
        assert 'gt_bbox' in gt_meta and 'gt_class' in gt_meta
        assert 'gt_segm' in gt_meta
        pred_scores, pred_distri, pred_mask_coeffs, mask_proto, anchors, \
            anchor_points, num_anchors_list, stride_tensor = head_outs

        bs = pred_scores.shape[0]
        imgsz = paddle.to_tensor(
            [640, 640]
        )  # paddle.to_tensor(pred_scores[0].shape[2:]) * self.fpn_strides[0]  # image size (h,w)
        mask_h, mask_w = mask_proto.shape[-2:]
        anchor_points_s = anchor_points / stride_tensor
        pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri)

        gt_labels = paddle.stack(gt_meta['gt_class'])
        gt_bboxes = paddle.stack(gt_meta['gt_bbox'])
        pad_gt_mask = paddle.stack(gt_meta['pad_gt_mask'])
        gt_segms = paddle.stack(gt_meta['gt_segm']).cast('float32')
        if tuple(gt_segms.shape[-2:]) != (mask_h, mask_w):  # downsample
            gt_segms = F.interpolate(
                gt_segms, (mask_h, mask_w),
                mode='nearest').reshape([bs, -1, mask_h * mask_w])

        # label assignment
        assigned_labels, assigned_bboxes, assigned_scores, assigned_gt_index = \
            self.assigner(
                pred_scores.detach(),
                pred_bboxes.detach() * stride_tensor,
                anchor_points,
                num_anchors_list,
                gt_labels,
                gt_bboxes,
                pad_gt_mask,
                bg_index=self.num_classes,
                gt_segms=gt_segms)
        # rescale bbox
        assigned_bboxes /= stride_tensor

        # assign segms for masks
        assigned_masks = paddle.gather(
            gt_segms.reshape([-1, mask_h * mask_w]),
            assigned_gt_index.flatten(),
            axis=0)
        assigned_masks = assigned_masks.reshape(
            [bs, assigned_gt_index.shape[1], mask_h * mask_w])

        assign_out_dict = self.get_loss_from_assign(
            pred_scores, pred_distri, pred_bboxes, anchor_points_s,
            assigned_labels, assigned_bboxes, assigned_scores, assigned_masks,
            pred_mask_coeffs, mask_proto, stride_tensor, imgsz)
        loss = assign_out_dict
        return loss

    def get_loss_from_assign(self, pred_scores, pred_distri, pred_bboxes,
                             anchor_points_s, assigned_labels,
                             assigned_bboxes, assigned_scores, assigned_masks,
                             pred_mask_coeffs, mask_proto, stride_tensor,
                             imgsz):
        # cls loss
        if self.use_varifocal_loss:
            one_hot_label = F.one_hot(assigned_labels,
                                      self.num_classes + 1)[..., :-1]
            loss_cls = self._varifocal_loss(pred_scores, assigned_scores,
                                            one_hot_label)
        else:
            # `_focal_loss` takes `alpha`, not `alpha_l`; alpha=-1 disables
            # the alpha re-weighting branch.
            loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha=-1)

        assigned_scores_sum = assigned_scores.sum()
        if paddle.distributed.get_world_size() > 1:
            paddle.distributed.all_reduce(assigned_scores_sum)
            assigned_scores_sum /= paddle.distributed.get_world_size()
        if "npu" in paddle.device.get_all_custom_device_type():
            # npu clip kernel causes nan grad, replace it with maximum & minimum.
            assigned_scores_sum = paddle.maximum(
                assigned_scores_sum,
                paddle.to_tensor(
                    1., dtype=assigned_scores_sum.dtype))
        else:
            assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.)
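        # `assigned_scores_sum` acts as the number of effective positives:
        # all-reducing and averaging it across cards keeps the loss scale
        # consistent with single-card training, and the clip/maximum above
        # guards the divisions below when a batch has no positive anchors.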
loss_cls /= assigned_scores_sum # select positive samples mask mask_positive = (assigned_labels != self.num_classes) num_pos = mask_positive.sum() # pos/neg loss if num_pos > 0: # l1 + iou bbox_mask = mask_positive.astype('int32').unsqueeze(-1).tile( [1, 1, 4]).astype('bool') pred_bboxes_pos = paddle.masked_select(pred_bboxes, bbox_mask).reshape([-1, 4]) assigned_bboxes_pos = paddle.masked_select( assigned_bboxes, bbox_mask).reshape([-1, 4]) bbox_weight = paddle.masked_select(assigned_scores.sum(-1), mask_positive).unsqueeze(-1) loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos) loss_iou = self.iou_loss(pred_bboxes_pos, assigned_bboxes_pos) * bbox_weight loss_iou = loss_iou.sum() / assigned_scores_sum # dfl loss dist_mask = mask_positive.unsqueeze(-1).astype('int32').tile( [1, 1, self.reg_channels * 4]).astype('bool') pred_dist_pos = paddle.masked_select(pred_distri, dist_mask).reshape([ -1, 4, self.reg_channels ]) # pred_dist in funs assigned_ltrb = self._bbox2distance( anchor_points_s, assigned_bboxes) # anchor_points in func assigned_ltrb_pos = paddle.masked_select( assigned_ltrb, bbox_mask).reshape([-1, 4]) loss_dfl = self._df_loss(pred_dist_pos, assigned_ltrb_pos, self.reg_range[0]) * bbox_weight loss_dfl = loss_dfl.sum() / assigned_scores_sum # mask loss loss_mask = self.calculate_segmentation_loss( mask_positive, assigned_masks, assigned_bboxes * stride_tensor, mask_proto, pred_mask_coeffs, imgsz) # [bs, 8400] [bs, 8400, 160 * 160] [bs, 8400, 4] [bs, 32, 160, 160] [bs, 8400, 32] loss_mask /= assigned_scores_sum else: loss_l1 = paddle.zeros([1]) loss_iou = paddle.zeros([1]) loss_mask = paddle.zeros([1]) loss_dfl = paddle.zeros([1]) loss = self.loss_weight['class'] * loss_cls + \ self.loss_weight['iou'] * loss_iou + \ self.loss_weight['dfl'] * loss_dfl + \ self.loss_weight['iou'] * loss_mask out_dict = { 'loss': loss, 'loss_cls': loss_cls, 'loss_iou': loss_iou, 'loss_dfl': loss_dfl, 'loss_mask': loss_mask, 'loss_l1': loss_l1, } return out_dict def calculate_segmentation_loss(self, fg_mask, masks, target_bboxes, proto, pred_masks, imgsz, overlap=True): """ Calculate the loss for instance segmentation. Args: fg_mask (paddle.Tensor): A binary tensor of shape (BS, N_anchors) indicating which anchors are positive. masks (paddle.Tensor): Ground truth masks of shape (BS, H, W) if `overlap` is False, otherwise (BS, ?, H, W). target_gt_idx (paddle.Tensor): Indexes of ground truth objects for each anchor of shape (BS, N_anchors). target_bboxes (paddle.Tensor): Ground truth bounding boxes for each anchor of shape (BS, N_anchors, 4). batch_idx (paddle.Tensor): Batch indices of shape (N_labels_in_batch, 1). proto (paddle.Tensor): Prototype masks of shape (BS, 32, H, W). pred_masks (paddle.Tensor): Predicted masks for each anchor of shape (BS, N_anchors, 32). imgsz (paddle.Tensor): Size of the input image as a tensor of shape (2), i.e., (H, W). overlap (bool): Whether the masks in `masks` tensor overlap. Returns: (paddle.Tensor): The calculated loss for instance segmentation. Notes: The batch loss can be computed for improved speed at higher memory usage. 
For example, pred_mask can be computed as follows: pred_mask = paddle.einsum('in,nhw->ihw', pred, proto) # (i, 32) @ (32, 160, 160) -> (i, 160, 160) """ _, _, mask_h, mask_w = proto.shape loss = paddle.to_tensor([0.]) # Normalize to 0-1 target_bboxes_normalized = target_bboxes / imgsz[[1, 0, 1, 0]].cast( target_bboxes.dtype) # [8, 8400, 4] # Areas of target bboxes marea = xyxy2xywh(target_bboxes_normalized)[..., 2:].prod(2).unsqueeze(-1) # Normalize to mask size mxyxy = target_bboxes_normalized * paddle.to_tensor( [mask_w, mask_h, mask_w, mask_h], dtype=target_bboxes_normalized.dtype) for i, single_i in enumerate( zip(fg_mask, pred_masks, proto, mxyxy, marea, masks)): fg_mask_i, pred_masks_i, proto_i, mxyxy_i, marea_i, masks_i = single_i # [8400] [8400, 32] [32, 160, 160] [8400, 4] [8400, 1] [8400, 25600] if fg_mask_i.any(): loss += self.single_mask_loss(masks_i[fg_mask_i], pred_masks_i[fg_mask_i], proto_i, mxyxy_i[fg_mask_i], marea_i[fg_mask_i]) # [10, 25600] [10, 32] [32, 160, 160] [10, 4] [10, 1] else: loss += (proto * 0).sum() + ( pred_masks * 0).sum() # inf sums may lead to nan loss return loss @staticmethod def single_mask_loss(gt_mask, pred, proto, xyxy, area): """ Compute the instance segmentation loss for a single image. Args: gt_mask (paddle.Tensor): Ground truth mask of shape (n, H, W), where n is the number of objects. pred (paddle.Tensor): Predicted mask coefficients of shape (n, 32). proto (paddle.Tensor): Prototype masks of shape (32, H, W). xyxy (paddle.Tensor): Ground truth bounding boxes in xyxy format, normalized to [0, 1], of shape (n, 4). area (paddle.Tensor): Area of each ground truth bounding box of shape (n,). Returns: (paddle.Tensor): The calculated mask loss for a single image. Notes: The function uses the equation pred_mask = paddle.einsum('in,nhw->ihw', pred, proto) to produce the predicted masks from the prototype masks and predicted mask coefficients. """ nt = pred.shape[0] gt_mask = gt_mask.reshape([nt, *proto.shape[1:]]) nmasks = 32 pred_mask = (pred @ proto.reshape([nmasks, -1])).reshape( [-1, *proto.shape[1:]]) # (n,32) @ (32,80,80) -> (n,80,80) if "npu" in paddle.device.get_all_custom_device_type(): # bce npu kernel causes nan grad, replace it with numeric stable custom implementation. 
loss = custom_binary_cross_entropy_with_logits(pred_mask, gt_mask) else: loss = F.binary_cross_entropy_with_logits(pred_mask, gt_mask, reduction='none') return (crop_mask(loss, xyxy).mean(axis=(1, 2)) / area.squeeze(-1)).sum() def post_process(self, head_outs, im_shape, scale_factor, infer_shape=[640, 640], rescale=True): pred_scores, pred_dist, pred_mask_coeffs, mask_feat, anchor_points, stride_tensor = head_outs pred_bboxes = batch_distance2bbox(anchor_points, pred_dist) pred_bboxes *= stride_tensor if self.exclude_post_process: return paddle.concat([ pred_bboxes, pred_scores.transpose([0, 2, 1]), pred_mask_coeffs.transpose([0, 2, 1]) ], axis=-1), mask_feat, None # [1, 8400, 4+80+32], [1, 32, 160, 160] bbox_pred, bbox_num, keep_idxs = self.nms(pred_bboxes, pred_scores) if bbox_num.sum() > 0: pred_mask_coeffs = pred_mask_coeffs.transpose([0, 2, 1]) mask_coeffs = paddle.gather( pred_mask_coeffs.reshape([-1, self.num_masks]), keep_idxs) mask_logits = process_mask_upsample(mask_feat[0], mask_coeffs, bbox_pred[:, 2:6], infer_shape) if rescale: ori_h, ori_w = im_shape[0] / scale_factor[0] mask_logits = F.interpolate( mask_logits.unsqueeze(0), size=[ int(paddle.round(mask_logits.shape[-2] / scale_factor[0][0])), int(paddle.round(mask_logits.shape[-1] / scale_factor[0][1])) ], mode='bilinear', align_corners=False) if "npu" in paddle.device.get_all_custom_device_type(): # due to npu numeric error, we need to take round of img size. mask_logits = mask_logits[ ..., :round(ori_h.item()), :round(ori_w.item())] else: mask_logits = mask_logits[..., :int(ori_h), :int(ori_w)] masks = mask_logits.squeeze(0) mask_pred = paddle.to_tensor(masks > self.mask_thr_binary).cast("float32") # scale bbox to origin scale_factor = scale_factor.flip(-1).tile([1, 2]) bbox_pred[:, 2:6] /= scale_factor else: ori_h, ori_w = im_shape[0] / scale_factor[0] bbox_num = paddle.to_tensor([1]).cast("int32") bbox_pred = paddle.zeros([bbox_num, 6]) mask_pred = paddle.zeros([bbox_num, int(ori_h), int(ori_w)]) return bbox_pred, bbox_num, mask_pred, keep_idxs ================================================ FILE: ppdet/modeling/heads/ppyoloe_r_head.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
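Before moving on, here is a minimal self-contained sketch of the prototype-mask composition that `process_mask_upsample` in ppyoloe_ins_head.py above builds on (illustrative only: the 160x160 prototype size, 32 coefficients, and 5 detections are assumed example shapes, not values taken from this repository):

import paddle
import paddle.nn.functional as F

protos = paddle.rand([32, 160, 160])  # [mask_dim, mask_h, mask_w] prototypes
coeffs = paddle.rand([5, 32])         # per-detection coefficients after NMS
# Each instance mask is a linear combination of the shared prototypes,
# squashed to probabilities, then upsampled to the network input size.
masks = F.sigmoid(coeffs @ protos.reshape([32, -1])).reshape([-1, 160, 160])
masks = F.interpolate(
    masks[None], [640, 640], mode='bilinear', align_corners=False)[0]
print(masks.shape)  # [5, 640, 640]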
import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..losses import ProbIoULoss from ..initializer import bias_init_with_prob, constant_, normal_, vector_ from ppdet.modeling.backbones.cspresnet import ConvBNLayer from ppdet.modeling.ops import get_static_shape, get_act_fn, anchor_generator from ppdet.modeling.layers import MultiClassNMS __all__ = ['PPYOLOERHead'] class ESEAttn(nn.Layer): def __init__(self, feat_channels, act='swish'): super(ESEAttn, self).__init__() self.fc = nn.Conv2D(feat_channels, feat_channels, 1) self.conv = ConvBNLayer(feat_channels, feat_channels, 1, act=act) self._init_weights() def _init_weights(self): normal_(self.fc.weight, std=0.01) def forward(self, feat, avg_feat): weight = F.sigmoid(self.fc(avg_feat)) return self.conv(feat * weight) @register class PPYOLOERHead(nn.Layer): __shared__ = ['num_classes', 'trt', 'export_onnx'] __inject__ = ['static_assigner', 'assigner', 'nms'] def __init__(self, in_channels=[1024, 512, 256], num_classes=15, act='swish', fpn_strides=(32, 16, 8), grid_cell_offset=0.5, angle_max=90, use_varifocal_loss=True, static_assigner_epoch=4, trt=False, export_onnx=False, static_assigner='ATSSAssigner', assigner='TaskAlignedAssigner', nms='MultiClassNMS', loss_weight={'class': 1.0, 'iou': 2.5, 'dfl': 0.05}): super(PPYOLOERHead, self).__init__() assert len(in_channels) > 0, "len(in_channels) should > 0" self.in_channels = in_channels self.num_classes = num_classes self.fpn_strides = fpn_strides self.grid_cell_offset = grid_cell_offset self.angle_max = angle_max self.loss_weight = loss_weight self.use_varifocal_loss = use_varifocal_loss self.half_pi = paddle.to_tensor( [1.5707963267948966], dtype=paddle.float32) self.half_pi_bin = self.half_pi / angle_max self.iou_loss = ProbIoULoss() self.static_assigner_epoch = static_assigner_epoch self.static_assigner = static_assigner self.assigner = assigner self.nms = nms # stem self.stem_cls = nn.LayerList() self.stem_reg = nn.LayerList() self.stem_angle = nn.LayerList() trt = False if export_onnx else trt self.export_onnx = export_onnx act = get_act_fn( act, trt=trt) if act is None or isinstance(act, (str, dict)) else act self.trt = trt for in_c in self.in_channels: self.stem_cls.append(ESEAttn(in_c, act=act)) self.stem_reg.append(ESEAttn(in_c, act=act)) self.stem_angle.append(ESEAttn(in_c, act=act)) # pred head self.pred_cls = nn.LayerList() self.pred_reg = nn.LayerList() self.pred_angle = nn.LayerList() for in_c in self.in_channels: self.pred_cls.append( nn.Conv2D( in_c, self.num_classes, 3, padding=1)) self.pred_reg.append(nn.Conv2D(in_c, 4, 3, padding=1)) self.pred_angle.append( nn.Conv2D( in_c, self.angle_max + 1, 3, padding=1)) self.angle_proj_conv = nn.Conv2D( self.angle_max + 1, 1, 1, bias_attr=False) self._init_weights() @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } def _init_weights(self): bias_cls = bias_init_with_prob(0.01) bias_angle = [10.] + [1.] 
* self.angle_max for cls_, reg_, angle_ in zip(self.pred_cls, self.pred_reg, self.pred_angle): normal_(cls_.weight, std=0.01) constant_(cls_.bias, bias_cls) normal_(reg_.weight, std=0.01) constant_(reg_.bias) constant_(angle_.weight) vector_(angle_.bias, bias_angle) angle_proj = paddle.linspace(0, self.angle_max, self.angle_max + 1) self.angle_proj = angle_proj * self.half_pi_bin self.angle_proj_conv.weight.set_value( self.angle_proj.reshape([1, self.angle_max + 1, 1, 1])) self.angle_proj_conv.weight.stop_gradient = True def _generate_anchors(self, feats): if self.trt: anchor_points = [] for feat, stride in zip(feats, self.fpn_strides): _, _, h, w = feat.shape anchor, _ = anchor_generator( feat, stride * 4, 1.0, [1.0, 1.0, 1.0, 1.0], [stride, stride], offset=0.5) x1, y1, x2, y2 = paddle.split(anchor, 4, axis=-1) xc = (x1 + x2 + 1) / 2 yc = (y1 + y2 + 1) / 2 anchor_point = paddle.concat( [xc, yc], axis=-1).reshape((1, h * w, 2)) anchor_points.append(anchor_point) anchor_points = paddle.concat(anchor_points, axis=1) return anchor_points, None, None else: anchor_points = [] stride_tensor = [] num_anchors_list = [] for feat, stride in zip(feats, self.fpn_strides): _, _, h, w = feat.shape shift_x = (paddle.arange(end=w) + 0.5) * stride shift_y = (paddle.arange(end=h) + 0.5) * stride shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) anchor_point = paddle.cast( paddle.stack( [shift_x, shift_y], axis=-1), dtype='float32') anchor_points.append(anchor_point.reshape([1, -1, 2])) stride_tensor.append( paddle.full( [1, h * w, 1], stride, dtype='float32')) num_anchors_list.append(h * w) anchor_points = paddle.concat(anchor_points, axis=1) stride_tensor = paddle.concat(stride_tensor, axis=1) return anchor_points, stride_tensor, num_anchors_list def forward(self, feats, targets=None): assert len(feats) == len(self.fpn_strides), \ "The size of feats is not equal to size of fpn_strides" if self.training: return self.forward_train(feats, targets) else: return self.forward_eval(feats) def forward_train(self, feats, targets): anchor_points, stride_tensor, num_anchors_list = self._generate_anchors( feats) cls_score_list, reg_dist_list, reg_angle_list = [], [], [] for i, feat in enumerate(feats): avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat) reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) reg_angle = self.pred_angle[i](self.stem_angle[i](feat, avg_feat)) # cls and reg cls_score = F.sigmoid(cls_logit) cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) reg_dist_list.append(reg_dist.flatten(2).transpose([0, 2, 1])) reg_angle_list.append(reg_angle.flatten(2).transpose([0, 2, 1])) cls_score_list = paddle.concat(cls_score_list, axis=1) reg_dist_list = paddle.concat(reg_dist_list, axis=1) reg_angle_list = paddle.concat(reg_angle_list, axis=1) return self.get_loss([ cls_score_list, reg_dist_list, reg_angle_list, anchor_points, num_anchors_list, stride_tensor ], targets) def forward_eval(self, feats): cls_score_list, reg_box_list = [], [] anchor_points, _, _ = self._generate_anchors(feats) for i, (feat, stride) in enumerate(zip(feats, self.fpn_strides)): b, _, h, w = feat.shape l = h * w # cls avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + feat) # reg reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) reg_xy, reg_wh = paddle.split(reg_dist, 2, axis=1) reg_xy = reg_xy * stride reg_wh = (F.elu(reg_wh) + 1.) 
* stride reg_angle = self.pred_angle[i](self.stem_angle[i](feat, avg_feat)) reg_angle = self.angle_proj_conv(F.softmax(reg_angle, axis=1)) reg_box = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1) # cls and reg cls_score = F.sigmoid(cls_logit) cls_score_list.append(cls_score.reshape([b, self.num_classes, l])) reg_box_list.append(reg_box.reshape([b, 5, l])) cls_score_list = paddle.concat(cls_score_list, axis=-1) reg_box_list = paddle.concat(reg_box_list, axis=-1).transpose([0, 2, 1]) reg_xy, reg_wha = paddle.split(reg_box_list, [2, 3], axis=-1) reg_xy = reg_xy + anchor_points reg_box_list = paddle.concat([reg_xy, reg_wha], axis=-1) return cls_score_list, reg_box_list def _bbox_decode(self, points, pred_dist, pred_angle, stride_tensor): # predict vector to x, y, w, h, angle b, l = pred_angle.shape[:2] xy, wh = paddle.split(pred_dist, 2, axis=-1) xy = xy * stride_tensor + points wh = (F.elu(wh) + 1.) * stride_tensor angle = F.softmax(pred_angle.reshape([b, l, 1, self.angle_max + 1 ])).matmul(self.angle_proj) return paddle.concat([xy, wh, angle], axis=-1) def get_loss(self, head_outs, gt_meta): pred_scores, pred_dist, pred_angle, \ anchor_points, num_anchors_list, stride_tensor = head_outs # [B, N, 5] -> [B, N, 5] pred_bboxes = self._bbox_decode(anchor_points, pred_dist, pred_angle, stride_tensor) gt_labels = gt_meta['gt_class'] # [B, N, 5] gt_bboxes = gt_meta['gt_rbox'] pad_gt_mask = gt_meta['pad_gt_mask'] # label assignment if gt_meta['epoch_id'] < self.static_assigner_epoch: assigned_labels, assigned_bboxes, assigned_scores = \ self.static_assigner( anchor_points, stride_tensor, num_anchors_list, gt_labels, gt_meta['gt_bbox'], gt_bboxes, pad_gt_mask, self.num_classes, pred_bboxes.detach() ) else: assigned_labels, assigned_bboxes, assigned_scores = \ self.assigner( pred_scores.detach(), pred_bboxes.detach(), anchor_points, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes) alpha_l = -1 # cls loss if self.use_varifocal_loss: one_hot_label = F.one_hot(assigned_labels, self.num_classes + 1)[..., :-1] loss_cls = self._varifocal_loss(pred_scores, assigned_scores, one_hot_label) else: loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l) assigned_scores_sum = assigned_scores.sum() if paddle.distributed.get_world_size() > 1: paddle.distributed.all_reduce(assigned_scores_sum) assigned_scores_sum = paddle.clip( assigned_scores_sum / paddle.distributed.get_world_size(), min=1.) else: assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.) 
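        # With varifocal loss the soft, IoU-aware `assigned_scores` are the
        # classification target; `alpha_l = -1` above disables the alpha
        # re-weighting branch inside `_focal_loss` when it is used instead.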
loss_cls /= assigned_scores_sum loss_iou, loss_dfl = self._bbox_loss(pred_angle, pred_bboxes, anchor_points, assigned_labels, assigned_bboxes, assigned_scores, assigned_scores_sum, stride_tensor) loss = self.loss_weight['class'] * loss_cls + \ self.loss_weight['iou'] * loss_iou + \ self.loss_weight['dfl'] * loss_dfl out_dict = { 'loss': loss, 'loss_cls': loss_cls, 'loss_iou': loss_iou, 'loss_dfl': loss_dfl } return out_dict @staticmethod def _focal_loss(score, label, alpha=0.25, gamma=2.0): weight = (score - label).pow(gamma) if alpha > 0: alpha_t = alpha * label + (1 - alpha) * (1 - label) weight *= alpha_t loss = F.binary_cross_entropy( score, label, weight=weight, reduction='sum') return loss @staticmethod def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0): weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label loss = F.binary_cross_entropy( pred_score, gt_score, weight=weight, reduction='sum') return loss @staticmethod def _df_loss(pred_dist, target): target_left = paddle.cast(target, 'int64') target_right = target_left + 1 weight_left = target_right.astype('float32') - target weight_right = 1 - weight_left loss_left = F.cross_entropy( pred_dist, target_left, reduction='none') * weight_left loss_right = F.cross_entropy( pred_dist, target_right, reduction='none') * weight_right return (loss_left + loss_right).mean(-1, keepdim=True) def _bbox_loss(self, pred_angle, pred_bboxes, anchor_points, assigned_labels, assigned_bboxes, assigned_scores, assigned_scores_sum, stride_tensor): # select positive samples mask mask_positive = (assigned_labels != self.num_classes) num_pos = mask_positive.sum() # pos/neg loss if num_pos > 0: # iou bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 5]) pred_bboxes_pos = paddle.masked_select(pred_bboxes, bbox_mask).reshape([-1, 5]) assigned_bboxes_pos = paddle.masked_select( assigned_bboxes, bbox_mask).reshape([-1, 5]) bbox_weight = paddle.masked_select( assigned_scores.sum(-1), mask_positive).reshape([-1]) loss_iou = self.iou_loss(pred_bboxes_pos, assigned_bboxes_pos) * bbox_weight loss_iou = loss_iou.sum() / assigned_scores_sum # dfl angle_mask = mask_positive.unsqueeze(-1).tile( [1, 1, self.angle_max + 1]) pred_angle_pos = paddle.masked_select( pred_angle, angle_mask).reshape([-1, self.angle_max + 1]) assigned_angle_pos = ( assigned_bboxes_pos[:, 4] / self.half_pi_bin).clip(0, self.angle_max - 0.01) loss_dfl = self._df_loss(pred_angle_pos, assigned_angle_pos) else: loss_iou = pred_bboxes.sum() * 0. 
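            # Summing pred_bboxes and multiplying by zero (rather than using a
            # bare constant) keeps the predictions in the autograd graph, so a
            # well-defined zero gradient still flows when there are no positives.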
loss_dfl = paddle.zeros([1]) return loss_iou, loss_dfl def _box2corners(self, pred_bboxes): """ convert (x, y, w, h, angle) to (x1, y1, x2, y2, x3, y3, x4, y4) Args: pred_bboxes (Tensor): [B, N, 5] Returns: polys (Tensor): [B, N, 8] """ x, y, w, h, angle = paddle.split(pred_bboxes, 5, axis=-1) cos_a_half = paddle.cos(angle) * 0.5 sin_a_half = paddle.sin(angle) * 0.5 w_x = cos_a_half * w w_y = sin_a_half * w h_x = -sin_a_half * h h_y = cos_a_half * h return paddle.concat( [ x + w_x + h_x, y + w_y + h_y, x - w_x + h_x, y - w_y + h_y, x - w_x - h_x, y - w_y - h_y, x + w_x - h_x, y + w_y - h_y ], axis=-1) def post_process(self, head_outs, scale_factor): pred_scores, pred_bboxes = head_outs # [B, N, 5] -> [B, N, 8] pred_bboxes = self._box2corners(pred_bboxes) # scale bbox to origin scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) scale_factor = paddle.concat( [ scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x, scale_y ], axis=-1).reshape([-1, 1, 8]) pred_bboxes /= scale_factor if self.export_onnx: return pred_bboxes, pred_scores, None bbox_pred, bbox_num, nms_keep_idx = self.nms(pred_bboxes, pred_scores) return bbox_pred, bbox_num, nms_keep_idx ================================================ FILE: ppdet/modeling/heads/retina_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Normal, Constant from ppdet.modeling.bbox_utils import bbox2delta, delta2bbox from ppdet.modeling.heads.fcos_head import FCOSFeat from ppdet.core.workspace import register __all__ = ['RetinaHead'] @register class RetinaFeat(FCOSFeat): """We use FCOSFeat to construct conv layers in RetinaNet. We rename FCOSFeat to RetinaFeat to avoid confusion. 
""" pass @register class RetinaHead(nn.Layer): """Used in RetinaNet proposed in paper https://arxiv.org/pdf/1708.02002.pdf """ __shared__ = ['num_classes'] __inject__ = [ 'conv_feat', 'anchor_generator', 'bbox_assigner', 'loss_class', 'loss_bbox', 'nms' ] def __init__(self, num_classes=80, conv_feat='RetinaFeat', anchor_generator='RetinaAnchorGenerator', bbox_assigner='MaxIoUAssigner', loss_class='FocalLoss', loss_bbox='SmoothL1Loss', nms='MultiClassNMS', prior_prob=0.01, nms_pre=1000, weights=[1., 1., 1., 1.]): super(RetinaHead, self).__init__() self.num_classes = num_classes self.conv_feat = conv_feat self.anchor_generator = anchor_generator self.bbox_assigner = bbox_assigner self.loss_class = loss_class self.loss_bbox = loss_bbox self.nms = nms self.nms_pre = nms_pre self.weights = weights bias_init_value = -math.log((1 - prior_prob) / prior_prob) num_anchors = self.anchor_generator.num_anchors self.retina_cls = nn.Conv2D( in_channels=self.conv_feat.feat_out, out_channels=self.num_classes * num_anchors, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=bias_init_value))) self.retina_reg = nn.Conv2D( in_channels=self.conv_feat.feat_out, out_channels=4 * num_anchors, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0))) def forward(self, neck_feats, targets=None): cls_logits_list = [] bboxes_reg_list = [] for neck_feat in neck_feats: conv_cls_feat, conv_reg_feat = self.conv_feat(neck_feat) cls_logits = self.retina_cls(conv_cls_feat) bbox_reg = self.retina_reg(conv_reg_feat) cls_logits_list.append(cls_logits) bboxes_reg_list.append(bbox_reg) if self.training: return self.get_loss([cls_logits_list, bboxes_reg_list], targets) else: return [cls_logits_list, bboxes_reg_list] def get_loss(self, head_outputs, targets): """Here we calculate loss for a batch of images. We assign anchors to gts in each image and gather all the assigned postive and negative samples. Then loss is calculated on the gathered samples. 
""" cls_logits_list, bboxes_reg_list = head_outputs anchors = self.anchor_generator(cls_logits_list) anchors = paddle.concat(anchors) # matches: contain gt_inds # match_labels: -1(ignore), 0(neg) or 1(pos) matches_list, match_labels_list = [], [] # assign anchors to gts, no sampling is involved for gt_bbox in targets['gt_bbox']: matches, match_labels = self.bbox_assigner(anchors, gt_bbox) matches_list.append(matches) match_labels_list.append(match_labels) # reshape network outputs cls_logits = [ _.transpose([0, 2, 3, 1]).reshape([0, -1, self.num_classes]) for _ in cls_logits_list ] bboxes_reg = [ _.transpose([0, 2, 3, 1]).reshape([0, -1, 4]) for _ in bboxes_reg_list ] cls_logits = paddle.concat(cls_logits, axis=1) bboxes_reg = paddle.concat(bboxes_reg, axis=1) cls_pred_list, cls_tar_list = [], [] reg_pred_list, reg_tar_list = [], [] # find and gather preds and targets in each image for matches, match_labels, cls_logit, bbox_reg, gt_bbox, gt_class in \ zip(matches_list, match_labels_list, cls_logits, bboxes_reg, targets['gt_bbox'], targets['gt_class']): pos_mask = (match_labels == 1) neg_mask = (match_labels == 0) chosen_mask = paddle.logical_or(pos_mask, neg_mask) gt_class = gt_class.reshape([-1]) bg_class = paddle.to_tensor( [self.num_classes], dtype=gt_class.dtype) # a trick to assign num_classes to negative targets gt_class = paddle.concat([gt_class, bg_class], axis=-1) matches = paddle.where(neg_mask, paddle.full_like(matches, gt_class.size - 1), matches) cls_pred = cls_logit[chosen_mask] cls_tar = gt_class[matches[chosen_mask]] reg_pred = bbox_reg[pos_mask].reshape([-1, 4]) reg_tar = gt_bbox[matches[pos_mask]].reshape([-1, 4]) reg_tar = bbox2delta(anchors[pos_mask], reg_tar, self.weights) cls_pred_list.append(cls_pred) cls_tar_list.append(cls_tar) reg_pred_list.append(reg_pred) reg_tar_list.append(reg_tar) cls_pred = paddle.concat(cls_pred_list) cls_tar = paddle.concat(cls_tar_list) reg_pred = paddle.concat(reg_pred_list) reg_tar = paddle.concat(reg_tar_list) avg_factor = max(1.0, reg_pred.shape[0]) cls_loss = self.loss_class( cls_pred, cls_tar, reduction='sum') / avg_factor if reg_pred.shape[0] == 0: reg_loss = paddle.zeros([]) reg_loss.stop_gradient = False else: reg_loss = self.loss_bbox( reg_pred, reg_tar, reduction='sum') / avg_factor loss = cls_loss + reg_loss out_dict = { 'loss_cls': cls_loss, 'loss_reg': reg_loss, 'loss': loss, } return out_dict def get_bboxes_single(self, anchors, cls_scores_list, bbox_preds_list, im_shape, scale_factor, rescale=True): assert len(cls_scores_list) == len(bbox_preds_list) mlvl_bboxes = [] mlvl_scores = [] for anchor, cls_score, bbox_pred in zip(anchors, cls_scores_list, bbox_preds_list): cls_score = cls_score.reshape([-1, self.num_classes]) bbox_pred = bbox_pred.reshape([-1, 4]) if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre: max_score = cls_score.max(axis=1) _, topk_inds = max_score.topk(self.nms_pre) bbox_pred = bbox_pred.gather(topk_inds) anchor = anchor.gather(topk_inds) cls_score = cls_score.gather(topk_inds) bbox_pred = delta2bbox(bbox_pred, anchor, self.weights).squeeze() mlvl_bboxes.append(bbox_pred) mlvl_scores.append(F.sigmoid(cls_score)) mlvl_bboxes = paddle.concat(mlvl_bboxes) mlvl_bboxes = paddle.squeeze(mlvl_bboxes) if rescale: mlvl_bboxes = mlvl_bboxes / paddle.concat( [scale_factor[::-1], scale_factor[::-1]]) mlvl_scores = paddle.concat(mlvl_scores) mlvl_scores = mlvl_scores.transpose([1, 0]) return mlvl_bboxes, mlvl_scores def decode(self, anchors, cls_logits, bboxes_reg, im_shape, scale_factor): batch_bboxes 
= [] batch_scores = [] for img_id in range(cls_logits[0].shape[0]): num_lvls = len(cls_logits) cls_scores_list = [cls_logits[i][img_id] for i in range(num_lvls)] bbox_preds_list = [bboxes_reg[i][img_id] for i in range(num_lvls)] bboxes, scores = self.get_bboxes_single( anchors, cls_scores_list, bbox_preds_list, im_shape[img_id], scale_factor[img_id]) batch_bboxes.append(bboxes) batch_scores.append(scores) batch_bboxes = paddle.stack(batch_bboxes, axis=0) batch_scores = paddle.stack(batch_scores, axis=0) return batch_bboxes, batch_scores def post_process(self, head_outputs, im_shape, scale_factor): cls_logits_list, bboxes_reg_list = head_outputs anchors = self.anchor_generator(cls_logits_list) cls_logits = [_.transpose([0, 2, 3, 1]) for _ in cls_logits_list] bboxes_reg = [_.transpose([0, 2, 3, 1]) for _ in bboxes_reg_list] bboxes, scores = self.decode(anchors, cls_logits, bboxes_reg, im_shape, scale_factor) bbox_pred, bbox_num, nms_keep_idx = self.nms(bboxes, scores) return bbox_pred, bbox_num, nms_keep_idx def get_scores_single(self, cls_scores_list): mlvl_logits = [] for cls_score in cls_scores_list: cls_score = cls_score.reshape([-1, self.num_classes]) if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre: max_score = cls_score.max(axis=1) _, topk_inds = max_score.topk(self.nms_pre) cls_score = cls_score.gather(topk_inds) mlvl_logits.append(cls_score) mlvl_logits = paddle.concat(mlvl_logits) mlvl_logits = mlvl_logits.transpose([1, 0]) return mlvl_logits def decode_cls_logits(self, cls_logits_list): cls_logits = [_.transpose([0, 2, 3, 1]) for _ in cls_logits_list] batch_logits = [] for img_id in range(cls_logits[0].shape[0]): num_lvls = len(cls_logits) cls_scores_list = [cls_logits[i][img_id] for i in range(num_lvls)] logits = self.get_scores_single(cls_scores_list) batch_logits.append(logits) batch_logits = paddle.stack(batch_logits, axis=0) return batch_logits ================================================ FILE: ppdet/modeling/heads/roi_extractor.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle from ppdet.core.workspace import register from ppdet.modeling import ops import paddle.nn as nn def _to_list(v): if not isinstance(v, (list, tuple)): return [v] return v @register class RoIAlign(nn.Layer): """ RoI Align module For more details, please refer to the document of roi_align in in https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/vision/ops.py Args: resolution (int): The output size, default 14 spatial_scale (float): Multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. default 0.0625 sampling_ratio (int): The number of sampling points in the interpolation grid, default 0 canconical_level (int): The referring level of FPN layer with specified level. default 4 canonical_size (int): The referring scale of FPN layer with specified scale. 
default 224 start_level (int): The start level of FPN layer to extract RoI feature, default 0 end_level (int): The end level of FPN layer to extract RoI feature, default 3 aligned (bool): Whether to add offset to rois' coord in roi_align. default false """ def __init__(self, resolution=14, spatial_scale=0.0625, sampling_ratio=0, canconical_level=4, canonical_size=224, start_level=0, end_level=3, aligned=False): super(RoIAlign, self).__init__() self.resolution = resolution self.spatial_scale = _to_list(spatial_scale) self.sampling_ratio = sampling_ratio self.canconical_level = canconical_level self.canonical_size = canonical_size self.start_level = start_level self.end_level = end_level self.aligned = False # TODO: npu kernel do not support aligned=True @classmethod def from_config(cls, cfg, input_shape): return {'spatial_scale': [1. / i.stride for i in input_shape]} def forward(self, feats, roi, rois_num): roi = paddle.concat(roi) if len(roi) > 1 else roi[0] if len(feats) == 1: rois_feat = paddle.vision.ops.roi_align( x=feats[self.start_level], boxes=roi, boxes_num=rois_num, output_size=self.resolution, spatial_scale=self.spatial_scale[0], aligned=self.aligned) else: offset = 2 k_min = self.start_level + offset k_max = self.end_level + offset if hasattr(paddle.vision.ops, "distribute_fpn_proposals"): distribute_fpn_proposals = getattr(paddle.vision.ops, "distribute_fpn_proposals") else: distribute_fpn_proposals = ops.distribute_fpn_proposals rois_dist, restore_index, rois_num_dist = distribute_fpn_proposals( roi, k_min, k_max, self.canconical_level, self.canonical_size, rois_num=rois_num) rois_feat_list = [] for lvl in range(self.start_level, self.end_level + 1): roi_feat = paddle.vision.ops.roi_align( x=feats[lvl], boxes=rois_dist[lvl], boxes_num=rois_num_dist[lvl], output_size=self.resolution, spatial_scale=self.spatial_scale[lvl], sampling_ratio=self.sampling_ratio, aligned=self.aligned) rois_feat_list.append(roi_feat) rois_feat_shuffle = paddle.concat(rois_feat_list) rois_feat = paddle.gather(rois_feat_shuffle, restore_index) return rois_feat ================================================ FILE: ppdet/modeling/heads/s2anet_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
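For reference, the level-routing rule behind `distribute_fpn_proposals`, which the `RoIAlign` module in roi_extractor.py above relies on, can be sketched with the standard FPN assignment formula (a stdlib-only illustration; the helper name `fpn_level` is ours, not the library's):

import math

def fpn_level(w, h, k_min=2, k_max=5, refer_level=4, refer_scale=224):
    # Larger RoIs are pooled from coarser FPN levels:
    #   level = floor(refer_level + log2(sqrt(w * h) / refer_scale))
    level = math.floor(refer_level + math.log2(math.sqrt(w * h) / refer_scale))
    return min(max(level, k_min), k_max)

print(fpn_level(224, 224))  # -> 4: the canonical level at the canonical size
print(fpn_level(56, 56))    # -> 2: a small RoI routed to the finest level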
# # The code is based on https://github.com/csuhan/s2anet/blob/master/mmdet/models/anchor_heads_rotated/s2anet_head.py import paddle from paddle import ParamAttr import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal, Constant from ppdet.core.workspace import register from ppdet.modeling.proposal_generator.target_layer import RBoxAssigner from ppdet.modeling.proposal_generator.anchor_generator import S2ANetAnchorGenerator from ppdet.modeling.layers import AlignConv from ..cls_utils import _get_class_default_kwargs import numpy as np @register class S2ANetHead(nn.Layer): """ S2Anet head Args: stacked_convs (int): number of stacked_convs feat_in (int): input channels of feat feat_out (int): output channels of feat num_classes (int): num_classes anchor_strides (list): stride of anchors anchor_scales (list): scale of anchors anchor_ratios (list): ratios of anchors target_means (list): target_means target_stds (list): target_stds align_conv_type (str): align_conv_type ['Conv', 'AlignConv'] align_conv_size (int): kernel size of align_conv use_sigmoid_cls (bool): use sigmoid_cls or not reg_loss_weight (list): loss weight for regression """ __shared__ = ['num_classes'] __inject__ = ['anchor_assign', 'nms'] def __init__(self, stacked_convs=2, feat_in=256, feat_out=256, num_classes=15, anchor_strides=[8, 16, 32, 64, 128], anchor_scales=[4], anchor_ratios=[1.0], target_means=0.0, target_stds=1.0, align_conv_type='AlignConv', align_conv_size=3, use_sigmoid_cls=True, anchor_assign=_get_class_default_kwargs(RBoxAssigner), reg_loss_weight=[1.0, 1.0, 1.0, 1.0, 1.1], cls_loss_weight=[1.1, 1.05], reg_loss_type='l1', nms_pre=2000, nms='MultiClassNMS'): super(S2ANetHead, self).__init__() self.stacked_convs = stacked_convs self.feat_in = feat_in self.feat_out = feat_out self.anchor_list = None self.anchor_scales = anchor_scales self.anchor_ratios = anchor_ratios self.anchor_strides = anchor_strides self.anchor_strides = paddle.to_tensor(anchor_strides) self.anchor_base_sizes = list(anchor_strides) self.means = paddle.ones(shape=[5]) * target_means self.stds = paddle.ones(shape=[5]) * target_stds assert align_conv_type in ['AlignConv', 'Conv', 'DCN'] self.align_conv_type = align_conv_type self.align_conv_size = align_conv_size self.use_sigmoid_cls = use_sigmoid_cls self.cls_out_channels = num_classes if self.use_sigmoid_cls else num_classes + 1 self.sampling = False self.anchor_assign = anchor_assign self.reg_loss_weight = reg_loss_weight self.cls_loss_weight = cls_loss_weight self.alpha = 1.0 self.beta = 1.0 self.reg_loss_type = reg_loss_type self.nms_pre = nms_pre self.nms = nms self.fake_bbox = paddle.to_tensor( np.array( [[-1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], dtype='float32')) self.fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) # anchor self.anchor_generators = [] for anchor_base in self.anchor_base_sizes: self.anchor_generators.append( S2ANetAnchorGenerator(anchor_base, anchor_scales, anchor_ratios)) self.anchor_generators = nn.LayerList(self.anchor_generators) self.fam_cls_convs = nn.Sequential() self.fam_reg_convs = nn.Sequential() for i in range(self.stacked_convs): chan_in = self.feat_in if i == 0 else self.feat_out self.fam_cls_convs.add_sublayer( 'fam_cls_conv_{}'.format(i), nn.Conv2D( in_channels=chan_in, out_channels=self.feat_out, kernel_size=3, padding=1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(0)))) self.fam_cls_convs.add_sublayer('fam_cls_conv_{}_act'.format(i), 
nn.ReLU()) self.fam_reg_convs.add_sublayer( 'fam_reg_conv_{}'.format(i), nn.Conv2D( in_channels=chan_in, out_channels=self.feat_out, kernel_size=3, padding=1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(0)))) self.fam_reg_convs.add_sublayer('fam_reg_conv_{}_act'.format(i), nn.ReLU()) self.fam_reg = nn.Conv2D( self.feat_out, 5, 1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(0))) prior_prob = 0.01 bias_init = float(-np.log((1 - prior_prob) / prior_prob)) self.fam_cls = nn.Conv2D( self.feat_out, self.cls_out_channels, 1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(bias_init))) if self.align_conv_type == "AlignConv": self.align_conv = AlignConv(self.feat_out, self.feat_out, self.align_conv_size) elif self.align_conv_type == "Conv": self.align_conv = nn.Conv2D( self.feat_out, self.feat_out, self.align_conv_size, padding=(self.align_conv_size - 1) // 2, bias_attr=ParamAttr(initializer=Constant(0))) elif self.align_conv_type == "DCN": self.align_conv_offset = nn.Conv2D( self.feat_out, 2 * self.align_conv_size**2, 1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(0))) self.align_conv = paddle.vision.ops.DeformConv2D( self.feat_out, self.feat_out, self.align_conv_size, padding=(self.align_conv_size - 1) // 2, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=False) self.or_conv = nn.Conv2D( self.feat_out, self.feat_out, kernel_size=3, padding=1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(0))) # ODM self.odm_cls_convs = nn.Sequential() self.odm_reg_convs = nn.Sequential() for i in range(self.stacked_convs): ch_in = self.feat_out # ch_in = int(self.feat_out / 8) if i == 0 else self.feat_out self.odm_cls_convs.add_sublayer( 'odm_cls_conv_{}'.format(i), nn.Conv2D( in_channels=ch_in, out_channels=self.feat_out, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(0)))) self.odm_cls_convs.add_sublayer('odm_cls_conv_{}_act'.format(i), nn.ReLU()) self.odm_reg_convs.add_sublayer( 'odm_reg_conv_{}'.format(i), nn.Conv2D( in_channels=self.feat_out, out_channels=self.feat_out, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(0)))) self.odm_reg_convs.add_sublayer('odm_reg_conv_{}_act'.format(i), nn.ReLU()) self.odm_cls = nn.Conv2D( self.feat_out, self.cls_out_channels, 3, padding=1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(bias_init))) self.odm_reg = nn.Conv2D( self.feat_out, 5, 3, padding=1, weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), bias_attr=ParamAttr(initializer=Constant(0))) def forward(self, feats, targets=None): fam_reg_list, fam_cls_list = [], [] odm_reg_list, odm_cls_list = [], [] num_anchors_list, base_anchors_list, refine_anchors_list = [], [], [] for i, feat in enumerate(feats): # get shape B = feat.shape[0] H, W = feat.shape[2], feat.shape[3] NA = H * W num_anchors_list.append(NA) fam_cls_feat = self.fam_cls_convs(feat) fam_cls = self.fam_cls(fam_cls_feat) # [N, CLS, H, W] --> [N, H, W, CLS] fam_cls = fam_cls.transpose([0, 2, 3, 1]).reshape( [B, NA, self.cls_out_channels]) fam_cls_list.append(fam_cls) fam_reg_feat = self.fam_reg_convs(feat) fam_reg = self.fam_reg(fam_reg_feat) # [N, 5, H, W] --> [N, H, W, 5] fam_reg 
= fam_reg.transpose([0, 2, 3, 1]).reshape([B, NA, 5]) fam_reg_list.append(fam_reg) # prepare anchor init_anchors = self.anchor_generators[i]((H, W), self.anchor_strides[i]) init_anchors = init_anchors.reshape([1, NA, 5]) base_anchors_list.append(init_anchors.squeeze(0)) if self.training: refine_anchor = self.bbox_decode(fam_reg.detach(), init_anchors) else: refine_anchor = self.bbox_decode(fam_reg, init_anchors) refine_anchors_list.append(refine_anchor) if self.align_conv_type == 'AlignConv': align_feat = self.align_conv(feat, refine_anchor.clone(), (H, W), self.anchor_strides[i]) elif self.align_conv_type == 'DCN': align_offset = self.align_conv_offset(feat) align_feat = self.align_conv(feat, align_offset) elif self.align_conv_type == 'Conv': align_feat = self.align_conv(feat) or_feat = self.or_conv(align_feat) odm_reg_feat = or_feat odm_cls_feat = or_feat odm_reg_feat = self.odm_reg_convs(odm_reg_feat) odm_cls_feat = self.odm_cls_convs(odm_cls_feat) odm_cls = self.odm_cls(odm_cls_feat) # [N, CLS, H, W] --> [N, H, W, CLS] odm_cls = odm_cls.transpose([0, 2, 3, 1]).reshape( [B, NA, self.cls_out_channels]) odm_cls_list.append(odm_cls) odm_reg = self.odm_reg(odm_reg_feat) # [N, 5, H, W] --> [N, H, W, 5] odm_reg = odm_reg.transpose([0, 2, 3, 1]).reshape([B, NA, 5]) odm_reg_list.append(odm_reg) if self.training: return self.get_loss([ fam_cls_list, fam_reg_list, odm_cls_list, odm_reg_list, num_anchors_list, base_anchors_list, refine_anchors_list ], targets) else: odm_bboxes_list = [] for odm_reg, refine_anchor in zip(odm_reg_list, refine_anchors_list): odm_bboxes = self.bbox_decode(odm_reg, refine_anchor) odm_bboxes_list.append(odm_bboxes) return [odm_bboxes_list, odm_cls_list] def get_bboxes(self, head_outs): perd_bboxes_list, pred_scores_list = head_outs batch = pred_scores_list[0].shape[0] bboxes, bbox_num = [], [] for i in range(batch): pred_scores_per_image = [t[i] for t in pred_scores_list] pred_bboxes_per_image = [t[i] for t in perd_bboxes_list] bbox_per_image, bbox_num_per_image = self.get_bboxes_single( pred_scores_per_image, pred_bboxes_per_image) bboxes.append(bbox_per_image) bbox_num.append(bbox_num_per_image) bboxes = paddle.concat(bboxes) bbox_num = paddle.concat(bbox_num) return bboxes, bbox_num def get_pred(self, bboxes, bbox_num, im_shape, scale_factor): """ Rescale, clip and filter the bbox from the output of NMS to get final prediction. Args: bboxes(Tensor): bboxes [N, 10] bbox_num(Tensor): bbox_num im_shape(Tensor): [1 2] scale_factor(Tensor): [1 2] Returns: bbox_pred(Tensor): The output is the prediction with shape [N, 8] including labels, scores and bboxes. The size of bboxes are corresponding to the original image. 
""" origin_shape = paddle.floor(im_shape / scale_factor + 0.5) origin_shape_list = [] scale_factor_list = [] # scale_factor: scale_y, scale_x for i in range(bbox_num.shape[0]): expand_shape = paddle.expand(origin_shape[i:i + 1, :], [bbox_num[i], 2]) scale_y, scale_x = scale_factor[i, 0:1], scale_factor[i, 1:2] scale = paddle.concat([ scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x, scale_y ]) expand_scale = paddle.expand(scale, [bbox_num[i], 8]) origin_shape_list.append(expand_shape) scale_factor_list.append(expand_scale) origin_shape_list = paddle.concat(origin_shape_list) scale_factor_list = paddle.concat(scale_factor_list) # bboxes: [N, 10], label, score, bbox pred_label_score = bboxes[:, 0:2] pred_bbox = bboxes[:, 2:] # rescale bbox to original image pred_bbox = pred_bbox.reshape([-1, 8]) scaled_bbox = pred_bbox / scale_factor_list origin_h = origin_shape_list[:, 0] origin_w = origin_shape_list[:, 1] bboxes = scaled_bbox zeros = paddle.zeros_like(origin_h) x1 = paddle.maximum(paddle.minimum(bboxes[:, 0], origin_w - 1), zeros) y1 = paddle.maximum(paddle.minimum(bboxes[:, 1], origin_h - 1), zeros) x2 = paddle.maximum(paddle.minimum(bboxes[:, 2], origin_w - 1), zeros) y2 = paddle.maximum(paddle.minimum(bboxes[:, 3], origin_h - 1), zeros) x3 = paddle.maximum(paddle.minimum(bboxes[:, 4], origin_w - 1), zeros) y3 = paddle.maximum(paddle.minimum(bboxes[:, 5], origin_h - 1), zeros) x4 = paddle.maximum(paddle.minimum(bboxes[:, 6], origin_w - 1), zeros) y4 = paddle.maximum(paddle.minimum(bboxes[:, 7], origin_h - 1), zeros) pred_bbox = paddle.stack([x1, y1, x2, y2, x3, y3, x4, y4], axis=-1) pred_result = paddle.concat([pred_label_score, pred_bbox], axis=1) return pred_result def get_bboxes_single(self, cls_score_list, bbox_pred_list): mlvl_bboxes = [] mlvl_scores = [] for cls_score, bbox_pred in zip(cls_score_list, bbox_pred_list): if self.use_sigmoid_cls: scores = F.sigmoid(cls_score) else: scores = F.softmax(cls_score, axis=-1) if scores.shape[0] > self.nms_pre: # Get maximum scores for foreground classes. 
if self.use_sigmoid_cls: max_scores = paddle.max(scores, axis=1) else: max_scores = paddle.max(scores[:, :-1], axis=1) topk_val, topk_inds = paddle.topk(max_scores, self.nms_pre) bbox_pred = paddle.gather(bbox_pred, topk_inds) scores = paddle.gather(scores, topk_inds) mlvl_bboxes.append(bbox_pred) mlvl_scores.append(scores) mlvl_bboxes = paddle.concat(mlvl_bboxes) mlvl_scores = paddle.concat(mlvl_scores) mlvl_polys = self.rbox2poly(mlvl_bboxes).unsqueeze(0) mlvl_scores = paddle.transpose(mlvl_scores, [1, 0]).unsqueeze(0) bbox, bbox_num, _ = self.nms(mlvl_polys, mlvl_scores) if bbox.shape[0] <= 0: bbox = self.fake_bbox bbox_num = self.fake_bbox_num return bbox, bbox_num def smooth_l1_loss(self, pred, label, delta=1.0 / 9.0): """ Args: pred: pred score label: label delta: delta Returns: loss """ assert pred.shape == label.shape and label.numel() > 0 assert delta > 0 diff = paddle.abs(pred - label) loss = paddle.where(diff < delta, 0.5 * diff * diff / delta, diff - 0.5 * delta) return loss def get_fam_loss(self, fam_target, s2anet_head_out, reg_loss_type='l1'): (labels, label_weights, bbox_targets, bbox_weights, bbox_gt_bboxes, pos_inds, neg_inds) = fam_target fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list, num_anchors_list = s2anet_head_out fam_cls_losses = [] fam_bbox_losses = [] st_idx = 0 num_total_samples = len(pos_inds) + len( neg_inds) if self.sampling else len(pos_inds) num_total_samples = max(1, num_total_samples) for idx, feat_anchor_num in enumerate(num_anchors_list): # step1: get data feat_labels = labels[st_idx:st_idx + feat_anchor_num] feat_label_weights = label_weights[st_idx:st_idx + feat_anchor_num] feat_bbox_targets = bbox_targets[st_idx:st_idx + feat_anchor_num, :] feat_bbox_weights = bbox_weights[st_idx:st_idx + feat_anchor_num, :] # step2: calc cls loss feat_labels = feat_labels.reshape(-1) feat_label_weights = feat_label_weights.reshape(-1) fam_cls_score = fam_cls_branch_list[idx] fam_cls_score = paddle.squeeze(fam_cls_score, axis=0) fam_cls_score1 = fam_cls_score feat_labels = paddle.to_tensor(feat_labels) feat_labels_one_hot = paddle.nn.functional.one_hot( feat_labels, self.cls_out_channels + 1) feat_labels_one_hot = feat_labels_one_hot[:, 1:] feat_labels_one_hot.stop_gradient = True num_total_samples = paddle.to_tensor( num_total_samples, dtype='float32', stop_gradient=True) fam_cls = F.sigmoid_focal_loss( fam_cls_score1, feat_labels_one_hot, normalizer=num_total_samples, reduction='none') feat_label_weights = feat_label_weights.reshape( feat_label_weights.shape[0], 1) feat_label_weights = np.repeat( feat_label_weights, self.cls_out_channels, axis=1) feat_label_weights = paddle.to_tensor( feat_label_weights, stop_gradient=True) fam_cls = fam_cls * feat_label_weights fam_cls_total = paddle.sum(fam_cls) fam_cls_losses.append(fam_cls_total) # step3: regression loss feat_bbox_targets = paddle.to_tensor( feat_bbox_targets, dtype='float32', stop_gradient=True) feat_bbox_targets = paddle.reshape(feat_bbox_targets, [-1, 5]) fam_bbox_pred = fam_reg_branch_list[idx] fam_bbox_pred = paddle.squeeze(fam_bbox_pred, axis=0) fam_bbox_pred = paddle.reshape(fam_bbox_pred, [-1, 5]) fam_bbox = self.smooth_l1_loss(fam_bbox_pred, feat_bbox_targets) loss_weight = paddle.to_tensor( self.reg_loss_weight, dtype='float32', stop_gradient=True) fam_bbox = paddle.multiply(fam_bbox, loss_weight) feat_bbox_weights = paddle.to_tensor( feat_bbox_weights, stop_gradient=True) fam_bbox = fam_bbox * feat_bbox_weights fam_bbox_total = paddle.sum(fam_bbox) / 
num_total_samples fam_bbox_losses.append(fam_bbox_total) st_idx += feat_anchor_num fam_cls_loss = paddle.add_n(fam_cls_losses) fam_cls_loss_weight = paddle.to_tensor( self.cls_loss_weight[0], dtype='float32', stop_gradient=True) fam_cls_loss = fam_cls_loss * fam_cls_loss_weight fam_reg_loss = paddle.add_n(fam_bbox_losses) return fam_cls_loss, fam_reg_loss def get_odm_loss(self, odm_target, s2anet_head_out, reg_loss_type='l1'): (labels, label_weights, bbox_targets, bbox_weights, bbox_gt_bboxes, pos_inds, neg_inds) = odm_target fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list, num_anchors_list = s2anet_head_out odm_cls_losses = [] odm_bbox_losses = [] st_idx = 0 num_total_samples = len(pos_inds) + len( neg_inds) if self.sampling else len(pos_inds) num_total_samples = max(1, num_total_samples) for idx, feat_anchor_num in enumerate(num_anchors_list): # step1: get data feat_labels = labels[st_idx:st_idx + feat_anchor_num] feat_label_weights = label_weights[st_idx:st_idx + feat_anchor_num] feat_bbox_targets = bbox_targets[st_idx:st_idx + feat_anchor_num, :] feat_bbox_weights = bbox_weights[st_idx:st_idx + feat_anchor_num, :] # step2: calc cls loss feat_labels = feat_labels.reshape(-1) feat_label_weights = feat_label_weights.reshape(-1) odm_cls_score = odm_cls_branch_list[idx] odm_cls_score = paddle.squeeze(odm_cls_score, axis=0) odm_cls_score1 = odm_cls_score feat_labels = paddle.to_tensor(feat_labels) feat_labels_one_hot = paddle.nn.functional.one_hot( feat_labels, self.cls_out_channels + 1) feat_labels_one_hot = feat_labels_one_hot[:, 1:] feat_labels_one_hot.stop_gradient = True num_total_samples = paddle.to_tensor( num_total_samples, dtype='float32', stop_gradient=True) odm_cls = F.sigmoid_focal_loss( odm_cls_score1, feat_labels_one_hot, normalizer=num_total_samples, reduction='none') feat_label_weights = feat_label_weights.reshape( feat_label_weights.shape[0], 1) feat_label_weights = np.repeat( feat_label_weights, self.cls_out_channels, axis=1) feat_label_weights = paddle.to_tensor(feat_label_weights) feat_label_weights.stop_gradient = True odm_cls = odm_cls * feat_label_weights odm_cls_total = paddle.sum(odm_cls) odm_cls_losses.append(odm_cls_total) # # step3: regression loss feat_bbox_targets = paddle.to_tensor( feat_bbox_targets, dtype='float32') feat_bbox_targets = paddle.reshape(feat_bbox_targets, [-1, 5]) feat_bbox_targets.stop_gradient = True odm_bbox_pred = odm_reg_branch_list[idx] odm_bbox_pred = paddle.squeeze(odm_bbox_pred, axis=0) odm_bbox_pred = paddle.reshape(odm_bbox_pred, [-1, 5]) odm_bbox = self.smooth_l1_loss(odm_bbox_pred, feat_bbox_targets) loss_weight = paddle.to_tensor( self.reg_loss_weight, dtype='float32', stop_gradient=True) odm_bbox = paddle.multiply(odm_bbox, loss_weight) feat_bbox_weights = paddle.to_tensor( feat_bbox_weights, stop_gradient=True) odm_bbox = odm_bbox * feat_bbox_weights odm_bbox_total = paddle.sum(odm_bbox) / num_total_samples odm_bbox_losses.append(odm_bbox_total) st_idx += feat_anchor_num odm_cls_loss = paddle.add_n(odm_cls_losses) odm_cls_loss_weight = paddle.to_tensor( self.cls_loss_weight[1], dtype='float32', stop_gradient=True) odm_cls_loss = odm_cls_loss * odm_cls_loss_weight odm_reg_loss = paddle.add_n(odm_bbox_losses) return odm_cls_loss, odm_reg_loss def get_loss(self, head_outs, inputs): fam_cls_list, fam_reg_list, odm_cls_list, odm_reg_list, \ num_anchors_list, base_anchors_list, refine_anchors_list = head_outs # compute loss fam_cls_loss_lst = [] fam_reg_loss_lst = [] odm_cls_loss_lst = [] 
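# Illustrative sketch (not part of the original source): both the FAM and
# the ODM regression branches reuse `smooth_l1_loss` above, the standard
# piecewise penalty 0.5*d*d/delta for |d| < delta and |d| - 0.5*delta
# otherwise, i.e. quadratic near zero and linear in the tails. A quick
# numeric check of the two regimes with the default delta = 1/9
# (the `_demo_*` helper is hypothetical):
def _demo_smooth_l1(delta=1.0 / 9.0):
    import paddle
    diff = paddle.abs(paddle.to_tensor([0.05, 1.0]))  # |pred - label|
    loss = paddle.where(diff < delta, 0.5 * diff * diff / delta,
                        diff - 0.5 * delta)
    return loss  # -> [0.01125, 0.9444...]: quadratic vs. linear regime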
odm_reg_loss_lst = [] batch = len(inputs['gt_rbox']) for i in range(batch): # data_format: (xc, yc, w, h, theta) gt_mask = inputs['pad_gt_mask'][i, :, 0] gt_idx = paddle.nonzero(gt_mask).squeeze(-1) gt_bboxes = paddle.gather(inputs['gt_rbox'][i], gt_idx).numpy() gt_labels = paddle.gather(inputs['gt_class'][i], gt_idx).numpy() is_crowd = paddle.gather(inputs['is_crowd'][i], gt_idx).numpy() gt_labels = gt_labels + 1 anchors_per_image = np.concatenate(base_anchors_list) fam_cls_per_image = [t[i] for t in fam_cls_list] fam_reg_per_image = [t[i] for t in fam_reg_list] odm_cls_per_image = [t[i] for t in odm_cls_list] odm_reg_per_image = [t[i] for t in odm_reg_list] im_s2anet_head_out = (fam_cls_per_image, fam_reg_per_image, odm_cls_per_image, odm_reg_per_image, num_anchors_list) # FAM im_fam_target = self.anchor_assign(anchors_per_image, gt_bboxes, gt_labels, is_crowd) if im_fam_target is not None: im_fam_cls_loss, im_fam_reg_loss = self.get_fam_loss( im_fam_target, im_s2anet_head_out, self.reg_loss_type) fam_cls_loss_lst.append(im_fam_cls_loss) fam_reg_loss_lst.append(im_fam_reg_loss) # ODM refine_anchors_per_image = [t[i] for t in refine_anchors_list] refine_anchors_per_image = paddle.concat( refine_anchors_per_image).numpy() im_odm_target = self.anchor_assign(refine_anchors_per_image, gt_bboxes, gt_labels, is_crowd) if im_odm_target is not None: im_odm_cls_loss, im_odm_reg_loss = self.get_odm_loss( im_odm_target, im_s2anet_head_out, self.reg_loss_type) odm_cls_loss_lst.append(im_odm_cls_loss) odm_reg_loss_lst.append(im_odm_reg_loss) fam_cls_loss = paddle.add_n(fam_cls_loss_lst) / batch fam_reg_loss = paddle.add_n(fam_reg_loss_lst) / batch odm_cls_loss = paddle.add_n(odm_cls_loss_lst) / batch odm_reg_loss = paddle.add_n(odm_reg_loss_lst) / batch loss = fam_cls_loss + fam_reg_loss + odm_cls_loss + odm_reg_loss return { 'loss': loss, 'fam_cls_loss': fam_cls_loss, 'fam_reg_loss': fam_reg_loss, 'odm_cls_loss': odm_cls_loss, 'odm_reg_loss': odm_reg_loss } def bbox_decode(self, preds, anchors, wh_ratio_clip=1e-6): """decode bbox from deltas Args: preds: [B, L, 5] anchors: [1, L, 5] return: bboxes: [B, L, 5] """ preds = paddle.add(paddle.multiply(preds, self.stds), self.means) dx, dy, dw, dh, dangle = paddle.split(preds, 5, axis=-1) max_ratio = np.abs(np.log(wh_ratio_clip)) dw = paddle.clip(dw, min=-max_ratio, max=max_ratio) dh = paddle.clip(dh, min=-max_ratio, max=max_ratio) rroi_x, rroi_y, rroi_w, rroi_h, rroi_angle = paddle.split( anchors, 5, axis=-1) gx = dx * rroi_w * paddle.cos(rroi_angle) - dy * rroi_h * paddle.sin( rroi_angle) + rroi_x gy = dx * rroi_w * paddle.sin(rroi_angle) + dy * rroi_h * paddle.cos( rroi_angle) + rroi_y gw = rroi_w * dw.exp() gh = rroi_h * dh.exp() ga = np.pi * dangle + rroi_angle ga = (ga + np.pi / 4) % np.pi - np.pi / 4 bboxes = paddle.concat([gx, gy, gw, gh, ga], axis=-1) return bboxes def rbox2poly(self, rboxes): """ rboxes: [x_ctr,y_ctr,w,h,angle] to polys: [x0,y0,x1,y1,x2,y2,x3,y3] """ N = rboxes.shape[0] x_ctr = rboxes[:, 0] y_ctr = rboxes[:, 1] width = rboxes[:, 2] height = rboxes[:, 3] angle = rboxes[:, 4] tl_x, tl_y, br_x, br_y = -width * 0.5, -height * 0.5, width * 0.5, height * 0.5 normal_rects = paddle.stack( [tl_x, br_x, br_x, tl_x, tl_y, tl_y, br_y, br_y], axis=0) normal_rects = paddle.reshape(normal_rects, [2, 4, N]) normal_rects = paddle.transpose(normal_rects, [2, 0, 1]) sin, cos = paddle.sin(angle), paddle.cos(angle) # M: [N,2,2] M = paddle.stack([cos, -sin, sin, cos], axis=0) M = paddle.reshape(M, [2, 2, N]) M = paddle.transpose(M, [2, 0, 1]) # polys: 
[N,8] polys = paddle.matmul(M, normal_rects) polys = paddle.transpose(polys, [2, 1, 0]) polys = paddle.reshape(polys, [-1, N]) polys = paddle.transpose(polys, [1, 0]) tmp = paddle.stack( [x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr], axis=1) polys = polys + tmp return polys ================================================ FILE: ppdet/modeling/heads/simota_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The code is based on: # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/yolox_head.py from __future__ import absolute_import from __future__ import division from __future__ import print_function import math from functools import partial import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Normal, Constant from ppdet.core.workspace import register from ppdet.modeling.bbox_utils import distance2bbox, bbox2distance from ppdet.data.transform.atss_assigner import bbox_overlaps from .gfl_head import GFLHead @register class OTAHead(GFLHead): """ OTAHead Args: conv_feat (object): Instance of 'FCOSFeat' num_classes (int): Number of classes fpn_stride (list): The stride of each FPN Layer prior_prob (float): Used to set the bias init for the class prediction layer loss_qfl (object): Instance of QualityFocalLoss. loss_dfl (object): Instance of DistributionFocalLoss. loss_bbox (object): Instance of bbox loss. assigner (object): Instance of label assigner. reg_max: Max value of integral set :math: `{0, ..., reg_max}` n QFL setting. Default: 16. """ __inject__ = [ 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', 'assigner', 'nms' ] __shared__ = ['num_classes'] def __init__(self, conv_feat='FCOSFeat', dgqp_module=None, num_classes=80, fpn_stride=[8, 16, 32, 64, 128], prior_prob=0.01, loss_class='QualityFocalLoss', loss_dfl='DistributionFocalLoss', loss_bbox='GIoULoss', assigner='SimOTAAssigner', reg_max=16, feat_in_chan=256, nms=None, nms_pre=1000, cell_offset=0): super(OTAHead, self).__init__( conv_feat=conv_feat, dgqp_module=dgqp_module, num_classes=num_classes, fpn_stride=fpn_stride, prior_prob=prior_prob, loss_class=loss_class, loss_dfl=loss_dfl, loss_bbox=loss_bbox, reg_max=reg_max, feat_in_chan=feat_in_chan, nms=nms, nms_pre=nms_pre, cell_offset=cell_offset) self.conv_feat = conv_feat self.dgqp_module = dgqp_module self.num_classes = num_classes self.fpn_stride = fpn_stride self.prior_prob = prior_prob self.loss_qfl = loss_class self.loss_dfl = loss_dfl self.loss_bbox = loss_bbox self.reg_max = reg_max self.feat_in_chan = feat_in_chan self.nms = nms self.nms_pre = nms_pre self.cell_offset = cell_offset self.use_sigmoid = self.loss_qfl.use_sigmoid self.assigner = assigner def _get_target_single(self, flatten_cls_pred, flatten_center_and_stride, flatten_bbox, gt_bboxes, gt_labels): """Compute targets for priors in a single image. 
""" pos_num, label, label_weight, bbox_target = self.assigner( F.sigmoid(flatten_cls_pred), flatten_center_and_stride, flatten_bbox, gt_bboxes, gt_labels) return (pos_num, label, label_weight, bbox_target) def get_loss(self, head_outs, gt_meta): cls_scores, bbox_preds = head_outs num_level_anchors = [ featmap.shape[-2] * featmap.shape[-1] for featmap in cls_scores ] num_imgs = gt_meta['im_id'].shape[0] featmap_sizes = [[featmap.shape[-2], featmap.shape[-1]] for featmap in cls_scores] decode_bbox_preds = [] center_and_strides = [] for featmap_size, stride, bbox_pred in zip(featmap_sizes, self.fpn_stride, bbox_preds): # center in origin image yy, xx = self.get_single_level_center_point(featmap_size, stride, self.cell_offset) center_and_stride = paddle.stack([xx, yy, stride, stride], -1).tile( [num_imgs, 1, 1]) center_and_strides.append(center_and_stride) center_in_feature = center_and_stride.reshape( [-1, 4])[:, :-2] / stride bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( [num_imgs, -1, 4 * (self.reg_max + 1)]) pred_distances = self.distribution_project(bbox_pred) decode_bbox_pred_wo_stride = distance2bbox( center_in_feature, pred_distances).reshape([num_imgs, -1, 4]) decode_bbox_preds.append(decode_bbox_pred_wo_stride * stride) flatten_cls_preds = [ cls_pred.transpose([0, 2, 3, 1]).reshape( [num_imgs, -1, self.cls_out_channels]) for cls_pred in cls_scores ] flatten_cls_preds = paddle.concat(flatten_cls_preds, axis=1) flatten_bboxes = paddle.concat(decode_bbox_preds, axis=1) flatten_center_and_strides = paddle.concat(center_and_strides, axis=1) gt_boxes, gt_labels = gt_meta['gt_bbox'], gt_meta['gt_class'] pos_num_l, label_l, label_weight_l, bbox_target_l = [], [], [], [] for flatten_cls_pred,flatten_center_and_stride,flatten_bbox,gt_box, gt_label \ in zip(flatten_cls_preds.detach(),flatten_center_and_strides.detach(), \ flatten_bboxes.detach(),gt_boxes, gt_labels): pos_num, label, label_weight, bbox_target = self._get_target_single( flatten_cls_pred, flatten_center_and_stride, flatten_bbox, gt_box, gt_label) pos_num_l.append(pos_num) label_l.append(label) label_weight_l.append(label_weight) bbox_target_l.append(bbox_target) labels = paddle.to_tensor(np.stack(label_l, axis=0)) label_weights = paddle.to_tensor(np.stack(label_weight_l, axis=0)) bbox_targets = paddle.to_tensor(np.stack(bbox_target_l, axis=0)) center_and_strides_list = self._images_to_levels( flatten_center_and_strides, num_level_anchors) labels_list = self._images_to_levels(labels, num_level_anchors) label_weights_list = self._images_to_levels(label_weights, num_level_anchors) bbox_targets_list = self._images_to_levels(bbox_targets, num_level_anchors) num_total_pos = sum(pos_num_l) try: paddle.distributed.all_reduce(paddle.to_tensor(num_total_pos)) num_total_pos = paddle.clip( num_total_pos / paddle.distributed.get_world_size(), min=1.) 
except: num_total_pos = max(num_total_pos, 1) loss_bbox_list, loss_dfl_list, loss_qfl_list, avg_factor = [], [], [], [] for cls_score, bbox_pred, center_and_strides, labels, label_weights, bbox_targets, stride in zip( cls_scores, bbox_preds, center_and_strides_list, labels_list, label_weights_list, bbox_targets_list, self.fpn_stride): center_and_strides = center_and_strides.reshape([-1, 4]) cls_score = cls_score.transpose([0, 2, 3, 1]).reshape( [-1, self.cls_out_channels]) bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( [-1, 4 * (self.reg_max + 1)]) bbox_targets = bbox_targets.reshape([-1, 4]) labels = labels.reshape([-1]) label_weights = label_weights.reshape([-1]) bg_class_ind = self.num_classes pos_inds = paddle.nonzero( paddle.logical_and((labels >= 0), (labels < bg_class_ind)), as_tuple=False).squeeze(1) score = np.zeros(labels.shape) if len(pos_inds) > 0: pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0) pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0) pos_centers = paddle.gather( center_and_strides[:, :-2], pos_inds, axis=0) / stride weight_targets = F.sigmoid(cls_score.detach()) weight_targets = paddle.gather( weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) pos_decode_bbox_pred = distance2bbox(pos_centers, pos_bbox_pred_corners) pos_decode_bbox_targets = pos_bbox_targets / stride bbox_iou = bbox_overlaps( pos_decode_bbox_pred.detach().numpy(), pos_decode_bbox_targets.detach().numpy(), is_aligned=True) score[pos_inds.numpy()] = bbox_iou pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1]) target_corners = bbox2distance(pos_centers, pos_decode_bbox_targets, self.reg_max).reshape([-1]) # regression loss loss_bbox = paddle.sum( self.loss_bbox(pos_decode_bbox_pred, pos_decode_bbox_targets) * weight_targets) # dfl loss loss_dfl = self.loss_dfl( pred_corners, target_corners, weight=weight_targets.expand([-1, 4]).reshape([-1]), avg_factor=4.0) else: loss_bbox = bbox_pred.sum() * 0 loss_dfl = bbox_pred.sum() * 0 weight_targets = paddle.to_tensor([0], dtype='float32') # qfl loss score = paddle.to_tensor(score) loss_qfl = self.loss_qfl( cls_score, (labels, score), weight=label_weights, avg_factor=num_total_pos) loss_bbox_list.append(loss_bbox) loss_dfl_list.append(loss_dfl) loss_qfl_list.append(loss_qfl) avg_factor.append(weight_targets.sum()) avg_factor = sum(avg_factor) try: paddle.distributed.all_reduce(paddle.to_tensor(avg_factor)) avg_factor = paddle.clip( avg_factor / paddle.distributed.get_world_size(), min=1) except: avg_factor = max(avg_factor.item(), 1) if avg_factor <= 0: loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) loss_bbox = paddle.to_tensor( 0, dtype='float32', stop_gradient=False) loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) else: losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list)) losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list)) loss_qfl = sum(loss_qfl_list) loss_bbox = sum(losses_bbox) loss_dfl = sum(losses_dfl) loss_states = dict( loss_qfl=loss_qfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl) return loss_states @register class OTAVFLHead(OTAHead): __inject__ = [ 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', 'assigner', 'nms' ] __shared__ = ['num_classes'] def __init__(self, conv_feat='FCOSFeat', dgqp_module=None, num_classes=80, fpn_stride=[8, 16, 32, 64, 128], prior_prob=0.01, loss_class='VarifocalLoss', loss_dfl='DistributionFocalLoss', loss_bbox='GIoULoss', 
assigner='SimOTAAssigner', reg_max=16, feat_in_chan=256, nms=None, nms_pre=1000, cell_offset=0): super(OTAVFLHead, self).__init__( conv_feat=conv_feat, dgqp_module=dgqp_module, num_classes=num_classes, fpn_stride=fpn_stride, prior_prob=prior_prob, loss_class=loss_class, loss_dfl=loss_dfl, loss_bbox=loss_bbox, reg_max=reg_max, feat_in_chan=feat_in_chan, nms=nms, nms_pre=nms_pre, cell_offset=cell_offset) self.conv_feat = conv_feat self.dgqp_module = dgqp_module self.num_classes = num_classes self.fpn_stride = fpn_stride self.prior_prob = prior_prob self.loss_vfl = loss_class self.loss_dfl = loss_dfl self.loss_bbox = loss_bbox self.reg_max = reg_max self.feat_in_chan = feat_in_chan self.nms = nms self.nms_pre = nms_pre self.cell_offset = cell_offset self.use_sigmoid = self.loss_vfl.use_sigmoid self.assigner = assigner def get_loss(self, head_outs, gt_meta): cls_scores, bbox_preds = head_outs num_level_anchors = [ featmap.shape[-2] * featmap.shape[-1] for featmap in cls_scores ] num_imgs = gt_meta['im_id'].shape[0] featmap_sizes = [[featmap.shape[-2], featmap.shape[-1]] for featmap in cls_scores] decode_bbox_preds = [] center_and_strides = [] for featmap_size, stride, bbox_pred in zip(featmap_sizes, self.fpn_stride, bbox_preds): # center in origin image yy, xx = self.get_single_level_center_point(featmap_size, stride, self.cell_offset) strides = paddle.full((len(xx), ), stride) center_and_stride = paddle.stack([xx, yy, strides, strides], -1).tile([num_imgs, 1, 1]) center_and_strides.append(center_and_stride) center_in_feature = center_and_stride.reshape( [-1, 4])[:, :-2] / stride bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( [num_imgs, -1, 4 * (self.reg_max + 1)]) pred_distances = self.distribution_project(bbox_pred) decode_bbox_pred_wo_stride = distance2bbox( center_in_feature, pred_distances).reshape([num_imgs, -1, 4]) decode_bbox_preds.append(decode_bbox_pred_wo_stride * stride) flatten_cls_preds = [ cls_pred.transpose([0, 2, 3, 1]).reshape( [num_imgs, -1, self.cls_out_channels]) for cls_pred in cls_scores ] flatten_cls_preds = paddle.concat(flatten_cls_preds, axis=1) flatten_bboxes = paddle.concat(decode_bbox_preds, axis=1) flatten_center_and_strides = paddle.concat(center_and_strides, axis=1) gt_boxes, gt_labels = gt_meta['gt_bbox'], gt_meta['gt_class'] pos_num_l, label_l, label_weight_l, bbox_target_l = [], [], [], [] for flatten_cls_pred, flatten_center_and_stride, flatten_bbox,gt_box,gt_label \ in zip(flatten_cls_preds.detach(), flatten_center_and_strides.detach(), \ flatten_bboxes.detach(),gt_boxes,gt_labels): pos_num, label, label_weight, bbox_target = self._get_target_single( flatten_cls_pred, flatten_center_and_stride, flatten_bbox, gt_box, gt_label) pos_num_l.append(pos_num) label_l.append(label) label_weight_l.append(label_weight) bbox_target_l.append(bbox_target) labels = paddle.to_tensor(np.stack(label_l, axis=0)) label_weights = paddle.to_tensor(np.stack(label_weight_l, axis=0)) bbox_targets = paddle.to_tensor(np.stack(bbox_target_l, axis=0)) center_and_strides_list = self._images_to_levels( flatten_center_and_strides, num_level_anchors) labels_list = self._images_to_levels(labels, num_level_anchors) label_weights_list = self._images_to_levels(label_weights, num_level_anchors) bbox_targets_list = self._images_to_levels(bbox_targets, num_level_anchors) num_total_pos = sum(pos_num_l) try: paddle.distributed.all_reduce(paddle.to_tensor(num_total_pos)) num_total_pos = paddle.clip( num_total_pos / paddle.distributed.get_world_size(), min=1.) 
except: num_total_pos = max(num_total_pos, 1) loss_bbox_list, loss_dfl_list, loss_vfl_list, avg_factor = [], [], [], [] for cls_score, bbox_pred, center_and_strides, labels, label_weights, bbox_targets, stride in zip( cls_scores, bbox_preds, center_and_strides_list, labels_list, label_weights_list, bbox_targets_list, self.fpn_stride): center_and_strides = center_and_strides.reshape([-1, 4]) cls_score = cls_score.transpose([0, 2, 3, 1]).reshape( [-1, self.cls_out_channels]) bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( [-1, 4 * (self.reg_max + 1)]) bbox_targets = bbox_targets.reshape([-1, 4]) labels = labels.reshape([-1]) bg_class_ind = self.num_classes pos_inds = paddle.nonzero( paddle.logical_and((labels >= 0), (labels < bg_class_ind)), as_tuple=False).squeeze(1) # vfl vfl_score = np.zeros(cls_score.shape) if len(pos_inds) > 0: pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0) pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0) pos_centers = paddle.gather( center_and_strides[:, :-2], pos_inds, axis=0) / stride weight_targets = F.sigmoid(cls_score.detach()) weight_targets = paddle.gather( weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) pos_decode_bbox_pred = distance2bbox(pos_centers, pos_bbox_pred_corners) pos_decode_bbox_targets = pos_bbox_targets / stride bbox_iou = bbox_overlaps( pos_decode_bbox_pred.detach().numpy(), pos_decode_bbox_targets.detach().numpy(), is_aligned=True) # vfl pos_labels = paddle.gather(labels, pos_inds, axis=0) vfl_score[pos_inds.numpy(), pos_labels] = bbox_iou pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1]) target_corners = bbox2distance(pos_centers, pos_decode_bbox_targets, self.reg_max).reshape([-1]) # regression loss loss_bbox = paddle.sum( self.loss_bbox(pos_decode_bbox_pred, pos_decode_bbox_targets) * weight_targets) # dfl loss loss_dfl = self.loss_dfl( pred_corners, target_corners, weight=weight_targets.expand([-1, 4]).reshape([-1]), avg_factor=4.0) else: loss_bbox = bbox_pred.sum() * 0 loss_dfl = bbox_pred.sum() * 0 weight_targets = paddle.to_tensor([0], dtype='float32') # vfl loss num_pos_avg_per_gpu = num_total_pos vfl_score = paddle.to_tensor(vfl_score) loss_vfl = self.loss_vfl( cls_score, vfl_score, avg_factor=num_pos_avg_per_gpu) loss_bbox_list.append(loss_bbox) loss_dfl_list.append(loss_dfl) loss_vfl_list.append(loss_vfl) avg_factor.append(weight_targets.sum()) avg_factor = sum(avg_factor) try: paddle.distributed.all_reduce(paddle.to_tensor(avg_factor)) avg_factor = paddle.clip( avg_factor / paddle.distributed.get_world_size(), min=1) except: avg_factor = max(avg_factor.item(), 1) if avg_factor <= 0: loss_vfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) loss_bbox = paddle.to_tensor( 0, dtype='float32', stop_gradient=False) loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) else: losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list)) losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list)) loss_vfl = sum(loss_vfl_list) loss_bbox = sum(losses_bbox) loss_dfl = sum(losses_dfl) loss_states = dict( loss_vfl=loss_vfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl) return loss_states ================================================ FILE: ppdet/modeling/heads/solov2_head.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from paddle import ParamAttr import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal, Constant from ppdet.modeling.layers import ConvNormLayer, MaskMatrixNMS, DropBlock from ppdet.core.workspace import register from six.moves import zip import numpy as np __all__ = ['SOLOv2Head'] @register class SOLOv2MaskHead(nn.Layer): """ MaskHead of SOLOv2. The code of this function is based on: https://github.com/WXinlong/SOLO/blob/master/mmdet/models/mask_heads/mask_feat_head.py Args: in_channels (int): The channel number of input Tensor. out_channels (int): The channel number of output Tensor. start_level (int): The position where the input starts. end_level (int): The position where the input ends. use_dcn_in_tower (bool): Whether to use dcn in tower or not. """ __shared__ = ['norm_type'] def __init__(self, in_channels=256, mid_channels=128, out_channels=256, start_level=0, end_level=3, use_dcn_in_tower=False, norm_type='gn'): super(SOLOv2MaskHead, self).__init__() assert start_level >= 0 and end_level >= start_level self.in_channels = in_channels self.out_channels = out_channels self.mid_channels = mid_channels self.use_dcn_in_tower = use_dcn_in_tower self.range_level = end_level - start_level + 1 self.use_dcn = True if self.use_dcn_in_tower else False self.convs_all_levels = [] self.norm_type = norm_type for i in range(start_level, end_level + 1): conv_feat_name = 'mask_feat_head.convs_all_levels.{}'.format(i) conv_pre_feat = nn.Sequential() if i == start_level: conv_pre_feat.add_sublayer( conv_feat_name + '.conv' + str(i), ConvNormLayer( ch_in=self.in_channels, ch_out=self.mid_channels, filter_size=3, stride=1, use_dcn=self.use_dcn, norm_type=self.norm_type)) self.add_sublayer('conv_pre_feat' + str(i), conv_pre_feat) self.convs_all_levels.append(conv_pre_feat) else: for j in range(i): ch_in = 0 if j == 0: ch_in = self.in_channels + 2 if i == end_level else self.in_channels else: ch_in = self.mid_channels conv_pre_feat.add_sublayer( conv_feat_name + '.conv' + str(j), ConvNormLayer( ch_in=ch_in, ch_out=self.mid_channels, filter_size=3, stride=1, use_dcn=self.use_dcn, norm_type=self.norm_type)) conv_pre_feat.add_sublayer( conv_feat_name + '.conv' + str(j) + 'act', nn.ReLU()) conv_pre_feat.add_sublayer( 'upsample' + str(i) + str(j), nn.Upsample( scale_factor=2, mode='bilinear')) self.add_sublayer('conv_pre_feat' + str(i), conv_pre_feat) self.convs_all_levels.append(conv_pre_feat) conv_pred_name = 'mask_feat_head.conv_pred.0' self.conv_pred = self.add_sublayer( conv_pred_name, ConvNormLayer( ch_in=self.mid_channels, ch_out=self.out_channels, filter_size=1, stride=1, use_dcn=self.use_dcn, norm_type=self.norm_type)) def forward(self, inputs): """ Get SOLOv2MaskHead output. 
Args: inputs(list[Tensor]): feature map from each necks with shape of [N, C, H, W] Returns: ins_pred(Tensor): Output of SOLOv2MaskHead head """ feat_all_level = F.relu(self.convs_all_levels[0](inputs[0])) for i in range(1, self.range_level): input_p = inputs[i] if i == (self.range_level - 1): input_feat = input_p x_range = paddle.linspace( -1, 1, input_feat.shape[-1], dtype='float32') y_range = paddle.linspace( -1, 1, input_feat.shape[-2], dtype='float32') y, x = paddle.meshgrid([y_range, x_range]) x = paddle.unsqueeze(x, [0, 1]) y = paddle.unsqueeze(y, [0, 1]) y = paddle.expand( y, shape=[input_feat.shape[0], 1, -1, -1]) x = paddle.expand( x, shape=[input_feat.shape[0], 1, -1, -1]) coord_feat = paddle.concat([x, y], axis=1) input_p = paddle.concat([input_p, coord_feat], axis=1) feat_all_level = paddle.add(feat_all_level, self.convs_all_levels[i](input_p)) ins_pred = F.relu(self.conv_pred(feat_all_level)) return ins_pred @register class SOLOv2Head(nn.Layer): """ Head block for SOLOv2 network Args: num_classes (int): Number of output classes. in_channels (int): Number of input channels. seg_feat_channels (int): Num_filters of kernel & categroy branch convolution operation. stacked_convs (int): Times of convolution operation. num_grids (list[int]): List of feature map grids size. kernel_out_channels (int): Number of output channels in kernel branch. dcn_v2_stages (list): Which stage use dcn v2 in tower. It is between [0, stacked_convs). segm_strides (list[int]): List of segmentation area stride. solov2_loss (object): SOLOv2Loss instance. score_threshold (float): Threshold of categroy score. mask_nms (object): MaskMatrixNMS instance. """ __inject__ = ['solov2_loss', 'mask_nms'] __shared__ = ['norm_type', 'num_classes'] def __init__(self, num_classes=80, in_channels=256, seg_feat_channels=256, stacked_convs=4, num_grids=[40, 36, 24, 16, 12], kernel_out_channels=256, dcn_v2_stages=[], segm_strides=[8, 8, 16, 32, 32], solov2_loss=None, score_threshold=0.1, mask_threshold=0.5, mask_nms=None, norm_type='gn', drop_block=False): super(SOLOv2Head, self).__init__() self.num_classes = num_classes self.in_channels = in_channels self.seg_num_grids = num_grids self.cate_out_channels = self.num_classes self.seg_feat_channels = seg_feat_channels self.stacked_convs = stacked_convs self.kernel_out_channels = kernel_out_channels self.dcn_v2_stages = dcn_v2_stages self.segm_strides = segm_strides self.solov2_loss = solov2_loss self.mask_nms = mask_nms self.score_threshold = score_threshold self.mask_threshold = mask_threshold self.norm_type = norm_type self.drop_block = drop_block self.kernel_pred_convs = [] self.cate_pred_convs = [] for i in range(self.stacked_convs): use_dcn = True if i in self.dcn_v2_stages else False ch_in = self.in_channels + 2 if i == 0 else self.seg_feat_channels kernel_conv = self.add_sublayer( 'bbox_head.kernel_convs.' + str(i), ConvNormLayer( ch_in=ch_in, ch_out=self.seg_feat_channels, filter_size=3, stride=1, use_dcn=use_dcn, norm_type=self.norm_type)) self.kernel_pred_convs.append(kernel_conv) ch_in = self.in_channels if i == 0 else self.seg_feat_channels cate_conv = self.add_sublayer( 'bbox_head.cate_convs.' 
+ str(i), ConvNormLayer( ch_in=ch_in, ch_out=self.seg_feat_channels, filter_size=3, stride=1, use_dcn=use_dcn, norm_type=self.norm_type)) self.cate_pred_convs.append(cate_conv) self.solo_kernel = self.add_sublayer( 'bbox_head.solo_kernel', nn.Conv2D( self.seg_feat_channels, self.kernel_out_channels, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=True)) self.solo_cate = self.add_sublayer( 'bbox_head.solo_cate', nn.Conv2D( self.seg_feat_channels, self.cate_out_channels, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.01)), bias_attr=ParamAttr(initializer=Constant( value=float(-np.log((1 - 0.01) / 0.01)))))) if self.drop_block and self.training: self.drop_block_fun = DropBlock( block_size=3, keep_prob=0.9, name='solo_cate.dropblock') def _points_nms(self, heat, kernel_size=2): hmax = F.max_pool2d(heat, kernel_size=kernel_size, stride=1, padding=1) keep = paddle.cast((hmax[:, :, :-1, :-1] == heat), 'float32') return heat * keep def _split_feats(self, feats): return (F.interpolate( feats[0], scale_factor=0.5, align_corners=False, align_mode=0, mode='bilinear'), feats[1], feats[2], feats[3], F.interpolate( feats[4], size=feats[3].shape[-2:], mode='bilinear', align_corners=False, align_mode=0)) def forward(self, input): """ Get SOLOv2 head output Args: input (list): List of Tensors, output of backbone or neck stages Returns: cate_pred_list (list): Tensors of each category branch layer kernel_pred_list (list): Tensors of each kernel branch layer """ feats = self._split_feats(input) cate_pred_list = [] kernel_pred_list = [] for idx in range(len(self.seg_num_grids)): cate_pred, kernel_pred = self._get_output_single(feats[idx], idx) cate_pred_list.append(cate_pred) kernel_pred_list.append(kernel_pred) return cate_pred_list, kernel_pred_list def _get_output_single(self, input, idx): ins_kernel_feat = input # CoordConv x_range = paddle.linspace( -1, 1, ins_kernel_feat.shape[-1], dtype='float32') y_range = paddle.linspace( -1, 1, ins_kernel_feat.shape[-2], dtype='float32') y, x = paddle.meshgrid([y_range, x_range]) x = paddle.unsqueeze(x, [0, 1]) y = paddle.unsqueeze(y, [0, 1]) y = paddle.expand( y, shape=[ins_kernel_feat.shape[0], 1, -1, -1]) x = paddle.expand( x, shape=[ins_kernel_feat.shape[0], 1, -1, -1]) coord_feat = paddle.concat([x, y], axis=1) ins_kernel_feat = paddle.concat([ins_kernel_feat, coord_feat], axis=1) # kernel branch kernel_feat = ins_kernel_feat seg_num_grid = self.seg_num_grids[idx] kernel_feat = F.interpolate( kernel_feat, size=[seg_num_grid, seg_num_grid], mode='bilinear', align_corners=False, align_mode=0) cate_feat = kernel_feat[:, :-2, :, :] for kernel_layer in self.kernel_pred_convs: kernel_feat = F.relu(kernel_layer(kernel_feat)) if self.drop_block and self.training: kernel_feat = self.drop_block_fun(kernel_feat) kernel_pred = self.solo_kernel(kernel_feat) # cate branch for cate_layer in self.cate_pred_convs: cate_feat = F.relu(cate_layer(cate_feat)) if self.drop_block and self.training: cate_feat = self.drop_block_fun(cate_feat) cate_pred = self.solo_cate(cate_feat) if not self.training: cate_pred = self._points_nms(F.sigmoid(cate_pred), kernel_size=2) cate_pred = paddle.transpose(cate_pred, [0, 2, 3, 1]) return cate_pred, kernel_pred def get_loss(self, cate_preds, kernel_preds, ins_pred, ins_labels, cate_labels, grid_order_list, fg_num): """ Get loss of network of SOLOv2. Args: cate_preds (list): Tensor list of categroy branch output. 
kernel_preds (list): Tensor list of kernel branch output. ins_pred (list): Tensor list of instance branch output. ins_labels (list): List of instance labels per batch. cate_labels (list): List of category labels per batch. grid_order_list (list): List of indices per grid. fg_num (int): Number of positive samples in a mini-batch. Returns: loss_ins (Tensor): The instance loss Tensor of SOLOv2 network. loss_cate (Tensor): The category loss Tensor of SOLOv2 network. """ batch_size = grid_order_list[0].shape[0] ins_pred_list = [] for kernel_preds_level, grid_orders_level in zip(kernel_preds, grid_order_list): if grid_orders_level.shape[1] == 0: ins_pred_list.append(None) continue grid_orders_level = paddle.reshape(grid_orders_level, [-1]) reshape_pred = paddle.reshape( kernel_preds_level, shape=(kernel_preds_level.shape[0], kernel_preds_level.shape[1], -1)) reshape_pred = paddle.transpose(reshape_pred, [0, 2, 1]) reshape_pred = paddle.reshape( reshape_pred, shape=(-1, reshape_pred.shape[2])) gathered_pred = paddle.gather(reshape_pred, index=grid_orders_level) gathered_pred = paddle.reshape( gathered_pred, shape=[batch_size, -1, gathered_pred.shape[1]]) cur_ins_pred = ins_pred cur_ins_pred = paddle.reshape( cur_ins_pred, shape=(cur_ins_pred.shape[0], cur_ins_pred.shape[1], -1)) ins_pred_conv = paddle.matmul(gathered_pred, cur_ins_pred) cur_ins_pred = paddle.reshape( ins_pred_conv, shape=(-1, ins_pred.shape[-2], ins_pred.shape[-1])) ins_pred_list.append(cur_ins_pred) num_ins = paddle.sum(fg_num) cate_preds = [ paddle.reshape( paddle.transpose(cate_pred, [0, 2, 3, 1]), shape=(-1, self.cate_out_channels)) for cate_pred in cate_preds ] flatten_cate_preds = paddle.concat(cate_preds) new_cate_labels = [] for cate_label in cate_labels: new_cate_labels.append(paddle.reshape(cate_label, shape=[-1])) cate_labels = paddle.concat(new_cate_labels) loss_ins, loss_cate = self.solov2_loss( ins_pred_list, ins_labels, flatten_cate_preds, cate_labels, num_ins) return {'loss_ins': loss_ins, 'loss_cate': loss_cate} def get_prediction(self, cate_preds, kernel_preds, seg_pred, im_shape, scale_factor): """ Get prediction result of SOLOv2 network Args: cate_preds (list): List of Variables, output of category branch. kernel_preds (list): List of Variables, output of kernel branch. seg_pred (list): List of Variables, output of mask head stages. im_shape (Variables): [h, w] for input images. scale_factor (Variables): [scale, scale] for input images. Returns: seg_masks (Tensor): The prediction segmentation. cate_labels (Tensor): The prediction category label of each segmentation. cate_scores (Tensor): The prediction score of each segmentation. 
""" num_levels = len(cate_preds) featmap_size = seg_pred.shape[-2:] seg_masks_list = [] cate_labels_list = [] cate_scores_list = [] cate_preds = [cate_pred * 1.0 for cate_pred in cate_preds] kernel_preds = [kernel_pred * 1.0 for kernel_pred in kernel_preds] # Currently only supports batch size == 1 for idx in range(1): cate_pred_list = [ paddle.reshape( cate_preds[i][idx], shape=(-1, self.cate_out_channels)) for i in range(num_levels) ] seg_pred_list = seg_pred kernel_pred_list = [ paddle.reshape( paddle.transpose(kernel_preds[i][idx], [1, 2, 0]), shape=(-1, self.kernel_out_channels)) for i in range(num_levels) ] cate_pred_list = paddle.concat(cate_pred_list, axis=0) kernel_pred_list = paddle.concat(kernel_pred_list, axis=0) seg_masks, cate_labels, cate_scores = self.get_seg_single( cate_pred_list, seg_pred_list, kernel_pred_list, featmap_size, im_shape[idx], scale_factor[idx][0]) bbox_num = cate_labels.shape[0:1] return seg_masks, cate_labels, cate_scores, bbox_num def get_seg_single(self, cate_preds, seg_preds, kernel_preds, featmap_size, im_shape, scale_factor): """ The code of this function is based on: https://github.com/WXinlong/SOLO/blob/master/mmdet/models/anchor_heads/solov2_head.py#L385 """ h = paddle.cast(im_shape[0], 'int32') w = paddle.cast(im_shape[1], 'int32') upsampled_size_out = [featmap_size[0] * 4, featmap_size[1] * 4] y = paddle.zeros(shape=cate_preds.shape, dtype='float32') inds = paddle.where(cate_preds > self.score_threshold, cate_preds, y) inds = paddle.nonzero(inds) cate_preds = paddle.reshape(cate_preds, shape=[-1]) # Prevent empty and increase fake data ind_a = paddle.cast(paddle.shape(kernel_preds)[0:1], 'int64') ind_b = paddle.zeros(shape=[1], dtype='int64') inds_end = paddle.unsqueeze(paddle.concat([ind_a, ind_b]), 0) inds = paddle.concat([inds, inds_end]) kernel_preds_end = paddle.ones( shape=[1, self.kernel_out_channels], dtype='float32') kernel_preds = paddle.concat([kernel_preds, kernel_preds_end]) cate_preds = paddle.concat( [cate_preds, paddle.zeros( shape=[1], dtype='float32')]) # cate_labels & kernel_preds cate_labels = inds[:, 1] kernel_preds = paddle.gather(kernel_preds, index=inds[:, 0]) cate_score_idx = paddle.add(inds[:, 0] * self.cate_out_channels, cate_labels) cate_scores = paddle.gather(cate_preds, index=cate_score_idx) size_trans = np.power(self.seg_num_grids, 2) strides = [] for _ind in range(len(self.segm_strides)): strides.append( paddle.full( shape=[int(size_trans[_ind])], fill_value=self.segm_strides[_ind], dtype="int32")) strides = paddle.concat(strides) strides = paddle.concat( [strides, paddle.zeros( shape=[1], dtype='int32')]) strides = paddle.gather(strides, index=inds[:, 0]) # mask encoding. 
kernel_preds = paddle.unsqueeze(kernel_preds, [2, 3]) seg_preds = F.conv2d(seg_preds, kernel_preds) seg_preds = F.sigmoid(paddle.squeeze(seg_preds, [0])) seg_masks = seg_preds > self.mask_threshold seg_masks = paddle.cast(seg_masks, 'float32') sum_masks = paddle.sum(seg_masks, axis=[1, 2]) y = paddle.zeros(shape=paddle.shape(sum_masks), dtype='float32') keep = paddle.where(sum_masks > strides.cast(sum_masks.dtype), sum_masks, y) keep = paddle.nonzero(keep) keep = paddle.squeeze(keep, axis=[1]) # Prevent empty and increase fake data keep_other = paddle.concat( [keep, paddle.cast(paddle.shape(sum_masks)[0:1] - 1, 'int64')]) keep_scores = paddle.concat( [keep, paddle.cast(paddle.shape(sum_masks)[0:1], 'int64')]) cate_scores_end = paddle.zeros(shape=[1], dtype='float32') cate_scores = paddle.concat([cate_scores, cate_scores_end]) seg_masks = paddle.gather(seg_masks, index=keep_other) seg_preds = paddle.gather(seg_preds, index=keep_other) sum_masks = paddle.gather(sum_masks, index=keep_other) cate_labels = paddle.gather(cate_labels, index=keep_other) cate_scores = paddle.gather(cate_scores, index=keep_scores) # mask scoring. seg_mul = paddle.cast(seg_preds * seg_masks, 'float32') seg_scores = paddle.sum(seg_mul, axis=[1, 2]) / sum_masks cate_scores *= seg_scores # Matrix NMS seg_preds, cate_scores, cate_labels = self.mask_nms( seg_preds, seg_masks, cate_labels, cate_scores, sum_masks=sum_masks) ori_shape = im_shape[:2] / scale_factor + 0.5 ori_shape = paddle.cast(ori_shape, 'int32') seg_preds = F.interpolate( paddle.unsqueeze(seg_preds, 0), size=upsampled_size_out, mode='bilinear', align_corners=False, align_mode=0) seg_preds = paddle.slice( seg_preds, axes=[2, 3], starts=[0, 0], ends=[h, w]) seg_masks = paddle.squeeze( F.interpolate( seg_preds, size=ori_shape[:2], mode='bilinear', align_corners=False, align_mode=0), axis=[0]) seg_masks = paddle.cast(seg_masks > self.mask_threshold, 'uint8') return seg_masks, cate_labels, cate_scores ================================================ FILE: ppdet/modeling/heads/sparse_roi_head.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
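# Illustrative sketch (not part of the original source): the heads in
# this file build on "dynamic instance interaction" - each proposal
# embedding is mapped to the weights of two per-instance linear layers,
# which are applied to that instance's RoI features with batched matmul
# (see DynamicConv below). A minimal version of the parameter-generation
# step with the default sizes in=256, feat=64; the `_demo_*` helper is
# hypothetical:
def _demo_dynamic_params(num_inst=100, in_ch=256, feat_ch=64):
    import paddle
    proposal_feats = paddle.rand([num_inst, in_ch])
    gen = paddle.nn.Linear(in_ch, in_ch * feat_ch + feat_ch * in_ch)
    params = gen(proposal_feats)
    param_in = params[:, :in_ch * feat_ch].reshape([-1, in_ch, feat_ch])
    param_out = params[:, in_ch * feat_ch:].reshape([-1, feat_ch, in_ch])
    roi_feats = paddle.rand([num_inst, 49, in_ch])  # 7x7 RoI, channels last
    x = paddle.bmm(roi_feats, param_in)             # -> [num_inst, 49, 64]
    return paddle.bmm(x, param_out)                 # -> [num_inst, 49, 256]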
# This code is referenced from: https://github.com/open-mmlab/mmdetection from __future__ import absolute_import from __future__ import division from __future__ import print_function import copy import paddle from paddle import nn from ppdet.core.workspace import register from ppdet.modeling import initializer as init from .roi_extractor import RoIAlign from ..bbox_utils import delta2bbox_v2 from ..cls_utils import _get_class_default_kwargs from ..layers import MultiHeadAttention __all__ = ['SparseRoIHead', 'DIIHead', 'DynamicMaskHead'] class DynamicConv(nn.Layer): def __init__(self, in_channels=256, feature_channels=64, out_channels=None, roi_resolution=7, with_proj=True): super(DynamicConv, self).__init__() self.in_channels = in_channels self.feature_channels = feature_channels self.out_channels = out_channels if out_channels else in_channels self.num_params_in = self.in_channels * self.feature_channels self.num_params_out = self.out_channels * self.feature_channels self.dynamic_layer = nn.Linear(self.in_channels, self.num_params_in + self.num_params_out) self.norm_in = nn.LayerNorm(self.feature_channels) self.norm_out = nn.LayerNorm(self.out_channels) self.activation = nn.ReLU() self.with_proj = with_proj if self.with_proj: num_output = self.out_channels * roi_resolution**2 self.fc_layer = nn.Linear(num_output, self.out_channels) self.fc_norm = nn.LayerNorm(self.out_channels) def forward(self, param_feature, input_feature): input_feature = input_feature.flatten(2).transpose([2, 0, 1]) input_feature = input_feature.transpose([1, 0, 2]) parameters = self.dynamic_layer(param_feature) param_in = parameters[:, :self.num_params_in].reshape( [-1, self.in_channels, self.feature_channels]) param_out = parameters[:, -self.num_params_out:].reshape( [-1, self.feature_channels, self.out_channels]) features = paddle.bmm(input_feature, param_in) features = self.norm_in(features) features = self.activation(features) features = paddle.bmm(features, param_out) features = self.norm_out(features) features = self.activation(features) if self.with_proj: features = features.flatten(1) features = self.fc_layer(features) features = self.fc_norm(features) features = self.activation(features) return features class FFN(nn.Layer): def __init__(self, embed_dims=256, feedforward_channels=2048, num_fcs=2, ffn_drop=0.0, add_identity=True): super(FFN, self).__init__() layers = [] in_channels = embed_dims for _ in range(num_fcs - 1): layers.append( nn.Sequential( nn.Linear(in_channels, feedforward_channels), nn.ReLU(), nn.Dropout(ffn_drop))) in_channels = feedforward_channels layers.append(nn.Linear(feedforward_channels, embed_dims)) layers.append(nn.Dropout(ffn_drop)) self.layers = nn.Sequential(*layers) self.add_identity = add_identity def forward(self, x): identity = x out = self.layers(x) if not self.add_identity: return out else: return out + identity @register class DynamicMaskHead(nn.Layer): __shared__ = ['num_classes', 'proposal_embedding_dim', 'norm_type'] def __init__(self, num_classes=80, proposal_embedding_dim=256, dynamic_feature_channels=64, roi_resolution=14, num_convs=4, conv_kernel_size=3, conv_channels=256, upsample_method='deconv', upsample_scale_factor=2, norm_type='bn'): super(DynamicMaskHead, self).__init__() self.d_model = proposal_embedding_dim self.instance_interactive_conv = DynamicConv( self.d_model, dynamic_feature_channels, roi_resolution=roi_resolution, with_proj=False) self.convs = nn.LayerList() for i in range(num_convs): self.convs.append( nn.Sequential( nn.Conv2D( self.d_model if i == 0 
else conv_channels, conv_channels, conv_kernel_size, padding='same', bias_attr=False), nn.BatchNorm2D(conv_channels), nn.ReLU())) if norm_type == 'sync_bn': self.convs = nn.SyncBatchNorm.convert_sync_batchnorm(self.convs) self.upsample_method = upsample_method if upsample_method is None: self.upsample = None elif upsample_method == 'deconv': self.upsample = nn.Conv2DTranspose( conv_channels if num_convs > 0 else self.d_model, conv_channels, upsample_scale_factor, stride=upsample_scale_factor) self.relu = nn.ReLU() else: self.upsample = nn.Upsample(None, upsample_scale_factor) cls_in_channels = conv_channels if num_convs > 0 else self.d_model cls_in_channels = conv_channels if upsample_method == 'deconv' else cls_in_channels self.conv_cls = nn.Conv2D(cls_in_channels, num_classes, 1) self._init_weights() def _init_weights(self): for p in self.parameters(): if p.dim() > 1: init.xavier_uniform_(p) init.constant_(self.conv_cls.bias, 0.) def forward(self, roi_features, attn_features): attn_features = attn_features.reshape([-1, self.d_model]) attn_features_iic = self.instance_interactive_conv(attn_features, roi_features) x = attn_features_iic.transpose([0, 2, 1]).reshape(roi_features.shape) for conv in self.convs: x = conv(x) if self.upsample is not None: x = self.upsample(x) if self.upsample_method == 'deconv': x = self.relu(x) mask_pred = self.conv_cls(x) return mask_pred @register class DIIHead(nn.Layer): __shared__ = ['num_classes', 'proposal_embedding_dim'] def __init__(self, num_classes=80, proposal_embedding_dim=256, feedforward_channels=2048, dynamic_feature_channels=64, roi_resolution=7, num_attn_heads=8, dropout=0.0, num_ffn_fcs=2, num_cls_fcs=1, num_reg_fcs=3): super(DIIHead, self).__init__() self.num_classes = num_classes self.d_model = proposal_embedding_dim self.attention = MultiHeadAttention(self.d_model, num_attn_heads, dropout) self.attention_norm = nn.LayerNorm(self.d_model) self.instance_interactive_conv = DynamicConv( self.d_model, dynamic_feature_channels, roi_resolution=roi_resolution, with_proj=True) self.instance_interactive_conv_dropout = nn.Dropout(dropout) self.instance_interactive_conv_norm = nn.LayerNorm(self.d_model) self.ffn = FFN(self.d_model, feedforward_channels, num_ffn_fcs, dropout) self.ffn_norm = nn.LayerNorm(self.d_model) self.cls_fcs = nn.LayerList() for _ in range(num_cls_fcs): self.cls_fcs.append( nn.Linear( self.d_model, self.d_model, bias_attr=False)) self.cls_fcs.append(nn.LayerNorm(self.d_model)) self.cls_fcs.append(nn.ReLU()) self.fc_cls = nn.Linear(self.d_model, self.num_classes) self.reg_fcs = nn.LayerList() for _ in range(num_reg_fcs): self.reg_fcs.append( nn.Linear( self.d_model, self.d_model, bias_attr=False)) self.reg_fcs.append(nn.LayerNorm(self.d_model)) self.reg_fcs.append(nn.ReLU()) self.fc_reg = nn.Linear(self.d_model, 4) self._init_weights() def _init_weights(self): for p in self.parameters(): if p.dim() > 1: init.xavier_uniform_(p) bias_init = init.bias_init_with_prob(0.01) init.constant_(self.fc_cls.bias, bias_init) def forward(self, roi_features, proposal_features): N, num_proposals = proposal_features.shape[:2] proposal_features = proposal_features + self.attention( proposal_features) attn_features = self.attention_norm(proposal_features) proposal_features = attn_features.reshape([-1, self.d_model]) proposal_features_iic = self.instance_interactive_conv( proposal_features, roi_features) proposal_features = proposal_features + self.instance_interactive_conv_dropout( proposal_features_iic) obj_features = 
self.instance_interactive_conv_norm(proposal_features) obj_features = self.ffn(obj_features) obj_features = self.ffn_norm(obj_features) cls_feature = obj_features.clone() reg_feature = obj_features.clone() for cls_layer in self.cls_fcs: cls_feature = cls_layer(cls_feature) class_logits = self.fc_cls(cls_feature) for reg_layer in self.reg_fcs: reg_feature = reg_layer(reg_feature) bbox_deltas = self.fc_reg(reg_feature) class_logits = class_logits.reshape( [N, num_proposals, self.num_classes]) bbox_deltas = bbox_deltas.reshape([N, num_proposals, 4]) obj_features = obj_features.reshape([N, num_proposals, self.d_model]) return class_logits, bbox_deltas, obj_features, attn_features @staticmethod def refine_bboxes(proposal_bboxes, bbox_deltas): pred_bboxes = delta2bbox_v2( bbox_deltas.reshape([-1, 4]), proposal_bboxes.reshape([-1, 4]), delta_mean=[0.0, 0.0, 0.0, 0.0], delta_std=[0.5, 0.5, 1.0, 1.0], ctr_clip=None) return pred_bboxes.reshape(proposal_bboxes.shape) @register class SparseRoIHead(nn.Layer): __inject__ = ['bbox_head', 'mask_head', 'loss_func'] def __init__(self, num_stages=6, bbox_roi_extractor=_get_class_default_kwargs(RoIAlign), mask_roi_extractor=_get_class_default_kwargs(RoIAlign), bbox_head='DIIHead', mask_head='DynamicMaskHead', loss_func='QueryInstLoss'): super(SparseRoIHead, self).__init__() self.num_stages = num_stages self.bbox_roi_extractor = bbox_roi_extractor self.mask_roi_extractor = mask_roi_extractor if isinstance(bbox_roi_extractor, dict): self.bbox_roi_extractor = RoIAlign(**bbox_roi_extractor) if isinstance(mask_roi_extractor, dict): self.mask_roi_extractor = RoIAlign(**mask_roi_extractor) self.bbox_heads = nn.LayerList( [copy.deepcopy(bbox_head) for _ in range(num_stages)]) self.mask_heads = nn.LayerList( [copy.deepcopy(mask_head) for _ in range(num_stages)]) self.loss_helper = loss_func @classmethod def from_config(cls, cfg, input_shape): bbox_roi_extractor = cfg['bbox_roi_extractor'] mask_roi_extractor = cfg['mask_roi_extractor'] assert isinstance(bbox_roi_extractor, dict) assert isinstance(mask_roi_extractor, dict) kwargs = RoIAlign.from_config(cfg, input_shape) bbox_roi_extractor.update(kwargs) mask_roi_extractor.update(kwargs) return { 'bbox_roi_extractor': bbox_roi_extractor, 'mask_roi_extractor': mask_roi_extractor } @staticmethod def get_roi_features(features, bboxes, roi_extractor): rois_list = [ bboxes[i] for i in range(len(bboxes)) if len(bboxes[i]) > 0 ] rois_num = paddle.to_tensor( [len(bboxes[i]) for i in range(len(bboxes))], dtype='int32') pos_ids = paddle.cast(rois_num, dtype='bool') if pos_ids.sum() != len(rois_num): rois_num = rois_num[pos_ids] features = [features[i][pos_ids] for i in range(len(features))] return roi_extractor(features, rois_list, rois_num) def _forward_train(self, body_feats, pro_bboxes, pro_feats, targets): all_stage_losses = {} for stage in range(self.num_stages): bbox_head = self.bbox_heads[stage] mask_head = self.mask_heads[stage] roi_feats = self.get_roi_features(body_feats, pro_bboxes, self.bbox_roi_extractor) class_logits, bbox_deltas, pro_feats, attn_feats = bbox_head( roi_feats, pro_feats) bbox_pred = self.bbox_heads[stage].refine_bboxes(pro_bboxes, bbox_deltas) indices = self.loss_helper.matcher({ 'pred_logits': class_logits.detach(), 'pred_boxes': bbox_pred.detach() }, targets) avg_factor = paddle.to_tensor( [sum(len(tgt['labels']) for tgt in targets)], dtype='float32') if paddle.distributed.get_world_size() > 1: paddle.distributed.all_reduce(avg_factor) avg_factor /= paddle.distributed.get_world_size() avg_factor = 
paddle.clip(avg_factor, min=1.) loss_classes = self.loss_helper.loss_classes(class_logits, targets, indices, avg_factor) if sum(len(v['labels']) for v in targets) == 0: loss_bboxes = { 'loss_bbox': paddle.to_tensor([0.]), 'loss_giou': paddle.to_tensor([0.]) } loss_masks = {'loss_mask': paddle.to_tensor([0.])} else: loss_bboxes = self.loss_helper.loss_bboxes(bbox_pred, targets, indices, avg_factor) pos_attn_feats = paddle.concat([ paddle.gather( src, src_idx, axis=0) for src, (src_idx, _) in zip(attn_feats, indices) ]) pos_bbox_pred = [ paddle.gather( src, src_idx, axis=0) for src, (src_idx, _) in zip(bbox_pred.detach(), indices) ] pos_roi_feats = self.get_roi_features(body_feats, pos_bbox_pred, self.mask_roi_extractor) mask_logits = mask_head(pos_roi_feats, pos_attn_feats) loss_masks = self.loss_helper.loss_masks( pos_bbox_pred, mask_logits, targets, indices, avg_factor) for loss in [loss_classes, loss_bboxes, loss_masks]: for key in loss.keys(): all_stage_losses[f'stage{stage}_{key}'] = loss[key] pro_bboxes = bbox_pred.detach() return all_stage_losses def _forward_test(self, body_feats, pro_bboxes, pro_feats): for stage in range(self.num_stages): roi_feats = self.get_roi_features(body_feats, pro_bboxes, self.bbox_roi_extractor) class_logits, bbox_deltas, pro_feats, attn_feats = self.bbox_heads[ stage](roi_feats, pro_feats) bbox_pred = self.bbox_heads[stage].refine_bboxes(pro_bboxes, bbox_deltas) pro_bboxes = bbox_pred.detach() roi_feats = self.get_roi_features(body_feats, bbox_pred, self.mask_roi_extractor) mask_logits = self.mask_heads[stage](roi_feats, attn_feats) return { 'class_logits': class_logits, 'bbox_pred': bbox_pred, 'mask_logits': mask_logits } def forward(self, body_features, proposal_bboxes, proposal_features, targets=None): if self.training: return self._forward_train(body_features, proposal_bboxes, proposal_features, targets) else: return self._forward_test(body_features, proposal_bboxes, proposal_features) ================================================ FILE: ppdet/modeling/heads/sparsercnn_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/PeizeSun/SparseR-CNN/blob/main/projects/SparseRCNN/sparsercnn/head.py Ths copyright of PeizeSun/SparseR-CNN is as follows: MIT License [see LICENSE for details] """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import copy import paddle import paddle.nn as nn from ppdet.core.workspace import register from ppdet.modeling.heads.roi_extractor import RoIAlign from ppdet.modeling.bbox_utils import delta2bbox from .. import initializer as init _DEFAULT_SCALE_CLAMP = math.log(100000. 
/ 16) class DynamicConv(nn.Layer): def __init__( self, head_hidden_dim, head_dim_dynamic, head_num_dynamic, ): super().__init__() self.hidden_dim = head_hidden_dim self.dim_dynamic = head_dim_dynamic self.num_dynamic = head_num_dynamic self.num_params = self.hidden_dim * self.dim_dynamic self.dynamic_layer = nn.Linear(self.hidden_dim, self.num_dynamic * self.num_params) self.norm1 = nn.LayerNorm(self.dim_dynamic) self.norm2 = nn.LayerNorm(self.hidden_dim) self.activation = nn.ReLU() pooler_resolution = 7 num_output = self.hidden_dim * pooler_resolution**2 self.out_layer = nn.Linear(num_output, self.hidden_dim) self.norm3 = nn.LayerNorm(self.hidden_dim) def forward(self, pro_features, roi_features): ''' pro_features: (1, N * nr_boxes, self.d_model) roi_features: (49, N * nr_boxes, self.d_model) ''' features = roi_features.transpose(perm=[1, 0, 2]) parameters = self.dynamic_layer(pro_features).transpose(perm=[1, 0, 2]) param1 = parameters[:, :, :self.num_params].reshape( [-1, self.hidden_dim, self.dim_dynamic]) param2 = parameters[:, :, self.num_params:].reshape( [-1, self.dim_dynamic, self.hidden_dim]) features = paddle.bmm(features, param1) features = self.norm1(features) features = self.activation(features) features = paddle.bmm(features, param2) features = self.norm2(features) features = self.activation(features) features = features.flatten(1) features = self.out_layer(features) features = self.norm3(features) features = self.activation(features) return features class RCNNHead(nn.Layer): def __init__( self, d_model, num_classes, dim_feedforward, nhead, dropout, head_cls, head_reg, head_dim_dynamic, head_num_dynamic, scale_clamp: float=_DEFAULT_SCALE_CLAMP, bbox_weights=(2.0, 2.0, 1.0, 1.0), ): super().__init__() self.d_model = d_model # dynamic. self.self_attn = nn.MultiHeadAttention(d_model, nhead, dropout=dropout) self.inst_interact = DynamicConv(d_model, head_dim_dynamic, head_num_dynamic) self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.norm3 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) self.dropout3 = nn.Dropout(dropout) self.activation = nn.ReLU() # cls. num_cls = head_cls cls_module = list() for _ in range(num_cls): cls_module.append(nn.Linear(d_model, d_model, bias_attr=False)) cls_module.append(nn.LayerNorm(d_model)) cls_module.append(nn.ReLU()) self.cls_module = nn.LayerList(cls_module) # reg. num_reg = head_reg reg_module = list() for _ in range(num_reg): reg_module.append(nn.Linear(d_model, d_model, bias_attr=False)) reg_module.append(nn.LayerNorm(d_model)) reg_module.append(nn.ReLU()) self.reg_module = nn.LayerList(reg_module) # pred. self.class_logits = nn.Linear(d_model, num_classes) self.bboxes_delta = nn.Linear(d_model, 4) self.scale_clamp = scale_clamp self.bbox_weights = bbox_weights def forward(self, features, bboxes, pro_features, pooler): """ :param bboxes: (N, nr_boxes, 4) :param pro_features: (N, nr_boxes, d_model) """ N, nr_boxes = bboxes.shape[:2] proposal_boxes = list() for b in range(N): proposal_boxes.append(bboxes[b]) roi_num = paddle.full([N], nr_boxes).astype("int32") roi_features = pooler(features, proposal_boxes, roi_num) roi_features = roi_features.reshape( [N * nr_boxes, self.d_model, -1]).transpose(perm=[2, 0, 1]) # self_att. 
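# Illustrative sketch (not part of the original source): the "# self_att."
# step below lets the nr_boxes proposal embeddings of one image attend to
# each other before they interact with their RoI features; it is plain
# multi-head self-attention over a [N, nr_boxes, d_model] tensor, followed
# in the original by dropout, a residual add, and LayerNorm. The `_demo_*`
# helper is hypothetical:
def _demo_proposal_self_attention(N=2, nr_boxes=100, d_model=256):
    import paddle
    import paddle.nn as nn
    pro_features = paddle.rand([N, nr_boxes, d_model])
    self_attn = nn.MultiHeadAttention(d_model, num_heads=8, dropout=0.0)
    attended = self_attn(pro_features, pro_features, pro_features)
    return pro_features + attended  # residual connection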
pro_features = pro_features.reshape([N, nr_boxes, self.d_model]) pro_features2 = self.self_attn( pro_features, pro_features, value=pro_features) pro_features = pro_features.transpose(perm=[1, 0, 2]) + self.dropout1( pro_features2.transpose(perm=[1, 0, 2])) pro_features = self.norm1(pro_features) # inst_interact. pro_features = pro_features.reshape( [nr_boxes, N, self.d_model]).transpose(perm=[1, 0, 2]).reshape( [1, N * nr_boxes, self.d_model]) pro_features2 = self.inst_interact(pro_features, roi_features) pro_features = pro_features + self.dropout2(pro_features2) obj_features = self.norm2(pro_features) # obj_feature. obj_features2 = self.linear2( self.dropout(self.activation(self.linear1(obj_features)))) obj_features = obj_features + self.dropout3(obj_features2) obj_features = self.norm3(obj_features) fc_feature = obj_features.transpose(perm=[1, 0, 2]).reshape( [N * nr_boxes, -1]) cls_feature = fc_feature.clone() reg_feature = fc_feature.clone() for cls_layer in self.cls_module: cls_feature = cls_layer(cls_feature) for reg_layer in self.reg_module: reg_feature = reg_layer(reg_feature) class_logits = self.class_logits(cls_feature) bboxes_deltas = self.bboxes_delta(reg_feature) pred_bboxes = delta2bbox(bboxes_deltas, bboxes.reshape([-1, 4]), self.bbox_weights) return class_logits.reshape([N, nr_boxes, -1]), pred_bboxes.reshape( [N, nr_boxes, -1]), obj_features @register class SparseRCNNHead(nn.Layer): ''' SparseRCNNHead Args: roi_input_shape (list[ShapeSpec]): The output shape of fpn num_classes (int): Number of classes, head_hidden_dim (int): The embedding dim of MultiHeadAttention, head_dim_feedforward (int): The feedforward dim of each RCNNHead, nhead (int): The number of heads in MultiHeadAttention, head_dropout (float): The dropout probability, head_cls (int): The number of layers in the classification branch, head_reg (int): The number of layers in the regression branch, head_dim_dynamic (int): The hidden dim of DynamicConv, head_num_dynamic (int): The number of dynamic parameter groups in DynamicConv, head_num_heads (int): The number of stacked RCNNHead stages, deep_supervision (bool): whether to supervise the intermediate results, num_proposals (int): the number of proposal boxes and features ''' __inject__ = ['loss_func'] __shared__ = ['num_classes'] def __init__( self, head_hidden_dim, head_dim_feedforward, nhead, head_dropout, head_cls, head_reg, head_dim_dynamic, head_num_dynamic, head_num_heads, deep_supervision, num_proposals, num_classes=80, loss_func="SparseRCNNLoss", roi_input_shape=None, ): super().__init__() assert head_num_heads > 0, \ f'At least one RoI head is required, but got {head_num_heads}.' # Build RoI. box_pooler = self._init_box_pooler(roi_input_shape) self.box_pooler = box_pooler # Build heads. rcnn_head = RCNNHead( head_hidden_dim, num_classes, head_dim_feedforward, nhead, head_dropout, head_cls, head_reg, head_dim_dynamic, head_num_dynamic, ) self.head_series = nn.LayerList( [copy.deepcopy(rcnn_head) for i in range(head_num_heads)]) self.return_intermediate = deep_supervision self.num_classes = num_classes # build init proposal self.init_proposal_features = nn.Embedding(num_proposals, head_hidden_dim) self.init_proposal_boxes = nn.Embedding(num_proposals, 4) self.lossfunc = loss_func # Init parameters. init.reset_initialized_parameter(self) self._reset_parameters() def _reset_parameters(self): # init all parameters.
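# Classification biases are set to -log((1 - prior_prob) / prior_prob)
# below, so each class logit starts with sigmoid(bias) = prior_prob
# (0.01): the focal-loss-style initialization that keeps early training
# stable by biasing predictions toward background.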
prior_prob = 0.01 bias_value = -math.log((1 - prior_prob) / prior_prob) for m in self.sublayers(): if isinstance(m, nn.Linear): init.xavier_normal_(m.weight, reverse=True) elif not isinstance(m, nn.Embedding) and hasattr( m, "weight") and m.weight.dim() > 1: init.xavier_normal_(m.weight, reverse=False) if hasattr(m, "bias") and m.bias is not None and m.bias.shape[ -1] == self.num_classes: init.constant_(m.bias, bias_value) init_bboxes = paddle.empty_like(self.init_proposal_boxes.weight) init_bboxes[:, :2] = 0.5 init_bboxes[:, 2:] = 1.0 self.init_proposal_boxes.weight.set_value(init_bboxes) @staticmethod def _init_box_pooler(input_shape): pooler_resolution = 7 sampling_ratio = 2 if input_shape is not None: pooler_scales = tuple(1.0 / input_shape[k].stride for k in range(len(input_shape))) in_channels = [ input_shape[f].channels for f in range(len(input_shape)) ] end_level = len(input_shape) - 1 # Check all channel counts are equal assert len(set(in_channels)) == 1, in_channels else: pooler_scales = [1.0 / 4.0, 1.0 / 8.0, 1.0 / 16.0, 1.0 / 32.0] end_level = 3 aligned = True if paddle.device.is_compiled_with_custom_device('npu'): aligned = False box_pooler = RoIAlign( resolution=pooler_resolution, spatial_scale=pooler_scales, sampling_ratio=sampling_ratio, end_level=end_level, aligned=aligned) return box_pooler def forward(self, features, input_whwh): bs = len(features[0]) bboxes = box_cxcywh_to_xyxy(self.init_proposal_boxes.weight.clone( )).unsqueeze(0) bboxes = bboxes * input_whwh.unsqueeze(-2) init_features = self.init_proposal_features.weight.unsqueeze(0).tile( [1, bs, 1]) proposal_features = init_features.clone() inter_class_logits = [] inter_pred_bboxes = [] for stage, rcnn_head in enumerate(self.head_series): class_logits, pred_bboxes, proposal_features = rcnn_head( features, bboxes, proposal_features, self.box_pooler) if self.return_intermediate or stage == len(self.head_series) - 1: inter_class_logits.append(class_logits) inter_pred_bboxes.append(pred_bboxes) bboxes = pred_bboxes.detach() output = { 'pred_logits': inter_class_logits[-1], 'pred_boxes': inter_pred_bboxes[-1] } if self.return_intermediate: output['aux_outputs'] = [{ 'pred_logits': a, 'pred_boxes': b } for a, b in zip(inter_class_logits[:-1], inter_pred_bboxes[:-1])] return output def get_loss(self, outputs, targets): losses = self.lossfunc(outputs, targets) weight_dict = self.lossfunc.weight_dict for k in losses.keys(): if k in weight_dict: losses[k] *= weight_dict[k] return losses def box_cxcywh_to_xyxy(x): x_c, y_c, w, h = x.unbind(-1) b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] return paddle.stack(b, axis=-1) ================================================ FILE: ppdet/modeling/heads/ssd_head.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
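# SepConvLayer below factors a kxk convolution into a depthwise conv
# (groups=C_in) followed by a 1x1 pointwise conv, cutting parameters from
# about C_in*C_out*k^2 to C_in*k^2 + C_in*C_out; SSDHead uses it for its
# box/score prediction branches when use_sepconv=True.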
import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from paddle.regularizer import L2Decay from paddle import ParamAttr from ..layers import AnchorGeneratorSSD from ..cls_utils import _get_class_default_kwargs class SepConvLayer(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size=3, padding=1, conv_decay=0.): super(SepConvLayer, self).__init__() self.dw_conv = nn.Conv2D( in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, stride=1, padding=padding, groups=in_channels, weight_attr=ParamAttr(regularizer=L2Decay(conv_decay)), bias_attr=False) self.bn = nn.BatchNorm2D( in_channels, weight_attr=ParamAttr(regularizer=L2Decay(0.)), bias_attr=ParamAttr(regularizer=L2Decay(0.))) self.pw_conv = nn.Conv2D( in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0, weight_attr=ParamAttr(regularizer=L2Decay(conv_decay)), bias_attr=False) def forward(self, x): x = self.dw_conv(x) x = F.relu6(self.bn(x)) x = self.pw_conv(x) return x class SSDExtraHead(nn.Layer): def __init__(self, in_channels=256, out_channels=([256, 512], [256, 512], [128, 256], [128, 256], [128, 256]), strides=(2, 2, 2, 1, 1), paddings=(1, 1, 1, 0, 0)): super(SSDExtraHead, self).__init__() self.convs = nn.LayerList() for out_channel, stride, padding in zip(out_channels, strides, paddings): self.convs.append( self._make_layers(in_channels, out_channel[0], out_channel[1], stride, padding)) in_channels = out_channel[-1] def _make_layers(self, c_in, c_hidden, c_out, stride_3x3, padding_3x3): return nn.Sequential( nn.Conv2D(c_in, c_hidden, 1), nn.ReLU(), nn.Conv2D(c_hidden, c_out, 3, stride_3x3, padding_3x3), nn.ReLU()) def forward(self, x): out = [x] for conv_layer in self.convs: out.append(conv_layer(out[-1])) return out @register class SSDHead(nn.Layer): """ SSDHead Args: num_classes (int): Number of classes in_channels (list): Number of channels per input feature anchor_generator (dict): Configuration of 'AnchorGeneratorSSD' instance kernel_size (int): Conv kernel size padding (int): Conv padding use_sepconv (bool): Use SepConvLayer if true conv_decay (float): Conv regularization coeff loss (object): 'SSDLoss' instance use_extra_head (bool): If ResNet34 is used as the backbone, `use_extra_head` should be set to True """ __shared__ = ['num_classes'] __inject__ = ['anchor_generator', 'loss'] def __init__(self, num_classes=80, in_channels=(512, 1024, 512, 256, 256, 256), anchor_generator=_get_class_default_kwargs(AnchorGeneratorSSD), kernel_size=3, padding=1, use_sepconv=False, conv_decay=0., loss='SSDLoss', use_extra_head=False): super(SSDHead, self).__init__() # add background class self.num_classes = num_classes + 1 self.in_channels = in_channels self.anchor_generator = anchor_generator self.loss = loss self.use_extra_head = use_extra_head if self.use_extra_head: self.ssd_extra_head = SSDExtraHead() self.in_channels = [256, 512, 512, 256, 256, 256] if isinstance(anchor_generator, dict): self.anchor_generator = AnchorGeneratorSSD(**anchor_generator) self.num_priors = self.anchor_generator.num_priors self.box_convs = [] self.score_convs = [] for i, num_prior in enumerate(self.num_priors): box_conv_name = "boxes{}".format(i) if not use_sepconv: box_conv = self.add_sublayer( box_conv_name, nn.Conv2D( in_channels=self.in_channels[i], out_channels=num_prior * 4, kernel_size=kernel_size, padding=padding)) else: box_conv = self.add_sublayer( box_conv_name, SepConvLayer( in_channels=self.in_channels[i], out_channels=num_prior *
4, kernel_size=kernel_size, padding=padding, conv_decay=conv_decay)) self.box_convs.append(box_conv) score_conv_name = "scores{}".format(i) if not use_sepconv: score_conv = self.add_sublayer( score_conv_name, nn.Conv2D( in_channels=self.in_channels[i], out_channels=num_prior * self.num_classes, kernel_size=kernel_size, padding=padding)) else: score_conv = self.add_sublayer( score_conv_name, SepConvLayer( in_channels=self.in_channels[i], out_channels=num_prior * self.num_classes, kernel_size=kernel_size, padding=padding, conv_decay=conv_decay)) self.score_convs.append(score_conv) @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } def forward(self, feats, image, gt_bbox=None, gt_class=None): if self.use_extra_head: assert len(feats) == 1, \ ("If you set use_extra_head=True, backbone feature " "list length should be 1.") feats = self.ssd_extra_head(feats[0]) box_preds = [] cls_scores = [] for feat, box_conv, score_conv in zip(feats, self.box_convs, self.score_convs): box_pred = box_conv(feat) box_pred = paddle.transpose(box_pred, [0, 2, 3, 1]) box_pred = paddle.reshape(box_pred, [0, -1, 4]) box_preds.append(box_pred) cls_score = score_conv(feat) cls_score = paddle.transpose(cls_score, [0, 2, 3, 1]) cls_score = paddle.reshape(cls_score, [0, -1, self.num_classes]) cls_scores.append(cls_score) prior_boxes = self.anchor_generator(feats, image) if self.training: return self.get_loss(box_preds, cls_scores, gt_bbox, gt_class, prior_boxes) else: return (box_preds, cls_scores), prior_boxes def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes): return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes) ================================================ FILE: ppdet/modeling/heads/tood_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Constant from ppdet.core.workspace import register from ..initializer import normal_, constant_, bias_init_with_prob from ppdet.modeling.bbox_utils import bbox_center, batch_distance2bbox from ..losses import GIoULoss from ppdet.modeling.layers import ConvNormLayer from ppdet.modeling.ops import get_static_shape from ppdet.modeling.assigners.utils import generate_anchors_for_grid_cell class ScaleReg(nn.Layer): """ Parameter for scaling the regression outputs. 
""" def __init__(self, init_scale=1.): super(ScaleReg, self).__init__() self.scale_reg = self.create_parameter( shape=[1], attr=ParamAttr(initializer=Constant(value=init_scale)), dtype="float32") def forward(self, inputs): out = inputs * self.scale_reg return out class TaskDecomposition(nn.Layer): """This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/models/dense_heads/tood_head.py """ def __init__( self, feat_channels, stacked_convs, la_down_rate=8, norm_type='gn', norm_groups=32, ): super(TaskDecomposition, self).__init__() self.feat_channels = feat_channels self.stacked_convs = stacked_convs self.norm_type = norm_type self.norm_groups = norm_groups self.in_channels = self.feat_channels * self.stacked_convs self.la_conv1 = nn.Conv2D(self.in_channels, self.in_channels // la_down_rate, 1) self.la_conv2 = nn.Conv2D(self.in_channels // la_down_rate, self.stacked_convs, 1) self.reduction_conv = ConvNormLayer( self.in_channels, self.feat_channels, filter_size=1, stride=1, norm_type=self.norm_type, norm_groups=self.norm_groups) self._init_weights() def _init_weights(self): normal_(self.la_conv1.weight, std=0.001) normal_(self.la_conv2.weight, std=0.001) def forward(self, feat, avg_feat): feat_shape = get_static_shape(feat) b = feat_shape[0:1] h = feat_shape[2:3] w = feat_shape[3:4] weight = F.relu(self.la_conv1(avg_feat)) weight = F.sigmoid(self.la_conv2(weight)).unsqueeze(-1) feat = paddle.reshape( feat, [b, self.stacked_convs, self.feat_channels, h, w]) * weight feat = self.reduction_conv(feat.flatten(1, 2)) feat = F.relu(feat) return feat @register class TOODHead(nn.Layer): """This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/models/dense_heads/tood_head.py """ __inject__ = ['nms', 'static_assigner', 'assigner'] __shared__ = ['num_classes'] def __init__(self, num_classes=80, feat_channels=256, stacked_convs=6, fpn_strides=(8, 16, 32, 64, 128), grid_cell_scale=8, grid_cell_offset=0.5, norm_type='gn', norm_groups=32, static_assigner_epoch=4, use_align_head=True, loss_weight={ 'class': 1.0, 'bbox': 1.0, 'iou': 2.0, }, nms='MultiClassNMS', static_assigner='ATSSAssigner', assigner='TaskAlignedAssigner'): super(TOODHead, self).__init__() self.num_classes = num_classes self.feat_channels = feat_channels self.stacked_convs = stacked_convs self.fpn_strides = fpn_strides self.grid_cell_scale = grid_cell_scale self.grid_cell_offset = grid_cell_offset self.static_assigner_epoch = static_assigner_epoch self.use_align_head = use_align_head self.nms = nms self.static_assigner = static_assigner self.assigner = assigner self.loss_weight = loss_weight self.giou_loss = GIoULoss() self.inter_convs = nn.LayerList() for i in range(self.stacked_convs): self.inter_convs.append( ConvNormLayer( self.feat_channels, self.feat_channels, filter_size=3, stride=1, norm_type=norm_type, norm_groups=norm_groups)) self.cls_decomp = TaskDecomposition( self.feat_channels, self.stacked_convs, self.stacked_convs * 8, norm_type=norm_type, norm_groups=norm_groups) self.reg_decomp = TaskDecomposition( self.feat_channels, self.stacked_convs, self.stacked_convs * 8, norm_type=norm_type, norm_groups=norm_groups) self.tood_cls = nn.Conv2D( self.feat_channels, self.num_classes, 3, padding=1) self.tood_reg = nn.Conv2D(self.feat_channels, 4, 3, padding=1) if self.use_align_head: self.cls_prob_conv1 = nn.Conv2D(self.feat_channels * self.stacked_convs, self.feat_channels // 4, 1) self.cls_prob_conv2 = nn.Conv2D( self.feat_channels // 4, 1, 3, padding=1) self.reg_offset_conv1 = 
nn.Conv2D(self.feat_channels * self.stacked_convs, self.feat_channels // 4, 1) self.reg_offset_conv2 = nn.Conv2D( self.feat_channels // 4, 4 * 2, 3, padding=1) self.scales_regs = nn.LayerList([ScaleReg() for _ in self.fpn_strides]) self._init_weights() @classmethod def from_config(cls, cfg, input_shape): return { 'feat_channels': input_shape[0].channels, 'fpn_strides': [i.stride for i in input_shape], } def _init_weights(self): bias_cls = bias_init_with_prob(0.01) normal_(self.tood_cls.weight, std=0.01) constant_(self.tood_cls.bias, bias_cls) normal_(self.tood_reg.weight, std=0.01) if self.use_align_head: normal_(self.cls_prob_conv1.weight, std=0.01) normal_(self.cls_prob_conv2.weight, std=0.01) constant_(self.cls_prob_conv2.bias, bias_cls) normal_(self.reg_offset_conv1.weight, std=0.001) constant_(self.reg_offset_conv2.weight) constant_(self.reg_offset_conv2.bias) def _reg_grid_sample(self, feat, offset, anchor_points): feat_shape = get_static_shape(feat) b = feat_shape[0:1] h = feat_shape[2:3] w = feat_shape[3:4] feat = paddle.reshape(feat, [-1, 1, h, w]) offset = paddle.reshape(offset, [-1, 2, h, w]).transpose([0, 2, 3, 1]) grid_shape = paddle.concat([w, h]).astype('float32') grid = (offset + anchor_points) / grid_shape grid = 2 * grid.clip(0., 1.) - 1 feat = F.grid_sample(feat, grid) feat = paddle.reshape(feat, [b, -1, h, w]) return feat def forward(self, feats): assert len(feats) == len(self.fpn_strides), \ "The size of feats is not equal to size of fpn_strides" anchors, anchor_points, num_anchors_list, stride_tensor =\ generate_anchors_for_grid_cell( feats, self.fpn_strides, self.grid_cell_scale, self.grid_cell_offset) anchor_centers_split = paddle.split(anchor_points / stride_tensor, num_anchors_list) cls_score_list, bbox_pred_list = [], [] for feat, scale_reg, anchor_centers, stride in zip( feats, self.scales_regs, anchor_centers_split, self.fpn_strides): b, _, h, w = get_static_shape(feat) inter_feats = [] for inter_conv in self.inter_convs: feat = F.relu(inter_conv(feat)) inter_feats.append(feat) feat = paddle.concat(inter_feats, axis=1) # task decomposition avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) cls_feat = self.cls_decomp(feat, avg_feat) reg_feat = self.reg_decomp(feat, avg_feat) # cls prediction and alignment cls_logits = self.tood_cls(cls_feat) if self.use_align_head: cls_prob = F.relu(self.cls_prob_conv1(feat)) cls_prob = F.sigmoid(self.cls_prob_conv2(cls_prob)) cls_score = (F.sigmoid(cls_logits) * cls_prob).sqrt() else: cls_score = F.sigmoid(cls_logits) cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) # reg prediction and alignment reg_dist = scale_reg(self.tood_reg(reg_feat).exp()) reg_dist = reg_dist.flatten(2).transpose([0, 2, 1]) reg_bbox = batch_distance2bbox( anchor_centers.unsqueeze(0), reg_dist) if self.use_align_head: reg_offset = F.relu(self.reg_offset_conv1(feat)) reg_offset = self.reg_offset_conv2(reg_offset) reg_bbox = reg_bbox.transpose([0, 2, 1]).reshape([b, 4, h, w]) anchor_centers = anchor_centers.reshape([1, h, w, 2]) bbox_pred = self._reg_grid_sample(reg_bbox, reg_offset, anchor_centers) bbox_pred = bbox_pred.flatten(2).transpose([0, 2, 1]) else: bbox_pred = reg_bbox if not self.training: bbox_pred *= stride bbox_pred_list.append(bbox_pred) cls_score_list = paddle.concat(cls_score_list, axis=1) bbox_pred_list = paddle.concat(bbox_pred_list, axis=1) return cls_score_list, bbox_pred_list, anchors, num_anchors_list, stride_tensor @staticmethod def _focal_loss(score, label, alpha=0.25, gamma=2.0): weight = (score - label).pow(gamma) if 
alpha > 0: alpha_t = alpha * label + (1 - alpha) * (1 - label) weight *= alpha_t loss = F.binary_cross_entropy( score, label, weight=weight, reduction='sum') return loss def get_loss(self, head_outs, gt_meta): pred_scores, pred_bboxes, anchors, \ num_anchors_list, stride_tensor = head_outs gt_labels = gt_meta['gt_class'] gt_bboxes = gt_meta['gt_bbox'] pad_gt_mask = gt_meta['pad_gt_mask'] # label assignment if gt_meta['epoch_id'] < self.static_assigner_epoch: assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner( anchors, num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes) alpha_l = 0.25 else: assigned_labels, assigned_bboxes, assigned_scores = self.assigner( pred_scores.detach(), pred_bboxes.detach() * stride_tensor, bbox_center(anchors), num_anchors_list, gt_labels, gt_bboxes, pad_gt_mask, bg_index=self.num_classes) alpha_l = -1 # rescale bbox assigned_bboxes /= stride_tensor # classification loss loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha=alpha_l) # select positive samples mask mask_positive = (assigned_labels != self.num_classes) num_pos = mask_positive.astype(paddle.float32).sum() # bbox regression loss if num_pos > 0: bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4]) pred_bboxes_pos = paddle.masked_select(pred_bboxes, bbox_mask).reshape([-1, 4]) assigned_bboxes_pos = paddle.masked_select( assigned_bboxes, bbox_mask).reshape([-1, 4]) bbox_weight = paddle.masked_select( assigned_scores.sum(-1), mask_positive).unsqueeze(-1) # iou loss loss_iou = self.giou_loss(pred_bboxes_pos, assigned_bboxes_pos) * bbox_weight loss_iou = loss_iou.sum() / bbox_weight.sum() # l1 loss loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos) else: loss_iou = paddle.zeros([]) loss_l1 = paddle.zeros([]) loss_cls /= assigned_scores.sum().clip(min=1) loss = self.loss_weight['class'] * loss_cls + self.loss_weight[ 'iou'] * loss_iou return { 'loss': loss, 'loss_class': loss_cls, 'loss_iou': loss_iou, 'loss_l1': loss_l1 } def post_process(self, head_outs, img_shape, scale_factor): pred_scores, pred_bboxes, _, _, _ = head_outs pred_scores = pred_scores.transpose([0, 2, 1]) for i in range(len(pred_bboxes)): pred_bboxes[i, :, 0] = pred_bboxes[i, :, 0].clip( min=0, max=img_shape[i, 1]) pred_bboxes[i, :, 1] = pred_bboxes[i, :, 1].clip( min=0, max=img_shape[i, 0]) pred_bboxes[i, :, 2] = pred_bboxes[i, :, 2].clip( min=0, max=img_shape[i, 1]) pred_bboxes[i, :, 3] = pred_bboxes[i, :, 3].clip( min=0, max=img_shape[i, 0]) # scale bbox to origin scale_factor = scale_factor.flip([1]).tile([1, 2]).unsqueeze(1) pred_bboxes /= scale_factor bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) return bbox_pred, bbox_num ================================================ FILE: ppdet/modeling/heads/ttf_head.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
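# TTFNet is anchor-free: HMHead predicts a per-class center heatmap and
# WHHead predicts four non-negative distances (scaled by wh_offset_base)
# from each grid cell to the box sides; get_loss decodes boxes by
# offsetting a base grid spaced down_ratio pixels apart (base_loc).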
import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Constant, Normal from paddle.regularizer import L2Decay from ppdet.core.workspace import register from ppdet.modeling.layers import DeformableConvV2, LiteConv import numpy as np @register class HMHead(nn.Layer): """ Args: ch_in (int): The channel number of input Tensor. ch_out (int): The channel number of output Tensor. num_classes (int): Number of classes. conv_num (int): The convolution number of hm_feat. dcn_head(bool): whether use dcn in head. False by default. lite_head(bool): whether use lite version. False by default. norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. bn by default Return: Heatmap head output """ __shared__ = ['num_classes', 'norm_type'] def __init__( self, ch_in, ch_out=128, num_classes=80, conv_num=2, dcn_head=False, lite_head=False, norm_type='bn', ): super(HMHead, self).__init__() head_conv = nn.Sequential() for i in range(conv_num): name = 'conv.{}'.format(i) if lite_head: lite_name = 'hm.' + name head_conv.add_sublayer( lite_name, LiteConv( in_channels=ch_in if i == 0 else ch_out, out_channels=ch_out, norm_type=norm_type)) else: if dcn_head: head_conv.add_sublayer( name, DeformableConvV2( in_channels=ch_in if i == 0 else ch_out, out_channels=ch_out, kernel_size=3, weight_attr=ParamAttr(initializer=Normal(0, 0.01)))) else: head_conv.add_sublayer( name, nn.Conv2D( in_channels=ch_in if i == 0 else ch_out, out_channels=ch_out, kernel_size=3, padding=1, weight_attr=ParamAttr(initializer=Normal(0, 0.01)), bias_attr=ParamAttr( learning_rate=2., regularizer=L2Decay(0.)))) head_conv.add_sublayer(name + '.act', nn.ReLU()) self.feat = head_conv bias_init = float(-np.log((1 - 0.01) / 0.01)) weight_attr = None if lite_head else ParamAttr(initializer=Normal(0, 0.01)) self.head = nn.Conv2D( in_channels=ch_out, out_channels=num_classes, kernel_size=1, weight_attr=weight_attr, bias_attr=ParamAttr( learning_rate=2., regularizer=L2Decay(0.), initializer=Constant(bias_init))) def forward(self, feat): out = self.feat(feat) out = self.head(out) return out @register class WHHead(nn.Layer): """ Args: ch_in (int): The channel number of input Tensor. ch_out (int): The channel number of output Tensor. conv_num (int): The convolution number of wh_feat. dcn_head(bool): whether use dcn in head. False by default. lite_head(bool): whether use lite version. False by default. norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. bn by default Return: Width & Height head output """ __shared__ = ['norm_type'] def __init__(self, ch_in, ch_out=64, conv_num=2, dcn_head=False, lite_head=False, norm_type='bn'): super(WHHead, self).__init__() head_conv = nn.Sequential() for i in range(conv_num): name = 'conv.{}'.format(i) if lite_head: lite_name = 'wh.' 
+ name head_conv.add_sublayer( lite_name, LiteConv( in_channels=ch_in if i == 0 else ch_out, out_channels=ch_out, norm_type=norm_type)) else: if dcn_head: head_conv.add_sublayer( name, DeformableConvV2( in_channels=ch_in if i == 0 else ch_out, out_channels=ch_out, kernel_size=3, weight_attr=ParamAttr(initializer=Normal(0, 0.01)))) else: head_conv.add_sublayer( name, nn.Conv2D( in_channels=ch_in if i == 0 else ch_out, out_channels=ch_out, kernel_size=3, padding=1, weight_attr=ParamAttr(initializer=Normal(0, 0.01)), bias_attr=ParamAttr( learning_rate=2., regularizer=L2Decay(0.)))) head_conv.add_sublayer(name + '.act', nn.ReLU()) weight_attr = None if lite_head else ParamAttr(initializer=Normal(0, 0.01)) self.feat = head_conv self.head = nn.Conv2D( in_channels=ch_out, out_channels=4, kernel_size=1, weight_attr=weight_attr, bias_attr=ParamAttr( learning_rate=2., regularizer=L2Decay(0.))) def forward(self, feat): out = self.feat(feat) out = self.head(out) out = F.relu(out) return out @register class TTFHead(nn.Layer): """ TTFHead Args: in_channels (int): the channel number of input to TTFHead. num_classes (int): the number of classes, 80 by default. hm_head_planes (int): the channel number in heatmap head, 128 by default. wh_head_planes (int): the channel number in width & height head, 64 by default. hm_head_conv_num (int): the number of convolution in heatmap head, 2 by default. wh_head_conv_num (int): the number of convolution in width & height head, 2 by default. hm_loss (object): Instance of 'CTFocalLoss'. wh_loss (object): Instance of 'GIoULoss'. wh_offset_base (float): the base offset of width and height, 16.0 by default. down_ratio (int): the actual down_ratio is calculated by base_down_ratio (default 16) and the number of upsample layers. lite_head(bool): whether use lite version. False by default. norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. bn by default ags_module(bool): whether use AGS module to reweight location feature. false by default. """ __shared__ = ['num_classes', 'down_ratio', 'norm_type'] __inject__ = ['hm_loss', 'wh_loss'] def __init__(self, in_channels, num_classes=80, hm_head_planes=128, wh_head_planes=64, hm_head_conv_num=2, wh_head_conv_num=2, hm_loss='CTFocalLoss', wh_loss='GIoULoss', wh_offset_base=16., down_ratio=4, dcn_head=False, lite_head=False, norm_type='bn', ags_module=False): super(TTFHead, self).__init__() self.in_channels = in_channels self.hm_head = HMHead(in_channels, hm_head_planes, num_classes, hm_head_conv_num, dcn_head, lite_head, norm_type) self.wh_head = WHHead(in_channels, wh_head_planes, wh_head_conv_num, dcn_head, lite_head, norm_type) self.hm_loss = hm_loss self.wh_loss = wh_loss self.wh_offset_base = wh_offset_base self.down_ratio = down_ratio self.ags_module = ags_module @classmethod def from_config(cls, cfg, input_shape): if isinstance(input_shape, (list, tuple)): input_shape = input_shape[0] return {'in_channels': input_shape.channels, } def forward(self, feats): hm = self.hm_head(feats) wh = self.wh_head(feats) * self.wh_offset_base return hm, wh def filter_box_by_weight(self, pred, target, weight): """ Filter out boxes where ttf_reg_weight is 0, only keep positive samples. 
""" index = paddle.nonzero(weight > 0) index.stop_gradient = True weight = paddle.gather_nd(weight, index) pred = paddle.gather_nd(pred, index) target = paddle.gather_nd(target, index) return pred, target, weight def filter_loc_by_weight(self, score, weight): index = paddle.nonzero(weight > 0) index.stop_gradient = True score = paddle.gather_nd(score, index) return score def get_loss(self, pred_hm, pred_wh, target_hm, box_target, target_weight): pred_hm = paddle.clip(F.sigmoid(pred_hm), 1e-4, 1 - 1e-4) hm_loss = self.hm_loss(pred_hm, target_hm) H, W = target_hm.shape[2:] mask = paddle.reshape(target_weight, [-1, H, W]) avg_factor = paddle.sum(mask) + 1e-4 base_step = self.down_ratio shifts_x = paddle.arange(0, W * base_step, base_step, dtype='int32') shifts_y = paddle.arange(0, H * base_step, base_step, dtype='int32') shift_y, shift_x = paddle.tensor.meshgrid([shifts_y, shifts_x]) base_loc = paddle.stack([shift_x, shift_y], axis=0) base_loc.stop_gradient = True pred_boxes = paddle.concat( [0 - pred_wh[:, 0:2, :, :] + base_loc.astype(pred_wh.dtype), pred_wh[:, 2:4] + base_loc.astype(pred_wh.dtype)], axis=1) pred_boxes = paddle.transpose(pred_boxes, [0, 2, 3, 1]) boxes = paddle.transpose(box_target, [0, 2, 3, 1]) boxes.stop_gradient = True if self.ags_module: pred_hm_max = paddle.max(pred_hm, axis=1, keepdim=True) pred_hm_max_softmax = F.softmax(pred_hm_max, axis=1) pred_hm_max_softmax = paddle.transpose(pred_hm_max_softmax, [0, 2, 3, 1]) pred_hm_max_softmax = self.filter_loc_by_weight(pred_hm_max_softmax, mask) else: pred_hm_max_softmax = None pred_boxes, boxes, mask = self.filter_box_by_weight(pred_boxes, boxes, mask) mask.stop_gradient = True wh_loss = self.wh_loss( pred_boxes, boxes, iou_weight=mask.unsqueeze(1), loc_reweight=pred_hm_max_softmax) wh_loss = wh_loss / avg_factor ttf_loss = {'hm_loss': hm_loss, 'wh_loss': wh_loss} return ttf_loss ================================================ FILE: ppdet/modeling/heads/vitpose_head.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling.keypoint_utils import resize, flip_back from paddle.nn.initializer import TruncatedNormal, Constant, Normal from ppdet.modeling.layers import ConvTranspose2d, BatchNorm2d trunc_normal_ = TruncatedNormal(std=.02) normal_ = Normal(std=0.001) zeros_ = Constant(value=0.) ones_ = Constant(value=1.) 
__all__ = ['TopdownHeatmapSimpleHead'] @register class TopdownHeatmapSimpleHead(nn.Layer): def __init__(self, in_channels=768, out_channels=17, num_deconv_layers=3, num_deconv_filters=(256, 256, 256), num_deconv_kernels=(4, 4, 4), extra=None, in_index=0, input_transform=None, align_corners=False, upsample=0, flip_pairs=None, shift_heatmap=False, target_type='GaussianHeatmap'): super(TopdownHeatmapSimpleHead, self).__init__() self.in_channels = in_channels self.upsample = upsample self.flip_pairs = flip_pairs self.shift_heatmap = shift_heatmap self.target_type = target_type self._init_inputs(in_channels, in_index, input_transform) self.in_index = in_index self.align_corners = align_corners if extra is not None and not isinstance(extra, dict): raise TypeError('extra should be dict or None.') if num_deconv_layers > 0: self.deconv_layers = self._make_deconv_layer( num_deconv_layers, num_deconv_filters, num_deconv_kernels, ) elif num_deconv_layers == 0: self.deconv_layers = nn.Identity() else: raise ValueError( f'num_deconv_layers ({num_deconv_layers}) should >= 0.') identity_final_layer = False if extra is not None and 'final_conv_kernel' in extra: assert extra['final_conv_kernel'] in [0, 1, 3] if extra['final_conv_kernel'] == 3: padding = 1 elif extra['final_conv_kernel'] == 1: padding = 0 else: # 0 for Identity mapping. identity_final_layer = True kernel_size = extra['final_conv_kernel'] else: kernel_size = 1 padding = 0 if identity_final_layer: self.final_layer = nn.Identity() else: conv_channels = num_deconv_filters[ -1] if num_deconv_layers > 0 else self.in_channels layers = [] if extra is not None: num_conv_layers = extra.get('num_conv_layers', 0) num_conv_kernels = extra.get('num_conv_kernels', [1] * num_conv_layers) for i in range(num_conv_layers): layers.append( nn.Conv2D( in_channels=conv_channels, out_channels=conv_channels, kernel_size=num_conv_kernels[i], stride=1, padding=(num_conv_kernels[i] - 1) // 2)) layers.append(nn.BatchNorm2D(conv_channels)) layers.append(nn.ReLU()) layers.append( nn.Conv2D( in_channels=conv_channels, out_channels=out_channels, kernel_size=kernel_size, stride=1, padding=(padding, padding))) if len(layers) > 1: self.final_layer = nn.Sequential(*layers) else: self.final_layer = layers[0] self.init_weights() @staticmethod def _get_deconv_cfg(deconv_kernel): """Get configurations for deconv layers.""" if deconv_kernel == 4: padding = 1 output_padding = 0 elif deconv_kernel == 3: padding = 1 output_padding = 1 elif deconv_kernel == 2: padding = 0 output_padding = 0 else: raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') return deconv_kernel, padding, output_padding def _init_inputs(self, in_channels, in_index, input_transform): """Check and initialize input transforms. """ if input_transform is not None: assert input_transform in ['resize_concat', 'multiple_select'] self.input_transform = input_transform self.in_index = in_index if input_transform is not None: assert isinstance(in_channels, (list, tuple)) assert isinstance(in_index, (list, tuple)) assert len(in_channels) == len(in_index) if input_transform == 'resize_concat': self.in_channels = sum(in_channels) else: self.in_channels = in_channels else: assert isinstance(in_channels, int) assert isinstance(in_index, int) self.in_channels = in_channels def _transform_inputs(self, inputs): """Transform inputs for decoder. 
""" if not isinstance(inputs, list): if not isinstance(inputs, list): if self.upsample > 0: inputs = resize( input=F.relu(inputs), scale_factor=self.upsample, mode='bilinear', align_corners=self.align_corners) return inputs if self.input_transform == 'resize_concat': inputs = [inputs[i] for i in self.in_index] upsampled_inputs = [ resize( input=x, size=inputs[0].shape[2:], mode='bilinear', align_corners=self.align_corners) for x in inputs ] inputs = paddle.concat(upsampled_inputs, dim=1) elif self.input_transform == 'multiple_select': inputs = [inputs[i] for i in self.in_index] else: inputs = inputs[self.in_index] return inputs def forward(self, x): """Forward function.""" x = self._transform_inputs(x) x = self.deconv_layers(x) x = self.final_layer(x) return x def inference_model(self, x, flip_pairs=None): """Inference function. Returns: output_heatmap (np.ndarray): Output heatmaps. Args: x (torch.Tensor[N,K,H,W]): Input features. flip_pairs (None | list[tuple]): Pairs of keypoints which are mirrored. """ output = self.forward(x) if flip_pairs is not None: output_heatmap = flip_back( output, self.flip_pairs, target_type=self.target_type) # feature is not aligned, shift flipped heatmap for higher accuracy if self.shift_heatmap: output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] else: output_heatmap = output return output_heatmap def _make_deconv_layer(self, num_layers, num_filters, num_kernels): """Make deconv layers.""" if num_layers != len(num_filters): error_msg = f'num_layers({num_layers}) ' \ f'!= length of num_filters({len(num_filters)})' raise ValueError(error_msg) if num_layers != len(num_kernels): error_msg = f'num_layers({num_layers}) ' \ f'!= length of num_kernels({len(num_kernels)})' raise ValueError(error_msg) layers = [] for i in range(num_layers): kernel, padding, output_padding = \ self._get_deconv_cfg(num_kernels[i]) planes = num_filters[i] layers.append( ConvTranspose2d( in_channels=self.in_channels, out_channels=planes, kernel_size=kernel, stride=2, padding=padding, output_padding=output_padding, bias=False)) layers.append(nn.BatchNorm2D(planes)) layers.append(nn.ReLU()) self.in_channels = planes return nn.Sequential(*layers) def init_weights(self): """Initialize model weights.""" if not isinstance(self.deconv_layers, nn.Identity): for m in self.deconv_layers: if isinstance(m, nn.BatchNorm2D): ones_(m.weight) ones_(m.bias) if not isinstance(self.final_layer, nn.Conv2D): for m in self.final_layer: if isinstance(m, nn.Conv2D): normal_(m.weight) zeros_(m.bias) elif isinstance(m, nn.BatchNorm2D): ones_(m.weight) ones_(m.bias) else: normal_(self.final_layer.weight) zeros_(self.final_layer.bias) ================================================ FILE: ppdet/modeling/heads/yolo_head.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register import math import numpy as np from ..initializer import bias_init_with_prob, constant_ from ..backbones.csp_darknet import BaseConv, DWConv from ..losses import IouLoss from ppdet.modeling.assigners.simota_assigner import SimOTAAssigner from ppdet.modeling.bbox_utils import bbox_overlaps from ppdet.modeling.layers import MultiClassNMS __all__ = ['YOLOv3Head', 'YOLOXHead'] def _de_sigmoid(x, eps=1e-7): x = paddle.clip(x, eps, 1. / eps) x = paddle.clip(1. / x - 1., eps, 1. / eps) x = -paddle.log(x) return x @register class YOLOv3Head(nn.Layer): __shared__ = ['num_classes', 'data_format'] __inject__ = ['loss'] def __init__(self, in_channels=[1024, 512, 256], anchors=[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]], anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 2]], num_classes=80, loss='YOLOv3Loss', iou_aware=False, iou_aware_factor=0.4, data_format='NCHW'): """ Head for YOLOv3 network Args: num_classes (int): number of foreground classes anchors (list): anchors anchor_masks (list): anchor masks loss (object): YOLOv3Loss instance iou_aware (bool): whether to use iou_aware iou_aware_factor (float): iou aware factor data_format (str): data format, NCHW or NHWC """ super(YOLOv3Head, self).__init__() assert len(in_channels) > 0, "in_channels length should > 0" self.in_channels = in_channels self.num_classes = num_classes self.loss = loss self.iou_aware = iou_aware self.iou_aware_factor = iou_aware_factor self.parse_anchor(anchors, anchor_masks) self.num_outputs = len(self.anchors) self.data_format = data_format self.yolo_outputs = [] for i in range(len(self.anchors)): if self.iou_aware: num_filters = len(self.anchors[i]) * (self.num_classes + 6) else: num_filters = len(self.anchors[i]) * (self.num_classes + 5) name = 'yolo_output.{}'.format(i) conv = nn.Conv2D( in_channels=self.in_channels[i], out_channels=num_filters, kernel_size=1, stride=1, padding=0, data_format=data_format, bias_attr=ParamAttr(regularizer=L2Decay(0.))) conv.skip_quant = True yolo_output = self.add_sublayer(name, conv) self.yolo_outputs.append(yolo_output) def parse_anchor(self, anchors, anchor_masks): self.anchors = [[anchors[i] for i in mask] for mask in anchor_masks] self.mask_anchors = [] anchor_num = len(anchors) for masks in anchor_masks: self.mask_anchors.append([]) for mask in masks: assert mask < anchor_num, "anchor mask index overflow" self.mask_anchors[-1].extend(anchors[mask]) def forward(self, feats, targets=None): assert len(feats) == len(self.anchors) yolo_outputs = [] for i, feat in enumerate(feats): yolo_output = self.yolo_outputs[i](feat) if self.data_format == 'NHWC': yolo_output = paddle.transpose(yolo_output, [0, 3, 1, 2]) yolo_outputs.append(yolo_output) if self.training: return self.loss(yolo_outputs, targets, self.anchors) else: if self.iou_aware: y = [] for i, out in enumerate(yolo_outputs): na = len(self.anchors[i]) ioup, x = out[:, 0:na, :, :], out[:, na:, :, :] b, c, h, w = x.shape no = c // na x = x.reshape((b, na, no, h * w)) ioup = ioup.reshape((b, na, 1, h * w)) obj = x[:, :, 4:5, :] ioup = F.sigmoid(ioup) obj = F.sigmoid(obj) obj_t = (obj**(1 - self.iou_aware_factor)) * ( ioup**self.iou_aware_factor) obj_t = _de_sigmoid(obj_t) loc_t = x[:, :, :4, :] cls_t = x[:, :, 5:, :] y_t = paddle.concat([loc_t, obj_t, cls_t], axis=2) y_t = y_t.reshape((b, c, h, w)) y.append(y_t) 
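# y_t restores the original (b, c, h, w) layout with the fused
# objectness written back into channel 4 of each anchor's slice.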
return y else: return yolo_outputs @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @register class YOLOXHead(nn.Layer): __shared__ = ['num_classes', 'width_mult', 'act', 'trt', 'exclude_nms'] __inject__ = ['assigner', 'nms'] def __init__(self, num_classes=80, width_mult=1.0, depthwise=False, in_channels=[256, 512, 1024], feat_channels=256, fpn_strides=(8, 16, 32), l1_epoch=285, act='silu', assigner=SimOTAAssigner(use_vfl=False), nms='MultiClassNMS', loss_weight={ 'cls': 1.0, 'obj': 1.0, 'iou': 5.0, 'l1': 1.0, }, trt=False, exclude_nms=False): super(YOLOXHead, self).__init__() self._dtype = paddle.framework.get_default_dtype() self.num_classes = num_classes assert len(in_channels) > 0, "in_channels length should > 0" self.in_channels = in_channels feat_channels = int(feat_channels * width_mult) self.fpn_strides = fpn_strides self.l1_epoch = l1_epoch self.assigner = assigner self.nms = nms if isinstance(self.nms, MultiClassNMS) and trt: self.nms.trt = trt self.exclude_nms = exclude_nms self.loss_weight = loss_weight self.iou_loss = IouLoss(loss_weight=1.0) # default loss_weight 2.5 ConvBlock = DWConv if depthwise else BaseConv self.stem_conv = nn.LayerList() self.conv_cls = nn.LayerList() self.conv_reg = nn.LayerList() # reg [x,y,w,h] + obj for in_c in self.in_channels: self.stem_conv.append(BaseConv(in_c, feat_channels, 1, 1, act=act)) self.conv_cls.append( nn.Sequential(* [ ConvBlock( feat_channels, feat_channels, 3, 1, act=act), ConvBlock( feat_channels, feat_channels, 3, 1, act=act), nn.Conv2D( feat_channels, self.num_classes, 1, bias_attr=ParamAttr(regularizer=L2Decay(0.0))) ])) self.conv_reg.append( nn.Sequential(* [ ConvBlock( feat_channels, feat_channels, 3, 1, act=act), ConvBlock( feat_channels, feat_channels, 3, 1, act=act), nn.Conv2D( feat_channels, 4 + 1, # reg [x,y,w,h] + obj 1, bias_attr=ParamAttr(regularizer=L2Decay(0.0))) ])) self._init_weights() @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } def _init_weights(self): bias_cls = bias_init_with_prob(0.01) bias_reg = paddle.full([5], math.log(5.), dtype=self._dtype) bias_reg[:2] = 0. 
bias_reg[-1] = bias_cls for cls_, reg_ in zip(self.conv_cls, self.conv_reg): constant_(cls_[-1].weight) constant_(cls_[-1].bias, bias_cls) constant_(reg_[-1].weight) reg_[-1].bias.set_value(bias_reg) def _generate_anchor_point(self, feat_sizes, strides, offset=0.): anchor_points, stride_tensor = [], [] num_anchors_list = [] for feat_size, stride in zip(feat_sizes, strides): h, w = feat_size x = (paddle.arange(w) + offset) * stride y = (paddle.arange(h) + offset) * stride y, x = paddle.meshgrid(y, x) anchor_points.append(paddle.stack([x, y], axis=-1).reshape([-1, 2])) stride_tensor.append( paddle.full( [len(anchor_points[-1]), 1], stride, dtype=self._dtype)) num_anchors_list.append(len(anchor_points[-1])) anchor_points = paddle.concat(anchor_points).astype(self._dtype) anchor_points.stop_gradient = True stride_tensor = paddle.concat(stride_tensor) stride_tensor.stop_gradient = True return anchor_points, stride_tensor, num_anchors_list def forward(self, feats, targets=None): assert len(feats) == len(self.fpn_strides), \ "The size of feats is not equal to size of fpn_strides" feat_sizes = [[f.shape[-2], f.shape[-1]] for f in feats] cls_score_list, reg_pred_list = [], [] obj_score_list = [] for i, feat in enumerate(feats): feat = self.stem_conv[i](feat) cls_logit = self.conv_cls[i](feat) reg_pred = self.conv_reg[i](feat) # cls prediction cls_score = F.sigmoid(cls_logit) cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) # reg prediction reg_xywh, obj_logit = paddle.split(reg_pred, [4, 1], axis=1) reg_xywh = reg_xywh.flatten(2).transpose([0, 2, 1]) reg_pred_list.append(reg_xywh) # obj prediction obj_score = F.sigmoid(obj_logit) obj_score_list.append(obj_score.flatten(2).transpose([0, 2, 1])) cls_score_list = paddle.concat(cls_score_list, axis=1) reg_pred_list = paddle.concat(reg_pred_list, axis=1) obj_score_list = paddle.concat(obj_score_list, axis=1) # bbox decode anchor_points, stride_tensor, _ =\ self._generate_anchor_point(feat_sizes, self.fpn_strides) reg_xy, reg_wh = paddle.split(reg_pred_list, 2, axis=-1) reg_xy += (anchor_points / stride_tensor) reg_wh = paddle.exp(reg_wh) * 0.5 bbox_pred_list = paddle.concat( [reg_xy - reg_wh, reg_xy + reg_wh], axis=-1) if self.training: anchor_points, stride_tensor, num_anchors_list =\ self._generate_anchor_point(feat_sizes, self.fpn_strides, 0.5) yolox_losses = self.get_loss([ cls_score_list, bbox_pred_list, obj_score_list, anchor_points, stride_tensor, num_anchors_list ], targets) return yolox_losses else: pred_scores = (cls_score_list * obj_score_list).sqrt() return pred_scores, bbox_pred_list, stride_tensor def get_loss(self, head_outs, targets): pred_cls, pred_bboxes, pred_obj,\ anchor_points, stride_tensor, num_anchors_list = head_outs gt_labels = targets['gt_class'] gt_bboxes = targets['gt_bbox'] pred_scores = (pred_cls * pred_obj).sqrt() # label assignment center_and_strides = paddle.concat( [anchor_points, stride_tensor, stride_tensor], axis=-1) pos_num_list, label_list, bbox_target_list = [], [], [] for pred_score, pred_bbox, gt_box, gt_label in zip( pred_scores.detach(), pred_bboxes.detach() * stride_tensor, gt_bboxes, gt_labels): pos_num, label, _, bbox_target = self.assigner( pred_score, center_and_strides, pred_bbox, gt_box, gt_label) pos_num_list.append(pos_num) label_list.append(label) bbox_target_list.append(bbox_target) labels = paddle.to_tensor(np.stack(label_list, axis=0)) bbox_targets = paddle.to_tensor(np.stack(bbox_target_list, axis=0)) bbox_targets /= stride_tensor # rescale bbox # 1. 
obj score loss mask_positive = (labels != self.num_classes) loss_obj = F.binary_cross_entropy( pred_obj, mask_positive.astype(pred_obj.dtype).unsqueeze(-1), reduction='sum') num_pos = sum(pos_num_list) if num_pos > 0: num_pos = paddle.to_tensor(num_pos, dtype=self._dtype).clip(min=1) loss_obj /= num_pos # 2. iou loss bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4]) pred_bboxes_pos = paddle.masked_select(pred_bboxes, bbox_mask).reshape([-1, 4]) assigned_bboxes_pos = paddle.masked_select( bbox_targets, bbox_mask).reshape([-1, 4]) bbox_iou = bbox_overlaps(pred_bboxes_pos, assigned_bboxes_pos) bbox_iou = paddle.diag(bbox_iou) loss_iou = self.iou_loss( pred_bboxes_pos.split( 4, axis=-1), assigned_bboxes_pos.split( 4, axis=-1)) loss_iou = loss_iou.sum() / num_pos # 3. cls loss cls_mask = mask_positive.unsqueeze(-1).tile( [1, 1, self.num_classes]) pred_cls_pos = paddle.masked_select( pred_cls, cls_mask).reshape([-1, self.num_classes]) assigned_cls_pos = paddle.masked_select(labels, mask_positive) assigned_cls_pos = F.one_hot(assigned_cls_pos, self.num_classes + 1)[..., :-1] assigned_cls_pos *= bbox_iou.unsqueeze(-1) loss_cls = F.binary_cross_entropy( pred_cls_pos, assigned_cls_pos, reduction='sum') loss_cls /= num_pos # 4. l1 loss if targets['epoch_id'] >= self.l1_epoch: loss_l1 = F.l1_loss( pred_bboxes_pos, assigned_bboxes_pos, reduction='sum') loss_l1 /= num_pos else: loss_l1 = paddle.zeros([]) loss_l1.stop_gradient = False else: loss_cls = paddle.zeros([]) loss_iou = paddle.zeros([]) loss_l1 = paddle.zeros([]) loss_cls.stop_gradient = False loss_iou.stop_gradient = False loss_l1.stop_gradient = False loss = self.loss_weight['obj'] * loss_obj + \ self.loss_weight['cls'] * loss_cls + \ self.loss_weight['iou'] * loss_iou if targets['epoch_id'] >= self.l1_epoch: loss += (self.loss_weight['l1'] * loss_l1) yolox_losses = { 'loss': loss, 'loss_cls': loss_cls, 'loss_obj': loss_obj, 'loss_iou': loss_iou, 'loss_l1': loss_l1, } return yolox_losses def post_process(self, head_outs, img_shape, scale_factor): pred_scores, pred_bboxes, stride_tensor = head_outs pred_scores = pred_scores.transpose([0, 2, 1]) pred_bboxes *= stride_tensor # scale bbox to origin image scale_factor = scale_factor.flip(-1).tile([1, 2]).unsqueeze(1) pred_bboxes /= scale_factor if self.exclude_nms: # `exclude_nms=True` just use in benchmark return pred_bboxes.sum(), pred_scores.sum() else: bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) return bbox_pred, bbox_num ================================================ FILE: ppdet/modeling/heads/yolof_head.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
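# YOLOF detects from a single feature level: classification logits are
# fused with an objectness branch via the implicit-objectness term
#   norm_cls = cls + obj - log(1 + exp(cls) + exp(obj))
# (computed with clipping in forward()), and positive anchors are chosen
# by the injected UniformAssigner instead of IoU thresholds.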
import math import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from paddle.nn.initializer import Normal, Constant from ppdet.modeling.layers import MultiClassNMS from ppdet.core.workspace import register from ppdet.modeling.bbox_utils import delta2bbox_v2 __all__ = ['YOLOFHead'] INF = 1e8 def reduce_mean(tensor): world_size = paddle.distributed.get_world_size() if world_size == 1: return tensor paddle.distributed.all_reduce(tensor) return tensor / world_size def find_inside_anchor(feat_size, stride, num_anchors, im_shape): feat_h, feat_w = feat_size[:2] im_h, im_w = im_shape[:2] inside_h = min(int(np.ceil(im_h / stride)), feat_h) inside_w = min(int(np.ceil(im_w / stride)), feat_w) inside_mask = paddle.zeros([feat_h, feat_w], dtype=paddle.bool) inside_mask[:inside_h, :inside_w] = True inside_mask = inside_mask.unsqueeze(-1).expand( [feat_h, feat_w, num_anchors]) return inside_mask.reshape([-1]) @register class YOLOFFeat(nn.Layer): def __init__(self, feat_in=256, feat_out=256, num_cls_convs=2, num_reg_convs=4, norm_type='bn'): super(YOLOFFeat, self).__init__() assert norm_type == 'bn', "YOLOFFeat only support BN now." self.feat_in = feat_in self.feat_out = feat_out self.num_cls_convs = num_cls_convs self.num_reg_convs = num_reg_convs self.norm_type = norm_type cls_subnet, reg_subnet = [], [] for i in range(self.num_cls_convs): feat_in = self.feat_in if i == 0 else self.feat_out cls_subnet.append( nn.Conv2D( feat_in, self.feat_out, 3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0.0)))) cls_subnet.append( nn.BatchNorm2D( self.feat_out, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) cls_subnet.append(nn.ReLU()) for i in range(self.num_reg_convs): feat_in = self.feat_in if i == 0 else self.feat_out reg_subnet.append( nn.Conv2D( feat_in, self.feat_out, 3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0.0)))) reg_subnet.append( nn.BatchNorm2D( self.feat_out, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) reg_subnet.append(nn.ReLU()) self.cls_subnet = nn.Sequential(*cls_subnet) self.reg_subnet = nn.Sequential(*reg_subnet) def forward(self, fpn_feat): cls_feat = self.cls_subnet(fpn_feat) reg_feat = self.reg_subnet(fpn_feat) return cls_feat, reg_feat @register class YOLOFHead(nn.Layer): __shared__ = ['num_classes', 'trt', 'exclude_nms'] __inject__ = [ 'conv_feat', 'anchor_generator', 'bbox_assigner', 'loss_class', 'loss_bbox', 'nms' ] def __init__(self, num_classes=80, conv_feat='YOLOFFeat', anchor_generator='AnchorGenerator', bbox_assigner='UniformAssigner', loss_class='FocalLoss', loss_bbox='GIoULoss', ctr_clip=32.0, delta_mean=[0.0, 0.0, 0.0, 0.0], delta_std=[1.0, 1.0, 1.0, 1.0], nms='MultiClassNMS', prior_prob=0.01, nms_pre=1000, use_inside_anchor=False, trt=False, exclude_nms=False): super(YOLOFHead, self).__init__() self.num_classes = num_classes self.conv_feat = conv_feat self.anchor_generator = anchor_generator self.na = self.anchor_generator.num_anchors self.bbox_assigner = bbox_assigner self.loss_class = loss_class self.loss_bbox = loss_bbox self.ctr_clip = ctr_clip self.delta_mean = delta_mean self.delta_std = delta_std self.nms = nms self.nms_pre = nms_pre self.use_inside_anchor = use_inside_anchor if 
isinstance(self.nms, MultiClassNMS) and trt: self.nms.trt = trt self.exclude_nms = exclude_nms bias_init_value = -math.log((1 - prior_prob) / prior_prob) self.cls_score = self.add_sublayer( 'cls_score', nn.Conv2D( in_channels=conv_feat.feat_out, out_channels=self.num_classes * self.na, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant( value=bias_init_value)))) self.bbox_pred = self.add_sublayer( 'bbox_pred', nn.Conv2D( in_channels=conv_feat.feat_out, out_channels=4 * self.na, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) self.object_pred = self.add_sublayer( 'object_pred', nn.Conv2D( in_channels=conv_feat.feat_out, out_channels=self.na, kernel_size=3, stride=1, padding=1, weight_attr=ParamAttr(initializer=Normal( mean=0.0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(value=0)))) def forward(self, feats, targets=None): assert len(feats) == 1, "YOLOF only has one level feature." conv_cls_feat, conv_reg_feat = self.conv_feat(feats[0]) cls_logits = self.cls_score(conv_cls_feat) objectness = self.object_pred(conv_reg_feat) bboxes_reg = self.bbox_pred(conv_reg_feat) N, C, H, W = cls_logits.shape[:] cls_logits = cls_logits.reshape((N, self.na, self.num_classes, H, W)) objectness = objectness.reshape((N, self.na, 1, H, W)) norm_cls_logits = cls_logits + objectness - paddle.log( 1.0 + paddle.clip( cls_logits.exp(), max=INF) + paddle.clip( objectness.exp(), max=INF)) norm_cls_logits = norm_cls_logits.reshape((N, C, H, W)) anchors = self.anchor_generator([norm_cls_logits]) if self.training: yolof_losses = self.get_loss( [anchors[0], norm_cls_logits, bboxes_reg], targets) return yolof_losses else: return anchors[0], norm_cls_logits, bboxes_reg def get_loss(self, head_outs, targets): anchors, cls_logits, bbox_preds = head_outs feat_size = cls_logits.shape[-2:] cls_logits = cls_logits.transpose([0, 2, 3, 1]) cls_logits = cls_logits.reshape([0, -1, self.num_classes]) bbox_preds = bbox_preds.transpose([0, 2, 3, 1]) bbox_preds = bbox_preds.reshape([0, -1, 4]) num_pos_list = [] cls_pred_list, cls_tar_list = [], [] reg_pred_list, reg_tar_list = [], [] # find and gather preds and targets in each image for cls_logit, bbox_pred, gt_bbox, gt_class, im_shape in zip( cls_logits, bbox_preds, targets['gt_bbox'], targets['gt_class'], targets['im_shape']): if self.use_inside_anchor: inside_mask = find_inside_anchor( feat_size, self.anchor_generator.strides[0], self.na, im_shape.tolist()) cls_logit = cls_logit[inside_mask] bbox_pred = bbox_pred[inside_mask] anchors = anchors[inside_mask] bbox_pred = delta2bbox_v2( bbox_pred, anchors, self.delta_mean, self.delta_std, ctr_clip=self.ctr_clip) bbox_pred = bbox_pred.reshape([-1, bbox_pred.shape[-1]]) # -2:ignore, -1:neg, >=0:pos match_labels, pos_bbox_pred, pos_bbox_tar = self.bbox_assigner( bbox_pred, anchors, gt_bbox) pos_mask = (match_labels >= 0) neg_mask = (match_labels == -1) chosen_mask = paddle.logical_or(pos_mask, neg_mask) gt_class = gt_class.reshape([-1]) bg_class = paddle.to_tensor( [self.num_classes], dtype=gt_class.dtype) # a trick to assign num_classes to negative targets gt_class = paddle.concat([gt_class, bg_class], axis=-1) match_labels = paddle.where( neg_mask, paddle.full_like(match_labels, gt_class.size - 1), match_labels) num_pos_list.append(max(1.0, pos_mask.sum().item())) cls_pred_list.append(cls_logit[chosen_mask]) 
    def get_bboxes_single(self,
                          anchors,
                          cls_scores,
                          bbox_preds,
                          im_shape,
                          scale_factor,
                          rescale=True):
        assert len(cls_scores) == len(bbox_preds)
        mlvl_bboxes = []
        mlvl_scores = []
        for anchor, cls_score, bbox_pred in zip(anchors, cls_scores,
                                                bbox_preds):
            cls_score = cls_score.reshape([-1, self.num_classes])
            bbox_pred = bbox_pred.reshape([-1, 4])
            if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre:
                max_score = cls_score.max(axis=1)
                _, topk_inds = max_score.topk(self.nms_pre)
                bbox_pred = bbox_pred.gather(topk_inds)
                anchor = anchor.gather(topk_inds)
                cls_score = cls_score.gather(topk_inds)
            bbox_pred = delta2bbox_v2(
                bbox_pred,
                anchor,
                self.delta_mean,
                self.delta_std,
                max_shape=im_shape,
                ctr_clip=self.ctr_clip).squeeze()
            mlvl_bboxes.append(bbox_pred)
            mlvl_scores.append(F.sigmoid(cls_score))
        mlvl_bboxes = paddle.concat(mlvl_bboxes)
        mlvl_bboxes = paddle.squeeze(mlvl_bboxes)
        if rescale:
            mlvl_bboxes = mlvl_bboxes / paddle.concat(
                [scale_factor[::-1], scale_factor[::-1]])
        mlvl_scores = paddle.concat(mlvl_scores)
        mlvl_scores = mlvl_scores.transpose([1, 0])
        return mlvl_bboxes, mlvl_scores

    def decode(self, anchors, cls_scores, bbox_preds, im_shape, scale_factor):
        batch_bboxes = []
        batch_scores = []
        for img_id in range(cls_scores[0].shape[0]):
            num_lvls = len(cls_scores)
            cls_score_list = [cls_scores[i][img_id] for i in range(num_lvls)]
            bbox_pred_list = [bbox_preds[i][img_id] for i in range(num_lvls)]
            bboxes, scores = self.get_bboxes_single(
                anchors, cls_score_list, bbox_pred_list, im_shape[img_id],
                scale_factor[img_id])
            batch_bboxes.append(bboxes)
            batch_scores.append(scores)
        batch_bboxes = paddle.stack(batch_bboxes, 0)
        batch_scores = paddle.stack(batch_scores, 0)
        return batch_bboxes, batch_scores

    def post_process(self, head_outs, im_shape, scale_factor):
        anchors, cls_scores, bbox_preds = head_outs
        cls_scores = cls_scores.transpose([0, 2, 3, 1])
        bbox_preds = bbox_preds.transpose([0, 2, 3, 1])
        pred_bboxes, pred_scores = self.decode(
            [anchors], [cls_scores], [bbox_preds], im_shape, scale_factor)

        if self.exclude_nms:
            # `exclude_nms=True` is only used for speed benchmarking
            return pred_bboxes.sum(), pred_scores.sum()
        else:
            bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores)
            return bbox_pred, bbox_num
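# --- Decoding sketch (for intuition only; not the exact implementation) ---
# get_bboxes_single() relies on delta2bbox_v2 from ppdet/modeling/bbox_utils.
# Under the conventional R-CNN parameterization it roughly amounts to, for an
# anchor (x1, y1, x2, y2) and a denormalized delta d = d * delta_std + delta_mean:
#
#   import numpy as np
#   def decode_one(anchor, d, ctr_clip=32.0):
#       x1, y1, x2, y2 = anchor
#       w, h = x2 - x1, y2 - y1
#       cx, cy = (x1 + x2) / 2., (y1 + y2) / 2.
#       dx, dy, dw, dh = d
#       # shift the center (the shift is clipped to +-ctr_clip pixels) ...
#       cx += np.clip(dx * w, -ctr_clip, ctr_clip)
#       cy += np.clip(dy * h, -ctr_clip, ctr_clip)
#       # ... and rescale width/height
#       w, h = w * np.exp(dw), h * np.exp(dh)
#       return [cx - w / 2., cy - h / 2., cx + w / 2., cy + h / 2.]
#
# The real delta2bbox_v2 additionally clips boxes to `max_shape` (the image);
# consult bbox_utils.py for the exact variant.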
================================================
FILE: ppdet/modeling/initializer.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py
The copyright of pytorch/pytorch is a BSD-style license, as found in the
LICENSE file.
"""

import math
import numpy as np

import paddle
import paddle.nn as nn

__all__ = [
    'uniform_', 'normal_', 'constant_', 'ones_', 'zeros_', 'xavier_uniform_',
    'xavier_normal_', 'kaiming_uniform_', 'kaiming_normal_', 'linear_init_',
    'conv_init_', 'reset_initialized_parameter',
]


def _no_grad_uniform_(tensor, a, b):
    with paddle.no_grad():
        tensor.set_value(
            paddle.uniform(
                shape=tensor.shape, dtype=tensor.dtype, min=a, max=b))
    return tensor


def _no_grad_normal_(tensor, mean=0., std=1.):
    with paddle.no_grad():
        tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape))
    return tensor


def _no_grad_fill_(tensor, value=0.):
    with paddle.no_grad():
        tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype))
    return tensor


def uniform_(tensor, a, b):
    """
    Modify tensor in-place with values drawn from a uniform distribution.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        a (float|int): min value.
        b (float|int): max value.
    Return:
        tensor
    """
    return _no_grad_uniform_(tensor, a, b)


def normal_(tensor, mean=0., std=1.):
    """
    Modify tensor in-place with values drawn from a normal distribution.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        mean (float|int): mean value.
        std (float|int): std value.
    Return:
        tensor
    """
    return _no_grad_normal_(tensor, mean, std)


def constant_(tensor, value=0.):
    """
    Modify tensor in-place, filling it with a constant.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        value (float|int): value to fill tensor.
    Return:
        tensor
    """
    return _no_grad_fill_(tensor, value)


def ones_(tensor):
    """
    Modify tensor in-place, filling it with ones.
    Args:
        tensor (paddle.Tensor): paddle Tensor
    Return:
        tensor
    """
    return _no_grad_fill_(tensor, 1)


def zeros_(tensor):
    """
    Modify tensor in-place, filling it with zeros.
    Args:
        tensor (paddle.Tensor): paddle Tensor
    Return:
        tensor
    """
    return _no_grad_fill_(tensor, 0)


def vector_(tensor, vector):
    with paddle.no_grad():
        tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype))
    return tensor


def _calculate_fan_in_and_fan_out(tensor, reverse=False):
    """
    Calculate (fan_in, fan_out) for a tensor.
    Args:
        tensor (Tensor): paddle.Tensor
        reverse (bool: False): tensor data format order, False by default as
            [fout, fin, ...]. e.g.: conv.weight [cout, cin, kh, kw] is False;
            linear.weight [cin, cout] is True.
    Return:
        Tuple[fan_in, fan_out]
    """
    if tensor.ndim < 2:
        raise ValueError(
            "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
        )
    if reverse:
        num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1]
    else:
        num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0]
    receptive_field_size = 1
    if tensor.ndim > 2:
        receptive_field_size = np.prod(tensor.shape[2:])
    fan_in = num_input_fmaps * receptive_field_size
    fan_out = num_output_fmaps * receptive_field_size
    return fan_in, fan_out
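# --- Worked example (illustrative, not part of the original file) ---
# For a conv weight of shape [cout, cin, kh, kw] = [64, 3, 7, 7] with
# reverse=False:
#     receptive_field_size = 7 * 7 = 49
#     fan_in  = cin  * 49 = 3  * 49 = 147
#     fan_out = cout * 49 = 64 * 49 = 3136
# For a linear weight stored as [cin, cout], pass reverse=True so the two
# roles are swapped.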
def xavier_uniform_(tensor, gain=1., reverse=False):
    """
    Modify tensor in-place using the xavier_uniform_ method.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        gain (float): scaling factor, 1. by default.
        reverse (bool): tensor data format order, False by default as
            [fout, fin, ...].
    Return:
        tensor
    """
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
    k = math.sqrt(3.0) * std
    return _no_grad_uniform_(tensor, -k, k)


def xavier_normal_(tensor, gain=1., reverse=False):
    """
    Modify tensor in-place using the xavier_normal_ method.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        gain (float): scaling factor, 1. by default.
        reverse (bool): tensor data format order, False by default as
            [fout, fin, ...].
    Return:
        tensor
    """
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
    return _no_grad_normal_(tensor, 0, std)


# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html
def _calculate_correct_fan(tensor, mode, reverse=False):
    mode = mode.lower()
    valid_modes = ['fan_in', 'fan_out']
    if mode not in valid_modes:
        raise ValueError("Mode {} not supported, please use one of {}".format(
            mode, valid_modes))
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse)
    return fan_in if mode == 'fan_in' else fan_out


def _calculate_gain(nonlinearity, param=None):
    linear_fns = [
        'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d',
        'conv_transpose2d', 'conv_transpose3d'
    ]
    if nonlinearity in linear_fns or nonlinearity == 'sigmoid':
        return 1
    elif nonlinearity == 'tanh':
        return 5.0 / 3
    elif nonlinearity == 'relu':
        return math.sqrt(2.0)
    elif nonlinearity == 'leaky_relu':
        if param is None:
            negative_slope = 0.01
        elif not isinstance(param, bool) and isinstance(
                param, int) or isinstance(param, float):
            # True/False are instances of int, hence check above
            negative_slope = param
        else:
            raise ValueError("negative_slope {} not a valid number".format(
                param))
        return math.sqrt(2.0 / (1 + negative_slope**2))
    elif nonlinearity == 'selu':
        return 3.0 / 4
    else:
        raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))


def kaiming_uniform_(tensor,
                     a=0,
                     mode='fan_in',
                     nonlinearity='leaky_relu',
                     reverse=False):
    """
    Modify tensor in-place using the kaiming_uniform method.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        a (float): the negative slope used with 'leaky_relu'.
        mode (str): ['fan_in', 'fan_out'], 'fan_in' by default
        nonlinearity (str): nonlinearity method name
        reverse (bool): tensor data format order, False by default as
            [fout, fin, ...].
    Return:
        tensor
    """
    fan = _calculate_correct_fan(tensor, mode, reverse)
    gain = _calculate_gain(nonlinearity, a)
    std = gain / math.sqrt(fan)
    k = math.sqrt(3.0) * std
    return _no_grad_uniform_(tensor, -k, k)
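# --- Worked example (illustrative, not part of the original file) ---
# kaiming_uniform_ with mode='fan_in' and nonlinearity='relu' on the
# [64, 3, 7, 7] conv weight above uses fan = 147 and gain = sqrt(2):
#     std = sqrt(2) / sqrt(147) ~= 0.1166
#     k   = sqrt(3) * std      ~= 0.2020
# so weights are drawn from U(-k, k); kaiming_normal_ (below) instead draws
# from N(0, std^2) with the same std.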
def kaiming_normal_(tensor,
                    a=0,
                    mode='fan_in',
                    nonlinearity='leaky_relu',
                    reverse=False):
    """
    Modify tensor in-place using the kaiming_normal_ method.
    Args:
        tensor (paddle.Tensor): paddle Tensor
        a (float): the negative slope used with 'leaky_relu'.
        mode (str): ['fan_in', 'fan_out'], 'fan_in' by default
        nonlinearity (str): nonlinearity method name
        reverse (bool): tensor data format order, False by default as
            [fout, fin, ...].
    Return:
        tensor
    """
    fan = _calculate_correct_fan(tensor, mode, reverse)
    gain = _calculate_gain(nonlinearity, a)
    std = gain / math.sqrt(fan)
    return _no_grad_normal_(tensor, 0, std)


def linear_init_(module):
    bound = 1 / math.sqrt(module.weight.shape[0])
    uniform_(module.weight, -bound, bound)
    if hasattr(module, "bias") and module.bias is not None:
        uniform_(module.bias, -bound, bound)


def conv_init_(module):
    bound = 1 / np.sqrt(np.prod(module.weight.shape[1:]))
    uniform_(module.weight, -bound, bound)
    if module.bias is not None:
        uniform_(module.bias, -bound, bound)


def bias_init_with_prob(prior_prob=0.01):
    """initialize conv/fc bias value according to a given probability value."""
    bias_init = float(-np.log((1 - prior_prob) / prior_prob))
    return bias_init


@paddle.no_grad()
def reset_initialized_parameter(model, include_self=True):
    """
    Re-initialize the parameters of [conv, linear, embedding, bn] sublayers
    using the methods above.
    Args:
        model (paddle.Layer): paddle Layer
        include_self (bool: True): passed to Layer.named_sublayers; indicates
            whether the layer itself is included.
    Return:
        None
    """
    for _, m in model.named_sublayers(include_self=include_self):
        if isinstance(m, nn.Conv2D):
            k = float(m._groups) / (m._in_channels * m._kernel_size[0] *
                                    m._kernel_size[1])
            k = math.sqrt(k)
            _no_grad_uniform_(m.weight, -k, k)
            if hasattr(m, 'bias') and getattr(m, 'bias') is not None:
                _no_grad_uniform_(m.bias, -k, k)
        elif isinstance(m, nn.Linear):
            k = math.sqrt(1. / m.weight.shape[0])
            _no_grad_uniform_(m.weight, -k, k)
            if hasattr(m, 'bias') and getattr(m, 'bias') is not None:
                _no_grad_uniform_(m.bias, -k, k)
        elif isinstance(m, nn.Embedding):
            _no_grad_normal_(m.weight, mean=0., std=1.)
        elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)):
            _no_grad_fill_(m.weight, 1.)
            if hasattr(m, 'bias') and getattr(m, 'bias') is not None:
                _no_grad_fill_(m.bias, 0)


================================================
FILE: ppdet/modeling/keypoint_utils.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on https://github.com/open-mmlab/mmpose
"""

import warnings

import cv2
import numpy as np
import paddle.nn.functional as F


def get_affine_mat_kernel(h, w, s, inv=False):
    if w < h:
        w_ = s
        h_ = int(np.ceil((s / w * h) / 64.) * 64)
        scale_w = w
        scale_h = h_ / w_ * w
    else:
        h_ = s
        w_ = int(np.ceil((s / h * w) / 64.) * 64)
        scale_h = h
        scale_w = w_ / h_ * h

    center = np.array([np.round(w / 2.), np.round(h / 2.)])
    size_resized = (w_, h_)
    trans = get_affine_transform(
        center, np.array([scale_w, scale_h]), 0, size_resized, inv=inv)

    return trans, size_resized


def get_affine_transform(center,
                         input_size,
                         rot,
                         output_size,
                         shift=(0., 0.),
                         inv=False):
    """Get the affine transform matrix, given the center/scale/rot/output_size.

    Args:
        center (np.ndarray[2, ]): Center of the bounding box (x, y).
        input_size (np.ndarray[2, ]): Size of input feature (width, height).
        rot (float): Rotation angle (degree).
        output_size (np.ndarray[2, ]): Size of the destination heatmaps.
shift (0-100%): Shift translation ratio wrt the width/height. Default (0., 0.). inv (bool): Option to inverse the affine transform direction. (inv=False: src->dst or inv=True: dst->src) Returns: np.ndarray: The transform matrix. """ assert len(center) == 2 assert len(output_size) == 2 assert len(shift) == 2 if not isinstance(input_size, (np.ndarray, list)): input_size = np.array([input_size, input_size], dtype=np.float32) scale_tmp = input_size shift = np.array(shift) src_w = scale_tmp[0] dst_w = output_size[0] dst_h = output_size[1] rot_rad = np.pi * rot / 180 src_dir = rotate_point([0., src_w * -0.5], rot_rad) dst_dir = np.array([0., dst_w * -0.5]) src = np.zeros((3, 2), dtype=np.float32) src[0, :] = center + scale_tmp * shift src[1, :] = center + src_dir + scale_tmp * shift src[2, :] = _get_3rd_point(src[0, :], src[1, :]) dst = np.zeros((3, 2), dtype=np.float32) dst[0, :] = [dst_w * 0.5, dst_h * 0.5] dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) if inv: trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) else: trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) return trans def get_warp_matrix(theta, size_input, size_dst, size_target): """This code is based on https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py Calculate the transformation matrix under the constraint of unbiased. Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020). Args: theta (float): Rotation angle in degrees. size_input (np.ndarray): Size of input image [w, h]. size_dst (np.ndarray): Size of output image [w, h]. size_target (np.ndarray): Size of ROI in input plane [w, h]. Returns: matrix (np.ndarray): A matrix for transformation. """ theta = np.deg2rad(theta) matrix = np.zeros((2, 3), dtype=np.float32) scale_x = size_dst[0] / size_target[0] scale_y = size_dst[1] / size_target[1] matrix[0, 0] = np.cos(theta) * scale_x matrix[0, 1] = -np.sin(theta) * scale_x matrix[0, 2] = scale_x * ( -0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] * np.sin(theta) + 0.5 * size_target[0]) matrix[1, 0] = np.sin(theta) * scale_y matrix[1, 1] = np.cos(theta) * scale_y matrix[1, 2] = scale_y * ( -0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] * np.cos(theta) + 0.5 * size_target[1]) return matrix def _get_3rd_point(a, b): """To calculate the affine matrix, three pairs of points are required. This function is used to get the 3rd point, given 2D points a & b. The 3rd point is defined by rotating vector `a - b` by 90 degrees anticlockwise, using b as the rotation center. Args: a (np.ndarray): point(x,y) b (np.ndarray): point(x,y) Returns: np.ndarray: The 3rd point. """ assert len( a) == 2, 'input of _get_3rd_point should be point with length of 2' assert len( b) == 2, 'input of _get_3rd_point should be point with length of 2' direction = a - b third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32) return third_pt def rotate_point(pt, angle_rad): """Rotate a point by an angle. Args: pt (list[float]): 2 dimensional point to be rotated angle_rad (float): rotation angle by radian Returns: list[float]: Rotated point. 
""" assert len(pt) == 2 sn, cs = np.sin(angle_rad), np.cos(angle_rad) new_x = pt[0] * cs - pt[1] * sn new_y = pt[0] * sn + pt[1] * cs rotated_pt = [new_x, new_y] return rotated_pt def transpred(kpts, h, w, s): trans, _ = get_affine_mat_kernel(h, w, s, inv=True) return warp_affine_joints(kpts[..., :2].copy(), trans) def warp_affine_joints(joints, mat): """Apply affine transformation defined by the transform matrix on the joints. Args: joints (np.ndarray[..., 2]): Origin coordinate of joints. mat (np.ndarray[3, 2]): The affine matrix. Returns: matrix (np.ndarray[..., 2]): Result coordinate of joints. """ joints = np.array(joints) shape = joints.shape joints = joints.reshape(-1, 2) return np.dot(np.concatenate( (joints, joints[:, 0:1] * 0 + 1), axis=1), mat.T).reshape(shape) def affine_transform(pt, t): new_pt = np.array([pt[0], pt[1], 1.]).T new_pt = np.dot(t, new_pt) return new_pt[:2] def transform_preds(coords, center, scale, output_size): target_coords = np.zeros(coords.shape) trans = get_affine_transform(center, scale * 200, 0, output_size, inv=1) for p in range(coords.shape[0]): target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans) return target_coords def oks_iou(g, d, a_g, a_d, sigmas=None, in_vis_thre=None): if not isinstance(sigmas, np.ndarray): sigmas = np.array([ .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89 ]) / 10.0 vars = (sigmas * 2)**2 xg = g[0::3] yg = g[1::3] vg = g[2::3] ious = np.zeros((d.shape[0])) for n_d in range(0, d.shape[0]): xd = d[n_d, 0::3] yd = d[n_d, 1::3] vd = d[n_d, 2::3] dx = xd - xg dy = yd - yg e = (dx**2 + dy**2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2 if in_vis_thre is not None: ind = list(vg > in_vis_thre) and list(vd > in_vis_thre) e = e[ind] ious[n_d] = np.sum(np.exp(-e)) / e.shape[0] if e.shape[0] != 0 else 0.0 return ious def oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None): """greedily select boxes with high confidence and overlap with current maximum <= thresh rule out overlap >= thresh Args: kpts_db (list): The predicted keypoints within the image thresh (float): The threshold to select the boxes sigmas (np.array): The variance to calculate the oks iou Default: None in_vis_thre (float): The threshold to select the high confidence boxes Default: None Return: keep (list): indexes to keep """ if len(kpts_db) == 0: return [] scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))]) kpts = np.array( [kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))]) areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))]) order = scores.argsort()[::-1] keep = [] while order.size > 0: i = order[0] keep.append(i) oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], sigmas, in_vis_thre) inds = np.where(oks_ovr <= thresh)[0] order = order[inds + 1] return keep def rescore(overlap, scores, thresh, type='gaussian'): assert overlap.shape[0] == scores.shape[0] if type == 'linear': inds = np.where(overlap >= thresh)[0] scores[inds] = scores[inds] * (1 - overlap[inds]) else: scores = scores * np.exp(-overlap**2 / thresh) return scores def soft_oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None): """greedily select boxes with high confidence and overlap with current maximum <= thresh rule out overlap >= thresh Args: kpts_db (list): The predicted keypoints within the image thresh (float): The threshold to select the boxes sigmas (np.array): The variance to calculate the oks iou Default: None in_vis_thre (float): The threshold to select the high 
            confidence boxes
            Default: None
    Return:
        keep (list): indexes to keep
    """
    if len(kpts_db) == 0:
        return []

    scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))])
    kpts = np.array(
        [kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))])
    areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))])

    order = scores.argsort()[::-1]
    scores = scores[order]

    # max_dets = order.size
    max_dets = 20
    keep = np.zeros(max_dets, dtype=np.intp)
    keep_cnt = 0
    while order.size > 0 and keep_cnt < max_dets:
        i = order[0]
        oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]],
                          sigmas, in_vis_thre)
        order = order[1:]
        scores = rescore(oks_ovr, scores[1:], thresh)

        tmp = scores.argsort()[::-1]
        order = order[tmp]
        scores = scores[tmp]

        keep[keep_cnt] = i
        keep_cnt += 1
    keep = keep[:keep_cnt]
    return keep


def resize(input,
           size=None,
           scale_factor=None,
           mode='nearest',
           align_corners=None,
           warning=True):
    if warning:
        if size is not None and align_corners:
            input_h, input_w = tuple(int(x) for x in input.shape[2:])
            output_h, output_w = tuple(int(x) for x in size)
            if output_h > input_h or output_w > input_w:
                if ((output_h > 1 and output_w > 1 and input_h > 1 and
                     input_w > 1) and (output_h - 1) % (input_h - 1) and
                        (output_w - 1) % (input_w - 1)):
                    warnings.warn(
                        f'When align_corners={align_corners}, '
                        'the output would be more aligned if '
                        f'input size {(input_h, input_w)} is `x+1` and '
                        f'out size {(output_h, output_w)} is `nx+1`')
    return F.interpolate(input, size, scale_factor, mode, align_corners)


def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'):
    """Flip the flipped heatmaps back to the original form.
    Note:
        - batch_size: N
        - num_keypoints: K
        - heatmap height: H
        - heatmap width: W
    Args:
        output_flipped (np.ndarray[N, K, H, W]): The output heatmaps obtained
            from the flipped images.
        flip_pairs (list[tuple]): Pairs of keypoints which are mirrored
            (for example, left ear -- right ear).
        target_type (str): GaussianHeatmap or CombinedTarget
    Returns:
        np.ndarray: heatmaps that flipped back to the original image
    """
    assert len(output_flipped.shape) == 4, \
        'output_flipped should be [batch_size, num_keypoints, height, width]'
    shape_ori = output_flipped.shape
    channels = 1
    if target_type.lower() == 'CombinedTarget'.lower():
        channels = 3
        output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]
    output_flipped = output_flipped.reshape((shape_ori[0], -1, channels,
                                             shape_ori[2], shape_ori[3]))
    output_flipped_back = output_flipped.clone()

    # Swap left-right parts
    for left, right in flip_pairs:
        output_flipped_back[:, left, ...] = output_flipped[:, right, ...]
        output_flipped_back[:, right, ...] = output_flipped[:, left, ...]
    output_flipped_back = output_flipped_back.reshape(shape_ori)
    # Flip horizontally
    output_flipped_back = output_flipped_back[..., ::-1]
    return output_flipped_back


def _calc_distances(preds, targets, mask, normalize):
    """Calculate the normalized distances between preds and target.
    Note:
        batch_size: N
        num_keypoints: K
        dimension of keypoints: D (normally, D=2 or D=3)
    Args:
        preds (np.ndarray[N, K, D]): Predicted keypoint location.
        targets (np.ndarray[N, K, D]): Groundtruth keypoint location.
        mask (np.ndarray[N, K]): Visibility of the target. False for invisible
            joints, and True for visible. Invisible joints will be ignored for
            accuracy calculation.
        normalize (np.ndarray[N, D]): Typical value is heatmap_size
    Returns:
        np.ndarray[K, N]: The normalized distances. \
            If target keypoints are missing, the distance is -1.
""" N, K, _ = preds.shape # set mask=0 when normalize==0 _mask = mask.copy() _mask[np.where((normalize == 0).sum(1))[0], :] = False distances = np.full((N, K), -1, dtype=np.float32) # handle invalid values normalize[np.where(normalize <= 0)] = 1e6 distances[_mask] = np.linalg.norm( ((preds - targets) / normalize[:, None, :])[_mask], axis=-1) return distances.T def _distance_acc(distances, thr=0.5): """Return the percentage below the distance threshold, while ignoring distances values with -1. Note: batch_size: N Args: distances (np.ndarray[N, ]): The normalized distances. thr (float): Threshold of the distances. Returns: float: Percentage of distances below the threshold. \ If all target keypoints are missing, return -1. """ distance_valid = distances != -1 num_distance_valid = distance_valid.sum() if num_distance_valid > 0: return (distances[distance_valid] < thr).sum() / num_distance_valid return -1 def keypoint_pck_accuracy(pred, gt, mask, thr, normalize): """Calculate the pose accuracy of PCK for each individual keypoint and the averaged accuracy across all keypoints for coordinates. Note: PCK metric measures accuracy of the localization of the body joints. The distances between predicted positions and the ground-truth ones are typically normalized by the bounding box size. The threshold (thr) of the normalized distance is commonly set as 0.05, 0.1 or 0.2 etc. - batch_size: N - num_keypoints: K Args: pred (np.ndarray[N, K, 2]): Predicted keypoint location. gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. mask (np.ndarray[N, K]): Visibility of the target. False for invisible joints, and True for visible. Invisible joints will be ignored for accuracy calculation. thr (float): Threshold of PCK calculation. normalize (np.ndarray[N, 2]): Normalization factor for H&W. Returns: tuple: A tuple containing keypoint accuracy. - acc (np.ndarray[K]): Accuracy of each keypoint. - avg_acc (float): Averaged accuracy across all keypoints. - cnt (int): Number of valid keypoints. """ distances = _calc_distances(pred, gt, mask, normalize) acc = np.array([_distance_acc(d, thr) for d in distances]) valid_acc = acc[acc >= 0] cnt = len(valid_acc) avg_acc = valid_acc.mean() if cnt > 0 else 0 return acc, avg_acc, cnt def keypoint_auc(pred, gt, mask, normalize, num_step=20): """Calculate the pose accuracy of PCK for each individual keypoint and the averaged accuracy across all keypoints for coordinates. Note: - batch_size: N - num_keypoints: K Args: pred (np.ndarray[N, K, 2]): Predicted keypoint location. gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. mask (np.ndarray[N, K]): Visibility of the target. False for invisible joints, and True for visible. Invisible joints will be ignored for accuracy calculation. normalize (float): Normalization factor. Returns: float: Area under curve. """ nor = np.tile(np.array([[normalize, normalize]]), (pred.shape[0], 1)) x = [1.0 * i / num_step for i in range(num_step)] y = [] for thr in x: _, avg_acc, _ = keypoint_pck_accuracy(pred, gt, mask, thr, nor) y.append(avg_acc) auc = 0 for i in range(num_step): auc += 1.0 / num_step * y[i] return auc def keypoint_epe(pred, gt, mask): """Calculate the end-point error. Note: - batch_size: N - num_keypoints: K Args: pred (np.ndarray[N, K, 2]): Predicted keypoint location. gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. mask (np.ndarray[N, K]): Visibility of the target. False for invisible joints, and True for visible. Invisible joints will be ignored for accuracy calculation. 
Returns: float: Average end-point error. """ normalize = np.ones((pred.shape[0], pred.shape[2]), dtype=np.float32) distances = _calc_distances(pred, gt, mask, normalize) distance_valid = distances[distances != -1] return distance_valid.sum() / max(1, len(distance_valid)) ================================================ FILE: ppdet/modeling/lane_utils.py ================================================ import os import cv2 import numpy as np from scipy.interpolate import InterpolatedUnivariateSpline class Lane: def __init__(self, points=None, invalid_value=-2., metadata=None): super(Lane, self).__init__() self.curr_iter = 0 self.points = points self.invalid_value = invalid_value self.function = InterpolatedUnivariateSpline( points[:, 1], points[:, 0], k=min(3, len(points) - 1)) self.min_y = points[:, 1].min() - 0.01 self.max_y = points[:, 1].max() + 0.01 self.metadata = metadata or {} def __repr__(self): return '[Lane]\n' + str(self.points) + '\n[/Lane]' def __call__(self, lane_ys): lane_xs = self.function(lane_ys) lane_xs[(lane_ys < self.min_y) | (lane_ys > self.max_y )] = self.invalid_value return lane_xs def to_array(self, sample_y_range, img_w, img_h): self.sample_y = range(sample_y_range[0], sample_y_range[1], sample_y_range[2]) sample_y = self.sample_y img_w, img_h = img_w, img_h ys = np.array(sample_y) / float(img_h) xs = self(ys) valid_mask = (xs >= 0) & (xs < 1) lane_xs = xs[valid_mask] * img_w lane_ys = ys[valid_mask] * img_h lane = np.concatenate( (lane_xs.reshape(-1, 1), lane_ys.reshape(-1, 1)), axis=1) return lane def __iter__(self): return self def __next__(self): if self.curr_iter < len(self.points): self.curr_iter += 1 return self.points[self.curr_iter - 1] self.curr_iter = 0 raise StopIteration COLORS = [ (255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255), (0, 255, 255), (128, 255, 0), (255, 128, 0), (128, 0, 255), (255, 0, 128), (0, 128, 255), (0, 255, 128), (128, 255, 255), (255, 128, 255), (255, 255, 128), (60, 180, 0), (180, 60, 0), (0, 60, 180), (0, 180, 60), (60, 0, 180), (180, 0, 60), (255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255), (0, 255, 255), (128, 255, 0), (255, 128, 0), (128, 0, 255), ] def imshow_lanes(img, lanes, show=False, out_file=None, width=4): lanes_xys = [] for _, lane in enumerate(lanes): xys = [] for x, y in lane: if x <= 0 or y <= 0: continue x, y = int(x), int(y) xys.append((x, y)) lanes_xys.append(xys) lanes_xys.sort(key=lambda xys: xys[0][0] if len(xys) > 0 else 0) for idx, xys in enumerate(lanes_xys): for i in range(1, len(xys)): cv2.line(img, xys[i - 1], xys[i], COLORS[idx], thickness=width) if show: cv2.imshow('view', img) cv2.waitKey(0) if out_file: if not os.path.exists(os.path.dirname(out_file)): os.makedirs(os.path.dirname(out_file)) cv2.imwrite(out_file, img) ================================================ FILE: ppdet/modeling/layers.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
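# --- Usage sketch for lane_utils.Lane defined above (not original code) ---
# Lane fits an interpolating spline x = f(y) over points normalized to [0, 1]
# and can be resampled at fixed pixel rows, e.g. for a 1280x720 image:
#
#   import numpy as np
#   pts = np.array([[0.40, 0.55], [0.42, 0.70], [0.45, 0.85]])  # (x, y)
#   lane = Lane(points=pts)
#   xy = lane.to_array(sample_y_range=(620, 390, -10), img_w=1280, img_h=720)
#   # xy holds (x, y) pixel coordinates for rows where the spline stays in [0, 1)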
import math import six import numpy as np from numbers import Integral import paddle import paddle.nn as nn from paddle import ParamAttr from paddle import to_tensor import paddle.nn.functional as F from paddle.nn.initializer import Normal, Constant, XavierUniform from paddle.regularizer import L2Decay from ppdet.core.workspace import register, serializable from ppdet.modeling.bbox_utils import delta2bbox from . import ops from .initializer import xavier_uniform_, constant_ from paddle.vision.ops import DeformConv2D def _to_list(l): if isinstance(l, (list, tuple)): return list(l) return [l] class AlignConv(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size=3, groups=1): super(AlignConv, self).__init__() self.kernel_size = kernel_size self.align_conv = paddle.vision.ops.DeformConv2D( in_channels, out_channels, kernel_size=self.kernel_size, padding=(self.kernel_size - 1) // 2, groups=groups, weight_attr=ParamAttr(initializer=Normal(0, 0.01)), bias_attr=None) @paddle.no_grad() def get_offset(self, anchors, featmap_size, stride): """ Args: anchors: [B, L, 5] xc,yc,w,h,angle featmap_size: (feat_h, feat_w) stride: 8 Returns: """ batch = anchors.shape[0] dtype = anchors.dtype feat_h, feat_w = featmap_size pad = (self.kernel_size - 1) // 2 idx = paddle.arange(-pad, pad + 1, dtype=dtype) yy, xx = paddle.meshgrid(idx, idx) xx = paddle.reshape(xx, [-1]) yy = paddle.reshape(yy, [-1]) # get sampling locations of default conv xc = paddle.arange(0, feat_w, dtype=dtype) yc = paddle.arange(0, feat_h, dtype=dtype) yc, xc = paddle.meshgrid(yc, xc) xc = paddle.reshape(xc, [-1, 1]) yc = paddle.reshape(yc, [-1, 1]) x_conv = xc + xx y_conv = yc + yy # get sampling locations of anchors x_ctr, y_ctr, w, h, a = paddle.split(anchors, 5, axis=-1) x_ctr = x_ctr / stride y_ctr = y_ctr / stride w_s = w / stride h_s = h / stride cos, sin = paddle.cos(a), paddle.sin(a) dw, dh = w_s / self.kernel_size, h_s / self.kernel_size x, y = dw * xx, dh * yy xr = cos * x - sin * y yr = sin * x + cos * y x_anchor, y_anchor = xr + x_ctr, yr + y_ctr # get offset filed offset_x = x_anchor - x_conv offset_y = y_anchor - y_conv offset = paddle.stack([offset_y, offset_x], axis=-1) offset = offset.reshape( [batch, feat_h, feat_w, self.kernel_size * self.kernel_size * 2]) offset = offset.transpose([0, 3, 1, 2]) return offset def forward(self, x, refine_anchors, featmap_size, stride): batch = x.shape[0].numpy() offset = self.get_offset(refine_anchors, featmap_size, stride) if self.training: x = F.relu(self.align_conv(x, offset.detach())) else: x = F.relu(self.align_conv(x, offset)) return x class DeformableConvV2(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, weight_attr=None, bias_attr=None, lr_scale=1, regularizer=None, skip_quant=False, dcn_bias_regularizer=L2Decay(0.), dcn_bias_lr_scale=2.): super(DeformableConvV2, self).__init__() self.offset_channel = 2 * kernel_size**2 self.mask_channel = kernel_size**2 if lr_scale == 1 and regularizer is None: offset_bias_attr = ParamAttr(initializer=Constant(0.)) else: offset_bias_attr = ParamAttr( initializer=Constant(0.), learning_rate=lr_scale, regularizer=regularizer) self.conv_offset = nn.Conv2D( in_channels, 3 * kernel_size**2, kernel_size, stride=stride, padding=(kernel_size - 1) // 2, weight_attr=ParamAttr(initializer=Constant(0.0)), bias_attr=offset_bias_attr) if skip_quant: self.conv_offset.skip_quant = True if bias_attr: # in FCOS-DCN head, specifically need learning_rate and regularizer dcn_bias_attr = 
ParamAttr( initializer=Constant(value=0), regularizer=dcn_bias_regularizer, learning_rate=dcn_bias_lr_scale) else: # in ResNet backbone, do not need bias dcn_bias_attr = False self.conv_dcn = DeformConv2D( in_channels, out_channels, kernel_size, stride=stride, padding=(kernel_size - 1) // 2 * dilation, dilation=dilation, groups=groups, weight_attr=weight_attr, bias_attr=dcn_bias_attr) def forward(self, x): offset_mask = self.conv_offset(x) offset, mask = paddle.split( offset_mask, num_or_sections=[self.offset_channel, self.mask_channel], axis=1) mask = F.sigmoid(mask) y = self.conv_dcn(x, offset, mask=mask) return y class ConvNormLayer(nn.Layer): def __init__(self, ch_in, ch_out, filter_size, stride, groups=1, norm_type='bn', norm_decay=0., norm_groups=32, use_dcn=False, bias_on=False, lr_scale=1., freeze_norm=False, initializer=Normal( mean=0., std=0.01), skip_quant=False, dcn_lr_scale=2., dcn_regularizer=L2Decay(0.)): super(ConvNormLayer, self).__init__() assert norm_type in ['bn', 'sync_bn', 'gn', None] if bias_on: bias_attr = ParamAttr( initializer=Constant(value=0.), learning_rate=lr_scale) else: bias_attr = False if not use_dcn: self.conv = nn.Conv2D( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=groups, weight_attr=ParamAttr( initializer=initializer, learning_rate=1.), bias_attr=bias_attr) if skip_quant: self.conv.skip_quant = True else: # in FCOS-DCN head, specifically need learning_rate and regularizer self.conv = DeformableConvV2( in_channels=ch_in, out_channels=ch_out, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=groups, weight_attr=ParamAttr( initializer=initializer, learning_rate=1.), bias_attr=True, lr_scale=dcn_lr_scale, regularizer=dcn_regularizer, dcn_bias_regularizer=dcn_regularizer, dcn_bias_lr_scale=dcn_lr_scale, skip_quant=skip_quant) norm_lr = 0. if freeze_norm else 1. 
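# freeze_norm stops training of the following norm layer's affine parameters
# by zeroing their learning rate (for BN the running statistics still update
# in train mode); norm_decay sets the L2 regularization applied to them.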
param_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay) if norm_decay is not None else None) bias_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay) if norm_decay is not None else None) if norm_type in ['bn', 'sync_bn']: self.norm = nn.BatchNorm2D( ch_out, weight_attr=param_attr, bias_attr=bias_attr) elif norm_type == 'gn': self.norm = nn.GroupNorm( num_groups=norm_groups, num_channels=ch_out, weight_attr=param_attr, bias_attr=bias_attr) else: self.norm = None def forward(self, inputs): out = self.conv(inputs) if self.norm is not None: out = self.norm(out) return out class LiteConv(nn.Layer): def __init__(self, in_channels, out_channels, stride=1, with_act=True, norm_type='sync_bn', name=None): super(LiteConv, self).__init__() self.lite_conv = nn.Sequential() conv1 = ConvNormLayer( in_channels, in_channels, filter_size=5, stride=stride, groups=in_channels, norm_type=norm_type, initializer=XavierUniform()) conv2 = ConvNormLayer( in_channels, out_channels, filter_size=1, stride=stride, norm_type=norm_type, initializer=XavierUniform()) conv3 = ConvNormLayer( out_channels, out_channels, filter_size=1, stride=stride, norm_type=norm_type, initializer=XavierUniform()) conv4 = ConvNormLayer( out_channels, out_channels, filter_size=5, stride=stride, groups=out_channels, norm_type=norm_type, initializer=XavierUniform()) conv_list = [conv1, conv2, conv3, conv4] self.lite_conv.add_sublayer('conv1', conv1) self.lite_conv.add_sublayer('relu6_1', nn.ReLU6()) self.lite_conv.add_sublayer('conv2', conv2) if with_act: self.lite_conv.add_sublayer('relu6_2', nn.ReLU6()) self.lite_conv.add_sublayer('conv3', conv3) self.lite_conv.add_sublayer('relu6_3', nn.ReLU6()) self.lite_conv.add_sublayer('conv4', conv4) if with_act: self.lite_conv.add_sublayer('relu6_4', nn.ReLU6()) def forward(self, inputs): out = self.lite_conv(inputs) return out class DropBlock(nn.Layer): def __init__(self, block_size, keep_prob, name=None, data_format='NCHW'): """ DropBlock layer, see https://arxiv.org/abs/1810.12890 Args: block_size (int): block size keep_prob (int): keep probability name (str): layer name data_format (str): data format, NCHW or NHWC """ super(DropBlock, self).__init__() self.block_size = block_size self.keep_prob = keep_prob self.name = name self.data_format = data_format def forward(self, x): if not self.training or self.keep_prob == 1: return x else: gamma = (1. - self.keep_prob) / (self.block_size**2) if self.data_format == 'NCHW': shape = x.shape[2:] else: shape = x.shape[1:3] for s in shape: gamma *= s / (s - self.block_size + 1) matrix = paddle.cast(paddle.rand(x.shape) < gamma, x.dtype) mask_inv = F.max_pool2d( matrix, self.block_size, stride=1, padding=self.block_size // 2, data_format=self.data_format) mask = 1. 
- mask_inv mask = mask.astype('float32') x = x.astype('float32') y = x * mask * (mask.numel() / mask.sum()) return y @register @serializable class AnchorGeneratorSSD(object): def __init__(self, steps=[8, 16, 32, 64, 100, 300], aspect_ratios=[[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]], min_ratio=15, max_ratio=90, base_size=300, min_sizes=[30.0, 60.0, 111.0, 162.0, 213.0, 264.0], max_sizes=[60.0, 111.0, 162.0, 213.0, 264.0, 315.0], offset=0.5, flip=True, clip=False, min_max_aspect_ratios_order=False): self.steps = steps self.aspect_ratios = aspect_ratios self.min_ratio = min_ratio self.max_ratio = max_ratio self.base_size = base_size self.min_sizes = min_sizes self.max_sizes = max_sizes self.offset = offset self.flip = flip self.clip = clip self.min_max_aspect_ratios_order = min_max_aspect_ratios_order if self.min_sizes == [] and self.max_sizes == []: num_layer = len(aspect_ratios) step = int( math.floor(((self.max_ratio - self.min_ratio)) / (num_layer - 2 ))) for ratio in six.moves.range(self.min_ratio, self.max_ratio + 1, step): self.min_sizes.append(self.base_size * ratio / 100.) self.max_sizes.append(self.base_size * (ratio + step) / 100.) self.min_sizes = [self.base_size * .10] + self.min_sizes self.max_sizes = [self.base_size * .20] + self.max_sizes self.num_priors = [] for aspect_ratio, min_size, max_size in zip( aspect_ratios, self.min_sizes, self.max_sizes): if isinstance(min_size, (list, tuple)): self.num_priors.append( len(_to_list(min_size)) + len(_to_list(max_size))) else: self.num_priors.append((len(aspect_ratio) * 2 + 1) * len( _to_list(min_size)) + len(_to_list(max_size))) def __call__(self, inputs, image): boxes = [] for input, min_size, max_size, aspect_ratio, step in zip( inputs, self.min_sizes, self.max_sizes, self.aspect_ratios, self.steps): box, _ = ops.prior_box( input=input, image=image, min_sizes=_to_list(min_size), max_sizes=_to_list(max_size), aspect_ratios=aspect_ratio, flip=self.flip, clip=self.clip, steps=[step, step], offset=self.offset, min_max_aspect_ratios_order=self.min_max_aspect_ratios_order) boxes.append(paddle.reshape(box, [-1, 4])) return boxes @register @serializable class RCNNBox(object): __shared__ = ['num_classes', 'export_onnx'] def __init__(self, prior_box_var=[10., 10., 5., 5.], code_type="decode_center_size", box_normalized=False, num_classes=80, export_onnx=False): super(RCNNBox, self).__init__() self.prior_box_var = prior_box_var self.code_type = code_type self.box_normalized = box_normalized self.num_classes = num_classes self.export_onnx = export_onnx def __call__(self, bbox_head_out, rois, im_shape, scale_factor): bbox_pred = bbox_head_out[0] cls_prob = bbox_head_out[1] roi = rois[0] rois_num = rois[1] if self.export_onnx: onnx_rois_num_per_im = rois_num[0] origin_shape = paddle.expand(im_shape[0, :], [onnx_rois_num_per_im, 2]) else: origin_shape_list = [] if isinstance(roi, list): batch_size = len(roi) else: batch_size = paddle.slice(paddle.shape(im_shape), [0], [0], [1]) # bbox_pred.shape: [N, C*4] for idx in range(batch_size): rois_num_per_im = rois_num[idx] expand_im_shape = paddle.expand(im_shape[idx, :], [rois_num_per_im, 2]) origin_shape_list.append(expand_im_shape) origin_shape = paddle.concat(origin_shape_list) # bbox_pred.shape: [N, C*4] # C=num_classes in faster/mask rcnn(bbox_head), C=1 in cascade rcnn(cascade_head) bbox = paddle.concat(roi) bbox = delta2bbox(bbox_pred, bbox, self.prior_box_var) scores = cls_prob[:, :-1] # bbox.shape: [N, C, 4] # bbox.shape[1] must be equal to scores.shape[1] total_num = bbox.shape[0] 
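# Broadcast the decoded boxes so there is one box per class (a no-op when the
# head already predicts class-specific boxes), then clamp each coordinate to
# the original image rectangle below.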
bbox_dim = bbox.shape[-1] bbox = paddle.expand(bbox, [total_num, self.num_classes, bbox_dim]) origin_h = paddle.unsqueeze(origin_shape[:, 0], axis=1) origin_w = paddle.unsqueeze(origin_shape[:, 1], axis=1) zeros = paddle.zeros_like(origin_h) x1 = paddle.maximum(paddle.minimum(bbox[:, :, 0], origin_w), zeros) y1 = paddle.maximum(paddle.minimum(bbox[:, :, 1], origin_h), zeros) x2 = paddle.maximum(paddle.minimum(bbox[:, :, 2], origin_w), zeros) y2 = paddle.maximum(paddle.minimum(bbox[:, :, 3], origin_h), zeros) bbox = paddle.stack([x1, y1, x2, y2], axis=-1) bboxes = (bbox, rois_num) return bboxes, scores @register @serializable class MultiClassNMS(object): def __init__(self, score_threshold=.05, nms_top_k=-1, keep_top_k=100, nms_threshold=.5, normalized=True, nms_eta=1.0, return_index=False, return_rois_num=True, trt=False, cpu=False): super(MultiClassNMS, self).__init__() self.score_threshold = score_threshold self.nms_top_k = nms_top_k self.keep_top_k = keep_top_k self.nms_threshold = nms_threshold self.normalized = normalized self.nms_eta = nms_eta self.return_index = return_index self.return_rois_num = return_rois_num self.trt = trt self.cpu = cpu def __call__(self, bboxes, score, background_label=-1): """ bboxes (Tensor|List[Tensor]): 1. (Tensor) Predicted bboxes with shape [N, M, 4], N is the batch size and M is the number of bboxes 2. (List[Tensor]) bboxes and bbox_num, bboxes have shape of [M, C, 4], C is the class number and bbox_num means the number of bboxes of each batch with shape [N,] score (Tensor): Predicted scores with shape [N, C, M] or [M, C] background_label (int): Ignore the background label; For example, RCNN is num_classes and YOLO is -1. """ kwargs = self.__dict__.copy() if isinstance(bboxes, tuple): bboxes, bbox_num = bboxes kwargs.update({'rois_num': bbox_num}) if background_label > -1: kwargs.update({'background_label': background_label}) kwargs.pop('trt') kwargs.pop('cpu') # TODO(wangxinxin08): paddle version should be develop or 2.3 and above to run nms on tensorrt if self.trt and (int(paddle.version.major) == 0 or (int(paddle.version.major) >= 2 and int(paddle.version.minor) >= 3)): # TODO(wangxinxin08): tricky switch to run nms on tensorrt kwargs.update({'nms_eta': 1.1}) bbox, bbox_num, _ = ops.multiclass_nms(bboxes, score, **kwargs) bbox = bbox.reshape([1, -1, 6]) idx = paddle.nonzero(bbox[..., 0] != -1) bbox = paddle.gather_nd(bbox, idx) return bbox, bbox_num, None else: if self.cpu: device = paddle.device.get_device() paddle.set_device('cpu') outputs = ops.multiclass_nms(bboxes, score, **kwargs) paddle.set_device(device) return outputs else: return ops.multiclass_nms(bboxes, score, **kwargs) @register @serializable class MatrixNMS(object): __append_doc__ = True def __init__(self, score_threshold=.05, post_threshold=.05, nms_top_k=-1, keep_top_k=100, use_gaussian=False, gaussian_sigma=2., normalized=False, background_label=0): super(MatrixNMS, self).__init__() self.score_threshold = score_threshold self.post_threshold = post_threshold self.nms_top_k = nms_top_k self.keep_top_k = keep_top_k self.normalized = normalized self.use_gaussian = use_gaussian self.gaussian_sigma = gaussian_sigma self.background_label = background_label def __call__(self, bbox, score, *args): return ops.matrix_nms( bboxes=bbox, scores=score, score_threshold=self.score_threshold, post_threshold=self.post_threshold, nms_top_k=self.nms_top_k, keep_top_k=self.keep_top_k, use_gaussian=self.use_gaussian, gaussian_sigma=self.gaussian_sigma, background_label=self.background_label, 
normalized=self.normalized) @register @serializable class YOLOBox(object): __shared__ = ['num_classes'] def __init__(self, num_classes=80, conf_thresh=0.005, downsample_ratio=32, clip_bbox=True, scale_x_y=1.): self.num_classes = num_classes self.conf_thresh = conf_thresh self.downsample_ratio = downsample_ratio self.clip_bbox = clip_bbox self.scale_x_y = scale_x_y def __call__(self, yolo_head_out, anchors, im_shape, scale_factor, var_weight=None): boxes_list = [] scores_list = [] origin_shape = im_shape / scale_factor origin_shape = paddle.cast(origin_shape, 'int32') for i, head_out in enumerate(yolo_head_out): boxes, scores = paddle.vision.ops.yolo_box( head_out, origin_shape, anchors[i], self.num_classes, self.conf_thresh, self.downsample_ratio // 2**i, self.clip_bbox, scale_x_y=self.scale_x_y) boxes_list.append(boxes) scores_list.append(paddle.transpose(scores, perm=[0, 2, 1])) yolo_boxes = paddle.concat(boxes_list, axis=1) yolo_scores = paddle.concat(scores_list, axis=2) return yolo_boxes, yolo_scores @register @serializable class SSDBox(object): def __init__(self, is_normalized=True, prior_box_var=[0.1, 0.1, 0.2, 0.2], use_fuse_decode=False): self.is_normalized = is_normalized self.norm_delta = float(not self.is_normalized) self.prior_box_var = prior_box_var self.use_fuse_decode = use_fuse_decode def __call__(self, preds, prior_boxes, im_shape, scale_factor, var_weight=None): boxes, scores = preds boxes = paddle.concat(boxes, axis=1) prior_boxes = paddle.concat(prior_boxes) if self.use_fuse_decode: output_boxes = ops.box_coder( prior_boxes, self.prior_box_var, boxes, code_type="decode_center_size", box_normalized=self.is_normalized) else: pb_w = prior_boxes[:, 2] - prior_boxes[:, 0] + self.norm_delta pb_h = prior_boxes[:, 3] - prior_boxes[:, 1] + self.norm_delta pb_x = prior_boxes[:, 0] + pb_w * 0.5 pb_y = prior_boxes[:, 1] + pb_h * 0.5 out_x = pb_x + boxes[:, :, 0] * pb_w * self.prior_box_var[0] out_y = pb_y + boxes[:, :, 1] * pb_h * self.prior_box_var[1] out_w = paddle.exp(boxes[:, :, 2] * self.prior_box_var[2]) * pb_w out_h = paddle.exp(boxes[:, :, 3] * self.prior_box_var[3]) * pb_h output_boxes = paddle.stack( [ out_x - out_w / 2., out_y - out_h / 2., out_x + out_w / 2., out_y + out_h / 2. ], axis=-1) if self.is_normalized: h = (im_shape[:, 0] / scale_factor[:, 0]).unsqueeze(-1) w = (im_shape[:, 1] / scale_factor[:, 1]).unsqueeze(-1) im_shape = paddle.stack([w, h, w, h], axis=-1) output_boxes *= im_shape else: output_boxes[..., -2:] -= 1.0 output_scores = F.softmax(paddle.concat( scores, axis=1)).transpose([0, 2, 1]) return output_boxes, output_scores @register class TTFBox(object): __shared__ = ['down_ratio'] def __init__(self, max_per_img=100, score_thresh=0.01, down_ratio=4): super(TTFBox, self).__init__() self.max_per_img = max_per_img self.score_thresh = score_thresh self.down_ratio = down_ratio def _simple_nms(self, heat, kernel=3): """ Use maxpool to filter the max score, get local peaks. """ pad = (kernel - 1) // 2 hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad) keep = paddle.cast(hmax == heat, 'float32') return heat * keep def _topk(self, scores): """ Select top k scores and decode to get xy coordinates. 
""" k = self.max_per_img shape_fm = paddle.shape(scores) shape_fm.stop_gradient = True cat, height, width = shape_fm[1], shape_fm[2], shape_fm[3] # batch size is 1 scores_r = paddle.reshape(scores, [cat, -1]) topk_scores, topk_inds = paddle.topk(scores_r, k) topk_ys = topk_inds // width topk_xs = topk_inds % width topk_score_r = paddle.reshape(topk_scores, [-1]) topk_score, topk_ind = paddle.topk(topk_score_r, k) k_t = paddle.full(topk_ind.shape, k, dtype='int64') topk_clses = paddle.cast(paddle.floor_divide(topk_ind, k_t), 'float32') topk_inds = paddle.reshape(topk_inds, [-1]) topk_ys = paddle.reshape(topk_ys, [-1, 1]) topk_xs = paddle.reshape(topk_xs, [-1, 1]) topk_inds = paddle.gather(topk_inds, topk_ind) topk_ys = paddle.gather(topk_ys, topk_ind) topk_xs = paddle.gather(topk_xs, topk_ind) return topk_score, topk_inds, topk_clses, topk_ys, topk_xs def _decode(self, hm, wh, im_shape, scale_factor): heatmap = F.sigmoid(hm) heat = self._simple_nms(heatmap) scores, inds, clses, ys, xs = self._topk(heat) ys = paddle.cast(ys, 'float32') * self.down_ratio xs = paddle.cast(xs, 'float32') * self.down_ratio scores = paddle.tensor.unsqueeze(scores, [1]) clses = paddle.tensor.unsqueeze(clses, [1]) wh_t = paddle.transpose(wh, [0, 2, 3, 1]) wh = paddle.reshape(wh_t, [-1, wh_t.shape[-1]]) wh = paddle.gather(wh, inds) x1 = xs - wh[:, 0:1] y1 = ys - wh[:, 1:2] x2 = xs + wh[:, 2:3] y2 = ys + wh[:, 3:4] bboxes = paddle.concat([x1, y1, x2, y2], axis=1) scale_y = scale_factor[:, 0:1] scale_x = scale_factor[:, 1:2] scale_expand = paddle.concat( [scale_x, scale_y, scale_x, scale_y], axis=1) boxes_shape = paddle.shape(bboxes) boxes_shape.stop_gradient = True scale_expand = paddle.expand(scale_expand, shape=boxes_shape) bboxes = paddle.divide(bboxes, scale_expand) results = paddle.concat([clses, scores, bboxes], axis=1) # hack: append result with cls=-1 and score=1. to avoid all scores # are less than score_thresh which may cause error in gather. 
fill_r = paddle.to_tensor(np.array([[-1, 1, 0, 0, 0, 0]])) fill_r = paddle.cast(fill_r, results.dtype) results = paddle.concat([results, fill_r]) scores = results[:, 1] valid_ind = paddle.nonzero(scores > self.score_thresh) results = paddle.gather(results, valid_ind) return results, results.shape[0:1] def __call__(self, hm, wh, im_shape, scale_factor): results = [] results_num = [] for i in range(scale_factor.shape[0]): result, num = self._decode(hm[i:i + 1, ], wh[i:i + 1, ], im_shape[i:i + 1, ], scale_factor[i:i + 1, ]) results.append(result) results_num.append(num) results = paddle.concat(results, axis=0) results_num = paddle.concat(results_num, axis=0) return results, results_num @register @serializable class JDEBox(object): __shared__ = ['num_classes'] def __init__(self, num_classes=1, conf_thresh=0.3, downsample_ratio=32): self.num_classes = num_classes self.conf_thresh = conf_thresh self.downsample_ratio = downsample_ratio def generate_anchor(self, nGh, nGw, anchor_wh): nA = len(anchor_wh) yv, xv = paddle.meshgrid([paddle.arange(nGh), paddle.arange(nGw)]) mesh = paddle.stack( (xv, yv), axis=0).cast(dtype='float32') # 2 x nGh x nGw meshs = paddle.tile(mesh, [nA, 1, 1, 1]) anchor_offset_mesh = anchor_wh[:, :, None][:, :, :, None].repeat( int(nGh), axis=-2).repeat( int(nGw), axis=-1) anchor_offset_mesh = paddle.to_tensor( anchor_offset_mesh.astype(np.float32)) # nA x 2 x nGh x nGw anchor_mesh = paddle.concat([meshs, anchor_offset_mesh], axis=1) anchor_mesh = paddle.transpose(anchor_mesh, [0, 2, 3, 1]) # (nA x nGh x nGw) x 4 return anchor_mesh def decode_delta(self, delta, fg_anchor_list): px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ fg_anchor_list[:, 2], fg_anchor_list[:,3] dx, dy, dw, dh = delta[:, 0], delta[:, 1], delta[:, 2], delta[:, 3] gx = pw * dx + px gy = ph * dy + py gw = pw * paddle.exp(dw) gh = ph * paddle.exp(dh) gx1 = gx - gw * 0.5 gy1 = gy - gh * 0.5 gx2 = gx + gw * 0.5 gy2 = gy + gh * 0.5 return paddle.stack([gx1, gy1, gx2, gy2], axis=1) def decode_delta_map(self, nA, nGh, nGw, delta_map, anchor_vec): anchor_mesh = self.generate_anchor(nGh, nGw, anchor_vec) anchor_mesh = paddle.unsqueeze(anchor_mesh, 0) pred_list = self.decode_delta( paddle.reshape( delta_map, shape=[-1, 4]), paddle.reshape( anchor_mesh, shape=[-1, 4])) pred_map = paddle.reshape(pred_list, shape=[nA * nGh * nGw, 4]) return pred_map def _postprocessing_by_level(self, nA, stride, head_out, anchor_vec): boxes_shape = head_out.shape # [nB, nA*6, nGh, nGw] nGh, nGw = boxes_shape[-2], boxes_shape[-1] nB = 1 # TODO: only support bs=1 now boxes_list, scores_list = [], [] for idx in range(nB): p = paddle.reshape( head_out[idx], shape=[nA, self.num_classes + 5, nGh, nGw]) p = paddle.transpose(p, perm=[0, 2, 3, 1]) # [nA, nGh, nGw, 6] delta_map = p[:, :, :, :4] boxes = self.decode_delta_map(nA, nGh, nGw, delta_map, anchor_vec) # [nA * nGh * nGw, 4] boxes_list.append(boxes * stride) p_conf = paddle.transpose( p[:, :, :, 4:6], perm=[3, 0, 1, 2]) # [2, nA, nGh, nGw] p_conf = F.softmax( p_conf, axis=0)[1, :, :, :].unsqueeze(-1) # [nA, nGh, nGw, 1] scores = paddle.reshape(p_conf, shape=[nA * nGh * nGw, 1]) scores_list.append(scores) boxes_results = paddle.stack(boxes_list) scores_results = paddle.stack(scores_list) return boxes_results, scores_results def __call__(self, yolo_head_out, anchors): bbox_pred_list = [] for i, head_out in enumerate(yolo_head_out): stride = self.downsample_ratio // 2**i anc_w, anc_h = anchors[i][0::2], anchors[i][1::2] anchor_vec = np.stack((anc_w, anc_h), axis=1) / stride nA = 
len(anc_w)
            boxes, scores = self._postprocessing_by_level(nA, stride,
                                                          head_out, anchor_vec)
            bbox_pred_list.append(paddle.concat([boxes, scores], axis=-1))
        yolo_boxes_scores = paddle.concat(bbox_pred_list, axis=1)
        boxes_idx_over_conf_thr = paddle.nonzero(
            yolo_boxes_scores[:, :, -1] > self.conf_thresh)
        boxes_idx_over_conf_thr.stop_gradient = True
        return boxes_idx_over_conf_thr, yolo_boxes_scores


@register
@serializable
class MaskMatrixNMS(object):
    """
    Matrix NMS for multi-class masks.
    Args:
        update_threshold (float): Updated threshold of category score in the
            second round.
        pre_nms_top_n (int): Number of total instances to be kept per image
            before NMS.
        post_nms_top_n (int): Number of total instances to be kept per image
            after NMS.
        kernel (str): 'linear' or 'gaussian'.
        sigma (float): std in gaussian method.
    Input:
        seg_preds (Variable): shape (n, h, w), segmentation feature maps
        seg_masks (Variable): shape (n, h, w), segmentation feature maps
        cate_labels (Variable): shape (n), mask labels in descending order
        cate_scores (Variable): shape (n), mask scores in descending order
        sum_masks (Variable): a float tensor of the sum of seg_masks
    Returns:
        Variable: cate_scores, tensors of shape (n)
    """

    def __init__(self,
                 update_threshold=0.05,
                 pre_nms_top_n=500,
                 post_nms_top_n=100,
                 kernel='gaussian',
                 sigma=2.0):
        super(MaskMatrixNMS, self).__init__()
        self.update_threshold = update_threshold
        self.pre_nms_top_n = pre_nms_top_n
        self.post_nms_top_n = post_nms_top_n
        self.kernel = kernel
        self.sigma = sigma

    def _sort_score(self, scores, top_num):
        if scores.shape[0] > top_num:
            return paddle.topk(scores, top_num)[1]
        else:
            return paddle.argsort(scores, descending=True)

    def __call__(self,
                 seg_preds,
                 seg_masks,
                 cate_labels,
                 cate_scores,
                 sum_masks=None):
        # sort and keep top nms_pre
        sort_inds = self._sort_score(cate_scores, self.pre_nms_top_n)
        seg_masks = paddle.gather(seg_masks, index=sort_inds)
        seg_preds = paddle.gather(seg_preds, index=sort_inds)
        sum_masks = paddle.gather(sum_masks, index=sort_inds)
        cate_scores = paddle.gather(cate_scores, index=sort_inds)
        cate_labels = paddle.gather(cate_labels, index=sort_inds)

        seg_masks = paddle.flatten(seg_masks, start_axis=1, stop_axis=-1)
        # inter.
        inter_matrix = paddle.mm(seg_masks,
                                 paddle.transpose(seg_masks, [1, 0]))
        n_samples = cate_labels.shape
        n_samples = paddle.to_tensor(n_samples, dtype="int32")
        # union.
        sum_masks_x = paddle.expand(sum_masks, shape=[n_samples, n_samples])
        # iou.
        iou_matrix = (inter_matrix / (
            sum_masks_x + paddle.transpose(sum_masks_x, [1, 0]) - inter_matrix))
        iou_matrix = paddle.triu(iou_matrix, diagonal=1)
        # label_specific matrix.
        cate_labels_x = paddle.expand(cate_labels,
                                      shape=[n_samples, n_samples])
        label_matrix = paddle.cast(
            (cate_labels_x == paddle.transpose(cate_labels_x, [1, 0])),
            'float32')
        label_matrix = paddle.triu(label_matrix, diagonal=1)

        # IoU compensation
        compensate_iou = paddle.max((iou_matrix * label_matrix), axis=0)
        compensate_iou = paddle.expand(
            compensate_iou, shape=[n_samples, n_samples])
        compensate_iou = paddle.transpose(compensate_iou, [1, 0])

        # IoU decay
        decay_iou = iou_matrix * label_matrix

        # matrix nms
        if self.kernel == 'gaussian':
            decay_matrix = paddle.exp(-1 * self.sigma * (decay_iou**2))
            compensate_matrix = paddle.exp(-1 * self.sigma *
                                           (compensate_iou**2))
            decay_coefficient = paddle.min(decay_matrix / compensate_matrix,
                                           axis=0)
        elif self.kernel == 'linear':
            decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
            decay_coefficient = paddle.min(decay_matrix, axis=0)
        else:
            raise NotImplementedError

        # update the score.
cate_scores = cate_scores * decay_coefficient
        y = paddle.zeros(shape=cate_scores.shape, dtype='float32')
        keep = paddle.where(cate_scores >= self.update_threshold, cate_scores,
                            y)
        keep = paddle.nonzero(keep)
        keep = paddle.squeeze(keep, axis=[1])
        # Prevent empty and increase fake data
        keep = paddle.concat(
            [keep, paddle.cast(paddle.shape(cate_scores)[0:1] - 1, 'int64')])

        seg_preds = paddle.gather(seg_preds, index=keep)
        cate_scores = paddle.gather(cate_scores, index=keep)
        cate_labels = paddle.gather(cate_labels, index=keep)

        # sort and keep top_k
        sort_inds = self._sort_score(cate_scores, self.post_nms_top_n)
        seg_preds = paddle.gather(seg_preds, index=sort_inds)
        cate_scores = paddle.gather(cate_scores, index=sort_inds)
        cate_labels = paddle.gather(cate_labels, index=sort_inds)
        return seg_preds, cate_scores, cate_labels


def Conv2d(in_channels,
           out_channels,
           kernel_size,
           stride=1,
           padding=0,
           dilation=1,
           groups=1,
           bias=True,
           weight_init=Normal(std=0.001),
           bias_init=Constant(0.)):
    weight_attr = paddle.framework.ParamAttr(initializer=weight_init)
    if bias:
        bias_attr = paddle.framework.ParamAttr(initializer=bias_init)
    else:
        bias_attr = False
    conv = nn.Conv2D(
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        dilation,
        groups,
        weight_attr=weight_attr,
        bias_attr=bias_attr)
    return conv


def ConvTranspose2d(in_channels,
                    out_channels,
                    kernel_size,
                    stride=1,
                    padding=0,
                    output_padding=0,
                    groups=1,
                    bias=True,
                    dilation=1,
                    weight_init=Normal(std=0.001),
                    bias_init=Constant(0.)):
    weight_attr = paddle.framework.ParamAttr(initializer=weight_init)
    if bias:
        bias_attr = paddle.framework.ParamAttr(initializer=bias_init)
    else:
        bias_attr = False
    conv = nn.Conv2DTranspose(
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        output_padding,
        dilation,
        groups,
        weight_attr=weight_attr,
        bias_attr=bias_attr)
    return conv


def BatchNorm2d(num_features, eps=1e-05, momentum=0.9, affine=True):
    if not affine:
        weight_attr = False
        bias_attr = False
    else:
        weight_attr = None
        bias_attr = None
    batchnorm = nn.BatchNorm2D(
        num_features,
        momentum,
        eps,
        weight_attr=weight_attr,
        bias_attr=bias_attr)
    return batchnorm


def ReLU():
    return nn.ReLU()


def Upsample(scale_factor=None, mode='nearest', align_corners=False):
    return nn.Upsample(None, scale_factor, mode, align_corners)


def MaxPool(kernel_size, stride, padding, ceil_mode=False):
    return nn.MaxPool2D(kernel_size, stride, padding, ceil_mode=ceil_mode)


class Concat(nn.Layer):
    def __init__(self, dim=0):
        super(Concat, self).__init__()
        self.dim = dim

    def forward(self, inputs):
        return paddle.concat(inputs, axis=self.dim)

    def extra_repr(self):
        return 'dim={}'.format(self.dim)


def _convert_attention_mask(attn_mask, dtype):
    """
    Convert the attention mask to the target dtype we expect.
    Parameters:
        attn_mask (Tensor, optional): A tensor used in multi-head attention
            to prevent attention to some unwanted positions, usually the
            paddings or the subsequent positions. It is a tensor with shape
            broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
            When the data type is bool, the unwanted positions have `False`
            values and the others have `True` values. When the data type is
            int, the unwanted positions have 0 values and the others have 1
            values. When the data type is float, the unwanted positions have
            `-INF` values and the others have 0 values. It can be None when
            nothing needs to be prevented from being attended to. Default None.
        dtype (VarType): The target type of `attn_mask` we expect.
    Returns:
        Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`.
    """
    return nn.layer.transformer._convert_attention_mask(attn_mask, dtype)


@register
class MultiHeadAttention(nn.Layer):
    """
    Attention maps queries and a set of key-value pairs to outputs, and
    Multi-Head Attention performs multiple attention computations in parallel
    to jointly attend to information from different representation subspaces.

    Please refer to `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_
    for more details.

    Parameters:
        embed_dim (int): The expected feature size in the input and output.
        num_heads (int): The number of heads in multi-head attention.
        dropout (float, optional): The dropout probability used on attention
            weights to drop some attention targets. 0 for no dropout. Default 0
        kdim (int, optional): The feature size in key. If None, assumed equal to
            `embed_dim`. Default None.
        vdim (int, optional): The feature size in value. If None, assumed equal to
            `embed_dim`. Default None.
        need_weights (bool, optional): Indicate whether to return the attention
            weights. Default False.

    Examples:
        .. code-block:: python
            import paddle
            # encoder input: [batch_size, sequence_length, d_model]
            query = paddle.rand((2, 4, 128))
            # self attention mask: [batch_size, num_heads, query_len, query_len]
            attn_mask = paddle.rand((2, 2, 4, 4))
            multi_head_attn = paddle.nn.MultiHeadAttention(128, 2)
            output = multi_head_attn(query, None, None, attn_mask=attn_mask)  # [2, 4, 128]
    """

    def __init__(self,
                 embed_dim,
                 num_heads,
                 dropout=0.,
                 kdim=None,
                 vdim=None,
                 need_weights=False):
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.need_weights = need_weights

        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        if self._qkv_same_embed_dim:
            self.in_proj_weight = self.create_parameter(
                shape=[embed_dim, 3 * embed_dim],
                attr=None,
                dtype=self._dtype,
                is_bias=False)
            self.in_proj_bias = self.create_parameter(
                shape=[3 * embed_dim],
                attr=None,
                dtype=self._dtype,
                is_bias=True)
        else:
            self.q_proj = nn.Linear(embed_dim, embed_dim)
            self.k_proj = nn.Linear(self.kdim, embed_dim)
            self.v_proj = nn.Linear(self.vdim, embed_dim)

        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self._type_list = ('q_proj', 'k_proj', 'v_proj')

        self._reset_parameters()

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
            else:
                constant_(p)

    def compute_qkv(self, tensor, index):
        if self._qkv_same_embed_dim:
            tensor = F.linear(
                x=tensor,
                weight=self.in_proj_weight[:, index * self.embed_dim:(
                    index + 1) * self.embed_dim],
                bias=self.in_proj_bias[index * self.embed_dim:(index + 1) *
                                       self.embed_dim]
                if self.in_proj_bias is not None else None)
        else:
            tensor = getattr(self, self._type_list[index])(tensor)
        tensor = tensor.reshape(
            [0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3])
        return tensor

    def forward(self, query, key=None, value=None, attn_mask=None):
        r"""
        Applies multi-head attention to map queries and a set of key-value pairs
        to outputs.

        Parameters:
            query (Tensor): The queries for multi-head attention. It is a tensor
                with shape `[batch_size, query_length, embed_dim]`. The data type
                should be float32 or float64.
            key (Tensor, optional): The keys for multi-head attention. It is
                a tensor with shape `[batch_size, key_length, kdim]`. The data type
                should be float32 or float64. If None, use `query` as `key`.
                Default None.
            value (Tensor, optional): The values for multi-head attention. It
                is a tensor with shape `[batch_size, value_length, vdim]`.
                The data type should be float32 or float64. If None, use `query` as
                `value`. Default None.
            attn_mask (Tensor, optional): A tensor used in multi-head attention
                to prevent attention to some unwanted positions, usually the
                paddings or the subsequent positions. It is a tensor with shape
                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
                When the data type is bool, the unwanted positions have `False`
                values and the others have `True` values. When the data type is
                int, the unwanted positions have 0 values and the others have 1
                values. When the data type is float, the unwanted positions have
                `-INF` values and the others have 0 values. It can be None when
                nothing needs to be prevented from being attended to. Default None.

        Returns:
            Tensor|tuple: It is a tensor that has the same shape and data type \
                as `query`, representing attention output. Or a tuple if \
                `need_weights` is True or `cache` is not None. If `need_weights` \
                is True, except for attention output, the tuple also includes \
                the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \
                If `cache` is not None, the tuple then includes the new cache \
                having the same type as `cache`, and if it is `StaticCache`, it \
                is same as the input `cache`, if it is `Cache`, the new cache \
                reserves tensors concatenating raw tensors with intermediate \
                results of current query.
        """
        key = query if key is None else key
        value = query if value is None else value
        # compute q, k, v
        q, k, v = (self.compute_qkv(t, i)
                   for i, t in enumerate([query, key, value]))

        # scale dot product attention
        product = paddle.matmul(x=q, y=k, transpose_y=True)
        scaling = float(self.head_dim)**-0.5
        product = product * scaling

        if attn_mask is not None:
            # Support bool or int mask
            attn_mask = _convert_attention_mask(attn_mask, product.dtype)
            product = product + attn_mask
        weights = F.softmax(product)
        if self.dropout:
            weights = F.dropout(
                weights,
                self.dropout,
                training=self.training,
                mode="upscale_in_train")
        out = paddle.matmul(weights, v)

        # combine heads
        out = paddle.transpose(out, perm=[0, 2, 1, 3])
        out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

        # project to output
        out = self.out_proj(out)

        outs = [out]
        if self.need_weights:
            outs.append(weights)
        return out if len(outs) == 1 else tuple(outs)


@register
class ConvMixer(nn.Layer):
    def __init__(
            self,
            dim,
            depth,
            kernel_size=3, ):
        super().__init__()
        self.dim = dim
        self.depth = depth
        self.kernel_size = kernel_size

        self.mixer = self.conv_mixer(dim, depth, kernel_size)

    def forward(self, x):
        return self.mixer(x)

    @staticmethod
    def conv_mixer(
            dim,
            depth,
            kernel_size, ):
        Seq, ActBn = nn.Sequential, lambda x: Seq(x, nn.GELU(), nn.BatchNorm2D(dim))
        Residual = type('Residual', (Seq, ),
                        {'forward': lambda self, x: self[0](x) + x})
        return Seq(*[
            Seq(Residual(
                ActBn(
                    nn.Conv2D(
                        dim, dim, kernel_size, groups=dim, padding="same"))),
                ActBn(nn.Conv2D(dim, dim, 1))) for i in range(depth)
        ])


================================================
FILE: ppdet/modeling/losses/__init__.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import yolo_loss from . import iou_aware_loss from . import iou_loss from . import ssd_loss from . import fcos_loss from . import solov2_loss from . import ctfocal_loss from . import keypoint_loss from . import jde_loss from . import fairmot_loss from . import gfocal_loss from . import detr_loss from . import sparsercnn_loss from . import focal_loss from . import smooth_l1_loss from . import probiou_loss from . import cot_loss from . import supcontrast from . import queryinst_loss from . import clrnet_loss from . import clrnet_line_iou_loss from .yolo_loss import * from .iou_aware_loss import * from .iou_loss import * from .ssd_loss import * from .fcos_loss import * from .solov2_loss import * from .ctfocal_loss import * from .keypoint_loss import * from .jde_loss import * from .fairmot_loss import * from .gfocal_loss import * from .detr_loss import * from .sparsercnn_loss import * from .focal_loss import * from .smooth_l1_loss import * from .pose3d_loss import * from .probiou_loss import * from .cot_loss import * from .supcontrast import * from .queryinst_loss import * from .clrnet_loss import * from .clrnet_line_iou_loss import * ================================================ FILE: ppdet/modeling/losses/clrnet_line_iou_loss.py ================================================ import paddle def line_iou(pred, target, img_w, length=15, aligned=True): ''' Calculate the line iou value between predictions and targets Args: pred: lane predictions, shape: (num_pred, 72) target: ground truth, shape: (num_target, 72) img_w: image width length: extended radius aligned: True for iou loss calculation, False for pair-wise ious in assign ''' px1 = pred - length px2 = pred + length tx1 = target - length tx2 = target + length if aligned: invalid_mask = target ovr = paddle.minimum(px2, tx2) - paddle.maximum(px1, tx1) union = paddle.maximum(px2, tx2) - paddle.minimum(px1, tx1) else: num_pred = pred.shape[0] invalid_mask = target.tile([num_pred, 1, 1]) ovr = (paddle.minimum(px2[:, None, :], tx2[None, ...]) - paddle.maximum( px1[:, None, :], tx1[None, ...])) union = (paddle.maximum(px2[:, None, :], tx2[None, ...]) - paddle.minimum(px1[:, None, :], tx1[None, ...])) invalid_masks = (invalid_mask < 0) | (invalid_mask >= img_w) ovr[invalid_masks] = 0. union[invalid_masks] = 0. 
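    # Each lane is a row-wise list of x-coordinates; every point is widened to
    # a horizontal segment [x - length, x + length], and the line IoU is the
    # ratio of summed per-row overlaps to summed per-row unions. E.g. with
    # length=15, two rows whose x's differ by 10 give ovr = 20 and union = 40,
    # i.e. a per-row IoU of 0.5.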
iou = ovr.sum(axis=-1) / (union.sum(axis=-1) + 1e-9)
    return iou


def liou_loss(pred, target, img_w, length=15):
    return (1 - line_iou(pred, target, img_w, length)).mean()


================================================
FILE: ppdet/modeling/losses/clrnet_loss.py
================================================
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

from ppdet.core.workspace import register
from ppdet.modeling.clrnet_utils import accuracy
from ppdet.modeling.assigners.clrnet_assigner import assign
from ppdet.modeling.losses.clrnet_line_iou_loss import liou_loss

__all__ = ['CLRNetLoss']


class SoftmaxFocalLoss(nn.Layer):
    def __init__(self, gamma, ignore_lb=255, *args, **kwargs):
        super(SoftmaxFocalLoss, self).__init__()
        self.gamma = gamma
        self.nll = nn.NLLLoss(ignore_index=ignore_lb)

    def forward(self, logits, labels):
        # paddle.nn.functional takes `axis`, not torch's `dim`
        scores = F.softmax(logits, axis=1)
        factor = paddle.pow(1. - scores, self.gamma)
        log_score = F.log_softmax(logits, axis=1)
        log_score = factor * log_score
        loss = self.nll(log_score, labels)
        return loss


def focal_loss(input: paddle.Tensor,
               target: paddle.Tensor,
               alpha: float,
               gamma: float=2.0,
               reduction: str='none',
               eps: float=1e-8) -> paddle.Tensor:
    r"""Function that computes Focal loss.
    See :class:`FocalLoss` below for details (adapted from kornia).
    """
    if not paddle.is_tensor(input):
        raise TypeError("Input type is not a paddle.Tensor. Got {}".format(
            type(input)))
    if not len(input.shape) >= 2:
        raise ValueError("Invalid input shape, we expect BxCx*. Got: {}".format(
            input.shape))
    if input.shape[0] != target.shape[0]:
        raise ValueError(
            'Expected input batch_size ({}) to match target batch_size ({}).'.
            format(input.shape[0], target.shape[0]))
    n = input.shape[0]
    out_size = (n, ) + tuple(input.shape[2:])
    if target.shape[1:] != input.shape[2:]:
        raise ValueError('Expected target size {}, got {}'.format(out_size,
                                                                  target.shape))
    if (isinstance(input.place, paddle.CUDAPlace) and
            isinstance(target.place, paddle.CPUPlace)) | (
                isinstance(input.place, paddle.CPUPlace) and
                isinstance(target.place, paddle.CUDAPlace)):
        raise ValueError(
            "input and target must be on the same device. Got: {} and {}".
            format(input.place, target.place))
    # compute softmax over the classes axis
    input_soft: paddle.Tensor = F.softmax(input, axis=1) + eps

    # create the labels one hot tensor
    target_one_hot: paddle.Tensor = paddle.to_tensor(
        F.one_hot(
            target, num_classes=input.shape[1]).cast(input.dtype),
        place=input.place)

    # compute the actual focal loss
    weight = paddle.pow(-input_soft + 1., gamma)
    focal = -alpha * weight * paddle.log(input_soft)
    loss_tmp = paddle.sum(target_one_hot * focal, axis=1)

    if reduction == 'none':
        loss = loss_tmp
    elif reduction == 'mean':
        loss = paddle.mean(loss_tmp)
    elif reduction == 'sum':
        loss = paddle.sum(loss_tmp)
    else:
        raise NotImplementedError("Invalid reduction mode: {}".format(
            reduction))
    return loss


class FocalLoss(nn.Layer):
    r"""Criterion that computes Focal loss.
    According to [1], the Focal loss is computed as follows:
    .. math::
        \text{FL}(p_t) = -\alpha_t (1 - p_t)^{\gamma} \, \text{log}(p_t)
    where:
       - :math:`p_t` is the model's estimated probability for each class.
    Arguments:
        alpha (float): Weighting factor :math:`\alpha \in [0, 1]`.
        gamma (float): Focusing parameter :math:`\gamma >= 0`.
        reduction (str, optional): Specifies the reduction to apply to the
            output: ‘none’ | ‘mean’ | ‘sum’. ‘none’: no reduction will be applied,
            ‘mean’: the sum of the output will be divided by the number of elements
            in the output, ‘sum’: the output will be summed. Default: ‘none’.
    Shape:
        - Input: :math:`(N, C, *)` where C = number of classes.
        - Target: :math:`(N, *)` where each value is
          :math:`0 ≤ targets[i] ≤ C−1`.
    Examples:
        >>> N = 5  # num_classes
        >>> kwargs = {"alpha": 0.5, "gamma": 2.0, "reduction": 'mean'}
        >>> loss = FocalLoss(**kwargs)
        >>> input = paddle.randn([1, N, 3, 5])
        >>> input.stop_gradient = False
        >>> target = paddle.randint(0, N, [1, 3, 5], dtype='int64')
        >>> output = loss(input, target)
        >>> output.backward()
    References:
        [1] https://arxiv.org/abs/1708.02002
    """

    def __init__(self,
                 alpha: float,
                 gamma: float=2.0,
                 reduction: str='none') -> None:
        super(FocalLoss, self).__init__()
        self.alpha: float = alpha
        self.gamma: float = gamma
        self.reduction: str = reduction
        self.eps: float = 1e-6

    def forward(  # type: ignore
            self, input: paddle.Tensor,
            target: paddle.Tensor) -> paddle.Tensor:
        return focal_loss(input, target, self.alpha, self.gamma,
                          self.reduction, self.eps)


@register
class CLRNetLoss(nn.Layer):
    __shared__ = ['img_w', 'img_h', 'num_classes', 'num_points']

    def __init__(self,
                 cls_loss_weight=2.0,
                 xyt_loss_weight=0.2,
                 iou_loss_weight=2.0,
                 seg_loss_weight=1.0,
                 refine_layers=3,
                 num_points=72,
                 img_w=800,
                 img_h=320,
                 num_classes=5,
                 ignore_label=255,
                 bg_weight=0.4):
        super(CLRNetLoss, self).__init__()
        self.cls_loss_weight = cls_loss_weight
        self.xyt_loss_weight = xyt_loss_weight
        self.iou_loss_weight = iou_loss_weight
        self.seg_loss_weight = seg_loss_weight
        self.refine_layers = refine_layers
        self.img_w = img_w
        self.img_h = img_h
        self.n_strips = num_points - 1
        self.num_classes = num_classes
        self.ignore_label = ignore_label
        weights = paddle.ones(shape=[self.num_classes])
        weights[0] = bg_weight
        self.criterion = nn.NLLLoss(
            ignore_index=self.ignore_label, weight=weights)

    def forward(self, output, batch):
        predictions_lists = output['predictions_lists']
        targets = batch['lane_line'].clone()
        cls_criterion = FocalLoss(alpha=0.25, gamma=2.0)
        cls_loss = paddle.to_tensor(0.0)
        reg_xytl_loss = paddle.to_tensor(0.0)
        iou_loss = paddle.to_tensor(0.0)
        cls_acc = []
        cls_acc_stage = []

        for stage in range(self.refine_layers):
            predictions_list = predictions_lists[stage]
            for predictions, target in zip(predictions_list, targets):
                target = target[target[:, 1] == 1]

                if len(target) == 0:
                    # If there are no targets, all predictions have to be
                    # negatives (i.e., 0 confidence)
                    cls_target = paddle.zeros(
                        [predictions.shape[0]], dtype='int64')
                    cls_pred = predictions[:, :2]
                    cls_loss = cls_loss + cls_criterion(cls_pred,
                                                        cls_target).sum()
                    continue

                with paddle.no_grad():
                    matched_row_inds, matched_col_inds = assign(
                        predictions, target, self.img_w, self.img_h)

                # classification targets
                cls_target = paddle.zeros([predictions.shape[0]], dtype='int64')
                cls_target[matched_row_inds] = 1
                cls_pred = predictions[:, :2]

                # regression targets -> [start_y, start_x, theta, length]
                # (all transformed to absolute values), only on matched pairs
                reg_yxtl = predictions.index_select(matched_row_inds)[..., 2:6]
                reg_yxtl[:, 0] *= self.n_strips
                reg_yxtl[:, 1] *= (self.img_w - 1)
                reg_yxtl[:, 2] *= 180
                reg_yxtl[:, 3] *= self.n_strips

                target_yxtl = target.index_select(matched_col_inds)[..., 2:
                                                                    6].clone()

                # regression targets -> S coordinates (all transformed to
                # absolute values)
                reg_pred = predictions.index_select(matched_row_inds)[..., 6:]
                reg_pred *= (self.img_w - 1)
                reg_targets = target.index_select(matched_col_inds)[...,
                                                                    6:].clone()

                with paddle.no_grad():
                    predictions_starts = paddle.clip(
                        (predictions.index_select(matched_row_inds)[..., 2] *
                         self.n_strips).round().cast("int64"),
                        min=0,
                        max=self.
n_strips) # ensure the predictions starts is valid target_starts = ( target.index_select(matched_col_inds)[..., 2] * self.n_strips).round().cast("int64") target_yxtl[:, -1] -= ( predictions_starts - target_starts) # reg length # Loss calculation cls_loss = cls_loss + cls_criterion( cls_pred, cls_target).sum() / target.shape[0] target_yxtl[:, 0] *= self.n_strips target_yxtl[:, 2] *= 180 reg_xytl_loss = reg_xytl_loss + F.smooth_l1_loss( input=reg_yxtl, label=target_yxtl, reduction='none').mean() iou_loss = iou_loss + liou_loss( reg_pred, reg_targets, self.img_w, length=15) cls_accuracy = accuracy(cls_pred, cls_target) cls_acc_stage.append(cls_accuracy) cls_acc.append(sum(cls_acc_stage) / (len(cls_acc_stage) + 1e-5)) # extra segmentation loss seg_loss = self.criterion( F.log_softmax( output['seg'], axis=1), batch['seg'].cast('int64')) cls_loss /= (len(targets) * self.refine_layers) reg_xytl_loss /= (len(targets) * self.refine_layers) iou_loss /= (len(targets) * self.refine_layers) loss = cls_loss * self.cls_loss_weight \ + reg_xytl_loss * self.xyt_loss_weight \ + seg_loss * self.seg_loss_weight \ + iou_loss * self.iou_loss_weight return_value = { 'loss': loss, 'cls_loss': cls_loss * self.cls_loss_weight, 'reg_xytl_loss': reg_xytl_loss * self.xyt_loss_weight, 'seg_loss': seg_loss * self.seg_loss_weight, 'iou_loss': iou_loss * self.iou_loss_weight } for i in range(self.refine_layers): if not isinstance(cls_acc[i], paddle.Tensor): cls_acc[i] = paddle.to_tensor(cls_acc[i]) return_value['stage_{}_acc'.format(i)] = cls_acc[i] return return_value ================================================ FILE: ppdet/modeling/losses/cot_loss.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
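# COTLoss: a label-co-occurrence ("co-tuning") classification loss; the
# cot_relation table (presumably built by the co-tuning trainer, see
# ppdet/engine/trainer_cot.py) maps each ground-truth label to a soft
# distribution over classes. A minimal sketch of the core computation in
# forward() below, assuming `cot_relation` of shape [num_classes, num_classes]
# and RoI class `scores` whose last column is background:
#
#   soft_target = cot_relation[labels]                     # [n, num_classes]
#   log_prob = F.log_softmax(scores[:, :-1] * cot_scale)   # drop bg column
#   loss = cot_lambda * (-soft_target * log_prob).sum(-1).mean()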
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F import numpy as np from ppdet.core.workspace import register __all__ = ['COTLoss'] @register class COTLoss(nn.Layer): __shared__ = ['num_classes'] def __init__(self, num_classes=80, cot_scale=1, cot_lambda=1): super(COTLoss, self).__init__() self.cot_scale = cot_scale self.cot_lambda = cot_lambda self.num_classes = num_classes def forward(self, scores, targets, cot_relation): cls_name = 'loss_bbox_cls_cot' loss_bbox = {} tgt_labels, tgt_bboxes, tgt_gt_inds = targets tgt_labels = paddle.concat(tgt_labels) if len( tgt_labels) > 1 else tgt_labels[0] mask = (tgt_labels < self.num_classes) valid_inds = paddle.nonzero(tgt_labels >= 0).flatten() if valid_inds.shape[0] == 0: loss_bbox[cls_name] = paddle.zeros([1], dtype='float32') else: tgt_labels = tgt_labels.cast('int64') valid_cot_targets = [] for i in range(tgt_labels.shape[0]): train_label = tgt_labels[i] if train_label < self.num_classes: valid_cot_targets.append(cot_relation[train_label]) coco_targets = paddle.to_tensor(valid_cot_targets) coco_targets.stop_gradient = True coco_loss = - coco_targets * F.log_softmax(scores[mask][:, :-1] * self.cot_scale) loss_bbox[cls_name] = self.cot_lambda * paddle.mean(paddle.sum(coco_loss, axis=-1)) return loss_bbox ================================================ FILE: ppdet/modeling/losses/ctfocal_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from ppdet.core.workspace import register, serializable __all__ = ['CTFocalLoss'] @register @serializable class CTFocalLoss(object): """ CTFocalLoss: CornerNet & CenterNet Focal Loss Args: loss_weight (float): loss weight gamma (float): gamma parameter for Focal Loss """ def __init__(self, loss_weight=1., gamma=2.0): self.loss_weight = loss_weight self.gamma = gamma def __call__(self, pred, target): """ Calculate the loss Args: pred (Tensor): heatmap prediction target (Tensor): target for positive samples Return: ct_focal_loss (Tensor): Focal Loss used in CornerNet & CenterNet. Note that the values in target are in [0, 1] since gaussian is used to reduce the punishment and we treat [0, 1) as neg example. 
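            With p the heatmap prediction and t the gaussian-softened target,
            the loss computed below is
                L = -(sum over t == 1 of (1 - p)^gamma * log(p)
                      + sum over t < 1 of (1 - t)^4 * p^gamma * log(1 - p)) / N_pos,
            where N_pos is the number of t == 1 points (clamped to at least 1).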
""" fg_map = paddle.cast(target == 1, 'float32') fg_map.stop_gradient = True bg_map = paddle.cast(target < 1, 'float32') bg_map.stop_gradient = True neg_weights = paddle.pow(1 - target, 4) pos_loss = 0 - paddle.log(pred) * paddle.pow(1 - pred, self.gamma) * fg_map neg_loss = 0 - paddle.log(1 - pred) * paddle.pow( pred, self.gamma) * neg_weights * bg_map pos_loss = paddle.sum(pos_loss) neg_loss = paddle.sum(neg_loss) fg_num = paddle.sum(fg_map) ct_focal_loss = (pos_loss + neg_loss) / ( fg_num + paddle.cast(fg_num == 0, 'float32')) return ct_focal_loss * self.loss_weight ================================================ FILE: ppdet/modeling/losses/detr_loss.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from .iou_loss import GIoULoss from ..transformers import bbox_cxcywh_to_xyxy, sigmoid_focal_loss, varifocal_loss_with_logits from ..bbox_utils import bbox_iou __all__ = ['DETRLoss', 'DINOLoss', 'DINOv3Loss'] @register class DETRLoss(nn.Layer): __shared__ = ['num_classes', 'use_focal_loss'] __inject__ = ['matcher'] def __init__(self, num_classes=80, matcher='HungarianMatcher', loss_coeff={ 'class': 1, 'bbox': 5, 'giou': 2, 'no_object': 0.1, 'mask': 1, 'dice': 1 }, aux_loss=True, use_focal_loss=False, use_vfl=False, vfl_iou_type='bbox', use_uni_match=False, uni_match_ind=0): r""" Args: num_classes (int): The number of classes. matcher (HungarianMatcher): It computes an assignment between the targets and the predictions of the network. loss_coeff (dict): The coefficient of loss. aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used. use_focal_loss (bool): Use focal loss or not. 
""" super(DETRLoss, self).__init__() self.num_classes = num_classes self.matcher = matcher self.loss_coeff = loss_coeff self.aux_loss = aux_loss self.use_focal_loss = use_focal_loss self.use_vfl = use_vfl self.vfl_iou_type = vfl_iou_type self.use_uni_match = use_uni_match self.uni_match_ind = uni_match_ind if not self.use_focal_loss: self.loss_coeff['class'] = paddle.full([num_classes + 1], loss_coeff['class']) self.loss_coeff['class'][-1] = loss_coeff['no_object'] self.giou_loss = GIoULoss() def _get_loss_class(self, logits, gt_class, match_indices, bg_index, num_gts, postfix="", iou_score=None, gt_score=None): # logits: [b, query, num_classes], gt_class: list[[n, 1]] name_class = "loss_class" + postfix target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64') bs, num_query_objects = target_label.shape num_gt = sum(len(a) for a in gt_class) if num_gt > 0: index, updates = self._get_index_updates(num_query_objects, gt_class, match_indices) target_label = paddle.scatter( target_label.reshape([-1, 1]), index, updates.astype('int64')) target_label = target_label.reshape([bs, num_query_objects]) if self.use_focal_loss: target_label = F.one_hot(target_label, self.num_classes + 1)[..., :-1] if iou_score is not None and self.use_vfl: if gt_score is not None: target_score = paddle.zeros([bs, num_query_objects]) target_score = paddle.scatter( target_score.reshape([-1, 1]), index, gt_score) target_score = target_score.reshape( [bs, num_query_objects, 1]) * target_label target_score_iou = paddle.zeros([bs, num_query_objects]) target_score_iou = paddle.scatter( target_score_iou.reshape([-1, 1]), index, iou_score) target_score_iou = target_score_iou.reshape( [bs, num_query_objects, 1]) * target_label target_score = paddle.multiply(target_score, target_score_iou) loss_ = self.loss_coeff[ 'class'] * varifocal_loss_with_logits( logits, target_score, target_label, num_gts / num_query_objects) else: target_score = paddle.zeros([bs, num_query_objects]) if num_gt > 0: target_score = paddle.scatter( target_score.reshape([-1, 1]), index, iou_score) target_score = target_score.reshape( [bs, num_query_objects, 1]) * target_label loss_ = self.loss_coeff[ 'class'] * varifocal_loss_with_logits( logits, target_score, target_label, num_gts / num_query_objects) else: loss_ = self.loss_coeff['class'] * sigmoid_focal_loss( logits, target_label, num_gts / num_query_objects) else: loss_ = F.cross_entropy( logits, target_label, weight=self.loss_coeff['class']) return {name_class: loss_} def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts, postfix=""): # boxes: [b, query, 4], gt_bbox: list[[n, 4]] name_bbox = "loss_bbox" + postfix name_giou = "loss_giou" + postfix loss = dict() if sum(len(a) for a in gt_bbox) == 0: loss[name_bbox] = paddle.to_tensor([0.]) loss[name_giou] = paddle.to_tensor([0.]) return loss src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox, match_indices) loss[name_bbox] = self.loss_coeff['bbox'] * F.l1_loss( src_bbox, target_bbox, reduction='sum') / num_gts loss[name_giou] = self.giou_loss( bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox)) loss[name_giou] = loss[name_giou].sum() / num_gts loss[name_giou] = self.loss_coeff['giou'] * loss[name_giou] return loss def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts, postfix=""): # masks: [b, query, h, w], gt_mask: list[[n, H, W]] name_mask = "loss_mask" + postfix name_dice = "loss_dice" + postfix loss = dict() if sum(len(a) for a in gt_mask) == 0: loss[name_mask] = paddle.to_tensor([0.]) 
loss[name_dice] = paddle.to_tensor([0.]) return loss src_masks, target_masks = self._get_src_target_assign(masks, gt_mask, match_indices) src_masks = F.interpolate( src_masks.unsqueeze(0), size=target_masks.shape[-2:], mode="bilinear")[0] loss[name_mask] = self.loss_coeff['mask'] * F.sigmoid_focal_loss( src_masks, target_masks, paddle.to_tensor( [num_gts], dtype='float32')) loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss( src_masks, target_masks, num_gts) return loss def _dice_loss(self, inputs, targets, num_gts): inputs = F.sigmoid(inputs) inputs = inputs.flatten(1) targets = targets.flatten(1) numerator = 2 * (inputs * targets).sum(1) denominator = inputs.sum(-1) + targets.sum(-1) loss = 1 - (numerator + 1) / (denominator + 1) return loss.sum() / num_gts def _get_loss_aux(self, boxes, logits, gt_bbox, gt_class, bg_index, num_gts, dn_match_indices=None, postfix="", masks=None, gt_mask=None, gt_score=None): loss_class = [] loss_bbox, loss_giou = [], [] loss_mask, loss_dice = [], [] if dn_match_indices is not None: match_indices = dn_match_indices elif self.use_uni_match: match_indices = self.matcher( boxes[self.uni_match_ind], logits[self.uni_match_ind], gt_bbox, gt_class, masks=masks[self.uni_match_ind] if masks is not None else None, gt_mask=gt_mask) for i, (aux_boxes, aux_logits) in enumerate(zip(boxes, logits)): aux_masks = masks[i] if masks is not None else None if not self.use_uni_match and dn_match_indices is None: match_indices = self.matcher( aux_boxes, aux_logits, gt_bbox, gt_class, masks=aux_masks, gt_mask=gt_mask) if self.use_vfl: if sum(len(a) for a in gt_bbox) > 0: src_bbox, target_bbox = self._get_src_target_assign( aux_boxes.detach(), gt_bbox, match_indices) iou_score = bbox_iou( bbox_cxcywh_to_xyxy(src_bbox).split(4, -1), bbox_cxcywh_to_xyxy(target_bbox).split(4, -1)) else: iou_score = None if gt_score is not None: _, target_score = self._get_src_target_assign( logits[-1].detach(), gt_score, match_indices) else: iou_score = None loss_class.append( self._get_loss_class( aux_logits, gt_class, match_indices, bg_index, num_gts, postfix, iou_score, gt_score=target_score if gt_score is not None else None)['loss_class' + postfix]) loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices, num_gts, postfix) loss_bbox.append(loss_['loss_bbox' + postfix]) loss_giou.append(loss_['loss_giou' + postfix]) if masks is not None and gt_mask is not None: loss_ = self._get_loss_mask(aux_masks, gt_mask, match_indices, num_gts, postfix) loss_mask.append(loss_['loss_mask' + postfix]) loss_dice.append(loss_['loss_dice' + postfix]) loss = { "loss_class_aux" + postfix: paddle.add_n(loss_class), "loss_bbox_aux" + postfix: paddle.add_n(loss_bbox), "loss_giou_aux" + postfix: paddle.add_n(loss_giou) } if masks is not None and gt_mask is not None: loss["loss_mask_aux" + postfix] = paddle.add_n(loss_mask) loss["loss_dice_aux" + postfix] = paddle.add_n(loss_dice) return loss def _get_index_updates(self, num_query_objects, target, match_indices): batch_idx = paddle.concat([ paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices) ]) src_idx = paddle.concat([src for (src, _) in match_indices]) src_idx += (batch_idx * num_query_objects) target_assign = paddle.concat([ paddle.gather( t, dst, axis=0) for t, (_, dst) in zip(target, match_indices) ]) return src_idx, target_assign def _get_src_target_assign(self, src, target, match_indices): src_assign = paddle.concat([ paddle.gather( t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]]) for t, (I, _) in zip(src, 
match_indices)
        ])
        target_assign = paddle.concat([
            paddle.gather(
                t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]])
            for t, (_, J) in zip(target, match_indices)
        ])
        return src_assign, target_assign

    def _get_num_gts(self, targets, dtype="float32"):
        num_gts = sum(len(a) for a in targets)
        num_gts = paddle.to_tensor([num_gts], dtype=dtype)
        if paddle.distributed.get_world_size() > 1:
            paddle.distributed.all_reduce(num_gts)
            num_gts /= paddle.distributed.get_world_size()
        num_gts = paddle.clip(num_gts, min=1.)
        return num_gts

    def _get_prediction_loss(self,
                             boxes,
                             logits,
                             gt_bbox,
                             gt_class,
                             masks=None,
                             gt_mask=None,
                             postfix="",
                             dn_match_indices=None,
                             num_gts=1,
                             gt_score=None):
        if dn_match_indices is None:
            match_indices = self.matcher(
                boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask)
        else:
            match_indices = dn_match_indices

        if self.use_vfl:
            if gt_score is not None:  #ssod
                _, target_score = self._get_src_target_assign(
                    logits[-1].detach(), gt_score, match_indices)
            elif sum(len(a) for a in gt_bbox) > 0:
                if self.vfl_iou_type == 'bbox':
                    src_bbox, target_bbox = self._get_src_target_assign(
                        boxes.detach(), gt_bbox, match_indices)
                    iou_score = bbox_iou(
                        bbox_cxcywh_to_xyxy(src_bbox).split(4, -1),
                        bbox_cxcywh_to_xyxy(target_bbox).split(4, -1))
                elif self.vfl_iou_type == 'mask':
                    # NOTE: the condition must not be wrapped together with the
                    # message in parentheses, or the assert is always true
                    assert masks is not None and gt_mask is not None, \
                        'Make sure the input has `mask` and `gt_mask`'
                    assert sum(len(a) for a in gt_mask) > 0
                    src_mask, target_mask = self._get_src_target_assign(
                        masks.detach(), gt_mask, match_indices)
                    src_mask = F.interpolate(
                        src_mask.unsqueeze(0),
                        scale_factor=2,
                        mode='bilinear',
                        align_corners=False).squeeze(0)
                    target_mask = F.interpolate(
                        target_mask.unsqueeze(0),
                        size=src_mask.shape[-2:],
                        mode='bilinear',
                        align_corners=False).squeeze(0)
                    src_mask = src_mask.flatten(1)
                    src_mask = F.sigmoid(src_mask)
                    src_mask = paddle.where(
                        src_mask > 0.5, 1., 0.).astype(masks.dtype)
                    target_mask = target_mask.flatten(1)
                    target_mask = paddle.where(
                        target_mask > 0.5, 1., 0.).astype(masks.dtype)
                    inter = (src_mask * target_mask).sum(1)
                    union = src_mask.sum(1) + target_mask.sum(1) - inter
                    iou_score = (inter + 1e-2) / (union + 1e-2)
                    iou_score = iou_score.unsqueeze(-1)
                else:
                    iou_score = None
            else:
                iou_score = None
        else:
            iou_score = None

        loss = dict()
        loss.update(
            self._get_loss_class(
                logits,
                gt_class,
                match_indices,
                self.num_classes,
                num_gts,
                postfix,
                iou_score,
                gt_score=target_score if gt_score is not None else None))
        loss.update(
            self._get_loss_bbox(boxes, gt_bbox, match_indices, num_gts,
                                postfix))
        if masks is not None and gt_mask is not None:
            loss.update(
                self._get_loss_mask(masks, gt_mask, match_indices, num_gts,
                                    postfix))
        return loss

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None,
                postfix="",
                gt_score=None,
                o2m=1,
                **kwargs):
        r"""
        Args:
            boxes (Tensor): [l, b, query, 4]
            logits (Tensor): [l, b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]
            masks (Tensor, optional): [l, b, query, h, w]
            gt_mask (List(Tensor), optional): list[[n, H, W]]
            postfix (str): postfix of loss name
        """
        dn_match_indices = kwargs.get("dn_match_indices", None)
        num_gts = kwargs.get("num_gts", None)
        if num_gts is None:
            num_gts = self._get_num_gts(gt_class)
        total_loss = self._get_prediction_loss(
            boxes[-1],
            logits[-1],
            gt_bbox,
            gt_class,
            masks=masks[-1] if masks is not None else None,
            gt_mask=gt_mask,
            postfix=postfix,
            dn_match_indices=dn_match_indices,
            num_gts=num_gts,
            gt_score=gt_score if gt_score is not None else None)

        if self.aux_loss:
            total_loss.update(
self._get_loss_aux( boxes[:-1], logits[:-1], gt_bbox, gt_class, self.num_classes, num_gts, dn_match_indices, postfix, masks=masks[:-1] if masks is not None else None, gt_mask=gt_mask, gt_score=gt_score if gt_score is not None else None)) return total_loss @register class DINOLoss(DETRLoss): def forward(self, boxes, logits, gt_bbox, gt_class, masks=None, gt_mask=None, postfix="", dn_out_bboxes=None, dn_out_logits=None, dn_meta=None, gt_score=None, **kwargs): num_gts = self._get_num_gts(gt_class) total_loss = super(DINOLoss, self).forward( boxes, logits, gt_bbox, gt_class, num_gts=num_gts, gt_score=gt_score) if dn_meta is not None: dn_positive_idx, dn_num_group = \ dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] assert len(gt_class) == len(dn_positive_idx) # denoising match indices dn_match_indices = self.get_dn_match_indices( gt_class, dn_positive_idx, dn_num_group) # compute denoising training loss num_gts *= dn_num_group dn_loss = super(DINOLoss, self).forward( dn_out_bboxes, dn_out_logits, gt_bbox, gt_class, postfix="_dn", dn_match_indices=dn_match_indices, num_gts=num_gts, gt_score=gt_score) total_loss.update(dn_loss) else: total_loss.update( {k + '_dn': paddle.to_tensor([0.]) for k in total_loss.keys()}) return total_loss @staticmethod def get_dn_match_indices(labels, dn_positive_idx, dn_num_group): dn_match_indices = [] for i in range(len(labels)): num_gt = len(labels[i]) if num_gt > 0: gt_idx = paddle.arange(end=num_gt, dtype="int64") gt_idx = gt_idx.tile([dn_num_group]) assert len(dn_positive_idx[i]) == len(gt_idx) dn_match_indices.append((dn_positive_idx[i], gt_idx)) else: dn_match_indices.append((paddle.zeros( [0], dtype="int64"), paddle.zeros( [0], dtype="int64"))) return dn_match_indices @register class DINOv3Loss(DETRLoss): def forward(self, boxes, logits, gt_bbox, gt_class, masks=None, gt_mask=None, postfix="", dn_out_bboxes=None, dn_out_logits=None, dn_meta=None, gt_score=None, o2m=1, **kwargs): if o2m != 1: gt_boxes_copy = [box.tile([o2m, 1]) for box in gt_bbox] gt_class_copy = [label.tile([o2m, 1]) for label in gt_class] else: gt_boxes_copy = gt_bbox gt_class_copy = gt_class num_gts_copy = self._get_num_gts(gt_class_copy) total_loss = self._get_prediction_loss( boxes[-1], logits[-1], gt_boxes_copy, gt_class_copy, masks=masks[-1] if masks is not None else None, gt_mask=gt_mask, postfix=postfix, dn_match_indices=None, num_gts=num_gts_copy, gt_score=gt_score if gt_score is not None else None) if self.aux_loss: total_loss.update( self._get_loss_aux( boxes[:-1], logits[:-1], gt_boxes_copy, gt_class_copy, self.num_classes, num_gts_copy, dn_match_indices=None, postfix=postfix, masks=masks[:-1] if masks is not None else None, gt_mask=gt_mask, gt_score=gt_score if gt_score is not None else None)) if dn_meta is not None: num_gts = self._get_num_gts(gt_class) dn_positive_idx, dn_num_group = \ dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] assert len(gt_class) == len(dn_positive_idx) # denoising match indices dn_match_indices = self.get_dn_match_indices( gt_class, dn_positive_idx, dn_num_group) # compute denoising training loss num_gts *= dn_num_group dn_loss = super(DINOv3Loss, self).forward( dn_out_bboxes, dn_out_logits, gt_bbox, gt_class, postfix="_dn", dn_match_indices=dn_match_indices, num_gts=num_gts, gt_score=gt_score) total_loss.update(dn_loss) else: total_loss.update( {k + '_dn': paddle.to_tensor([0.]) for k in total_loss.keys()}) return total_loss @staticmethod def get_dn_match_indices(labels, dn_positive_idx, dn_num_group): dn_match_indices = [] for i in 
range(len(labels)): num_gt = len(labels[i]) if num_gt > 0: gt_idx = paddle.arange(end=num_gt, dtype="int64") gt_idx = gt_idx.tile([dn_num_group]) assert len(dn_positive_idx[i]) == len(gt_idx) dn_match_indices.append((dn_positive_idx[i], gt_idx)) else: dn_match_indices.append((paddle.zeros( [0], dtype="int64"), paddle.zeros( [0], dtype="int64"))) return dn_match_indices @register class MaskDINOLoss(DETRLoss): __shared__ = ['num_classes', 'use_focal_loss', 'num_sample_points'] __inject__ = ['matcher'] def __init__(self, num_classes=80, matcher='HungarianMatcher', loss_coeff={ 'class': 4, 'bbox': 5, 'giou': 2, 'mask': 5, 'dice': 5 }, aux_loss=True, use_focal_loss=False, use_vfl=False, vfl_iou_type='bbox', num_sample_points=12544, oversample_ratio=3.0, important_sample_ratio=0.75): super(MaskDINOLoss, self).__init__(num_classes, matcher, loss_coeff, aux_loss, use_focal_loss, use_vfl, vfl_iou_type) assert oversample_ratio >= 1 assert important_sample_ratio <= 1 and important_sample_ratio >= 0 self.num_sample_points = num_sample_points self.oversample_ratio = oversample_ratio self.important_sample_ratio = important_sample_ratio self.num_oversample_points = int(num_sample_points * oversample_ratio) self.num_important_points = int(num_sample_points * important_sample_ratio) self.num_random_points = num_sample_points - self.num_important_points def forward(self, boxes, logits, gt_bbox, gt_class, masks=None, gt_mask=None, postfix="", dn_out_bboxes=None, dn_out_logits=None, dn_out_masks=None, dn_meta=None, **kwargs): num_gts = self._get_num_gts(gt_class) total_loss = super(MaskDINOLoss, self).forward( boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask, num_gts=num_gts) if dn_meta is not None: dn_positive_idx, dn_num_group = \ dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] assert len(gt_class) == len(dn_positive_idx) # denoising match indices dn_match_indices = DINOLoss.get_dn_match_indices( gt_class, dn_positive_idx, dn_num_group) # compute denoising training loss num_gts *= dn_num_group dn_loss = super(MaskDINOLoss, self).forward( dn_out_bboxes, dn_out_logits, gt_bbox, gt_class, masks=dn_out_masks, gt_mask=gt_mask, postfix="_dn", dn_match_indices=dn_match_indices, num_gts=num_gts) total_loss.update(dn_loss) else: total_loss.update( {k + '_dn': paddle.to_tensor([0.]) for k in total_loss.keys()}) return total_loss def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts, postfix=""): # masks: [b, query, h, w], gt_mask: list[[n, H, W]] name_mask = "loss_mask" + postfix name_dice = "loss_dice" + postfix loss = dict() if sum(len(a) for a in gt_mask) == 0: loss[name_mask] = paddle.to_tensor([0.]) loss[name_dice] = paddle.to_tensor([0.]) return loss src_masks, target_masks = self._get_src_target_assign(masks, gt_mask, match_indices) # sample points sample_points = self._get_point_coords_by_uncertainty(src_masks) sample_points = 2.0 * sample_points.unsqueeze(1) - 1.0 src_masks = F.grid_sample( src_masks.unsqueeze(1), sample_points, align_corners=False).squeeze([1, 2]) target_masks = F.grid_sample( target_masks.unsqueeze(1), sample_points, align_corners=False).squeeze([1, 2]).detach() loss[name_mask] = self.loss_coeff[ 'mask'] * F.binary_cross_entropy_with_logits( src_masks, target_masks, reduction='none').mean(1).sum() / num_gts loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss( src_masks, target_masks, num_gts) return loss def _get_point_coords_by_uncertainty(self, masks): # Sample points based on their uncertainty. 
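        # PointRend-style importance sampling: oversample random points, keep
        # the num_important_points whose mask logits are closest to 0 (largest
        # uncertainty, scored as -|logit|), then top up with num_random_points
        # fresh uniform points.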
masks = masks.detach() num_masks = masks.shape[0] sample_points = paddle.rand( [num_masks, 1, self.num_oversample_points, 2]) out_mask = F.grid_sample( masks.unsqueeze(1), 2.0 * sample_points - 1.0, align_corners=False).squeeze([1, 2]) out_mask = -paddle.abs(out_mask) _, topk_ind = paddle.topk(out_mask, self.num_important_points, axis=1) batch_ind = paddle.arange(end=num_masks, dtype=topk_ind.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_important_points]) topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) sample_points = paddle.gather_nd(sample_points.squeeze(1), topk_ind) if self.num_random_points > 0: sample_points = paddle.concat( [ sample_points, paddle.rand([num_masks, self.num_random_points, 2]) ], axis=1) return sample_points ================================================ FILE: ppdet/modeling/losses/fairmot_loss.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn from paddle.nn.initializer import Constant from ppdet.core.workspace import register __all__ = ['FairMOTLoss'] @register class FairMOTLoss(nn.Layer): def __init__(self): super(FairMOTLoss, self).__init__() self.det_weight = self.create_parameter( shape=[1], default_initializer=Constant(-1.85)) self.reid_weight = self.create_parameter( shape=[1], default_initializer=Constant(-1.05)) def forward(self, det_loss, reid_loss): loss = paddle.exp(-self.det_weight) * det_loss + paddle.exp( -self.reid_weight) * reid_loss + (self.det_weight + self.reid_weight ) loss *= 0.5 return {'loss': loss} ================================================ FILE: ppdet/modeling/losses/fcos_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
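# The FCOS losses below share one decomposition: sigmoid focal loss on the
# per-point classification logits, an IoU-based term on the (l, t, r, b)
# offsets of positive points, and a BCE quality branch (centerness or IoU).
# A minimal sketch of the GIoU variant computed by _iou_loss, assuming
# aligned (l, t, r, b) offset tensors for prediction p and target t:
#
#   area_p = (pl + pr) * (pt + pb);  area_t = (tl + tr) * (tt + tb)
#   inter = (min(pl, tl) + min(pr, tr)) * (min(pt, tt) + min(pb, tb))
#   union = area_p + area_t - inter
#   enclose = (max(pl, tl) + max(pr, tr)) * (max(pt, tt) + max(pb, tb))
#   iou = (inter + 1) / (union + 1)
#   loss_giou = 1 - (iou - (enclose - union) / enclose)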
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling import ops from functools import partial __all__ = ['FCOSLoss', 'FCOSLossMILC', 'FCOSLossCR'] def flatten_tensor(inputs, channel_first=False): """ Flatten a Tensor Args: inputs (Tensor): 4-D Tensor with shape [N, C, H, W] or [N, H, W, C] channel_first (bool): If true the dimension order of Tensor is [N, C, H, W], otherwise is [N, H, W, C] Return: output_channel_last (Tensor): The flattened Tensor in channel_last style """ if channel_first: input_channel_last = paddle.transpose(inputs, perm=[0, 2, 3, 1]) else: input_channel_last = inputs output_channel_last = paddle.flatten( input_channel_last, start_axis=0, stop_axis=2) return output_channel_last @register class FCOSLoss(nn.Layer): """ FCOSLoss Args: loss_alpha (float): alpha in focal loss loss_gamma (float): gamma in focal loss iou_loss_type (str): location loss type, IoU/GIoU/LINEAR_IoU reg_weights (float): weight for location loss quality (str): quality branch, centerness/iou """ def __init__(self, loss_alpha=0.25, loss_gamma=2.0, iou_loss_type="giou", reg_weights=1.0, quality='centerness'): super(FCOSLoss, self).__init__() self.loss_alpha = loss_alpha self.loss_gamma = loss_gamma self.iou_loss_type = iou_loss_type self.reg_weights = reg_weights self.quality = quality def _iou_loss(self, pred, targets, positive_mask, weights=None, return_iou=False): """ Calculate the loss for location prediction Args: pred (Tensor): bounding boxes prediction targets (Tensor): targets for positive samples positive_mask (Tensor): mask of positive samples weights (Tensor): weights for each positive samples Return: loss (Tensor): location loss """ plw = pred[:, 0] * positive_mask pth = pred[:, 1] * positive_mask prw = pred[:, 2] * positive_mask pbh = pred[:, 3] * positive_mask tlw = targets[:, 0] * positive_mask tth = targets[:, 1] * positive_mask trw = targets[:, 2] * positive_mask tbh = targets[:, 3] * positive_mask tlw.stop_gradient = True trw.stop_gradient = True tth.stop_gradient = True tbh.stop_gradient = True ilw = paddle.minimum(plw, tlw) irw = paddle.minimum(prw, trw) ith = paddle.minimum(pth, tth) ibh = paddle.minimum(pbh, tbh) clw = paddle.maximum(plw, tlw) crw = paddle.maximum(prw, trw) cth = paddle.maximum(pth, tth) cbh = paddle.maximum(pbh, tbh) area_predict = (plw + prw) * (pth + pbh) area_target = (tlw + trw) * (tth + tbh) area_inter = (ilw + irw) * (ith + ibh) ious = (area_inter + 1.0) / ( area_predict + area_target - area_inter + 1.0) ious = ious * positive_mask if return_iou: return ious if self.iou_loss_type.lower() == "linear_iou": loss = 1.0 - ious elif self.iou_loss_type.lower() == "giou": area_uniou = area_predict + area_target - area_inter area_circum = (clw + crw) * (cth + cbh) + 1e-7 giou = ious - (area_circum - area_uniou) / area_circum loss = 1.0 - giou elif self.iou_loss_type.lower() == "iou": loss = 0.0 - paddle.log(ious) else: raise KeyError if weights is not None: loss = loss * weights return loss def forward(self, cls_logits, bboxes_reg, centerness, tag_labels, tag_bboxes, tag_center): """ Calculate the loss for classification, location and centerness Args: cls_logits (list): list of Tensor, which is predicted score for all anchor points with shape [N, M, C] bboxes_reg (list): list of Tensor, which is predicted offsets for all anchor points with shape [N, M, 4] centerness (list): list of 
Tensor, which is predicted centerness for all anchor points with shape [N, M, 1] tag_labels (list): list of Tensor, which is category targets for each anchor point tag_bboxes (list): list of Tensor, which is bounding boxes targets for positive samples tag_center (list): list of Tensor, which is centerness targets for positive samples Return: loss (dict): loss composed by classification loss, bounding box """ cls_logits_flatten_list = [] bboxes_reg_flatten_list = [] centerness_flatten_list = [] tag_labels_flatten_list = [] tag_bboxes_flatten_list = [] tag_center_flatten_list = [] num_lvl = len(cls_logits) for lvl in range(num_lvl): cls_logits_flatten_list.append( flatten_tensor(cls_logits[lvl], True)) bboxes_reg_flatten_list.append( flatten_tensor(bboxes_reg[lvl], True)) centerness_flatten_list.append( flatten_tensor(centerness[lvl], True)) tag_labels_flatten_list.append( flatten_tensor(tag_labels[lvl], False)) tag_bboxes_flatten_list.append( flatten_tensor(tag_bboxes[lvl], False)) tag_center_flatten_list.append( flatten_tensor(tag_center[lvl], False)) cls_logits_flatten = paddle.concat(cls_logits_flatten_list, axis=0) bboxes_reg_flatten = paddle.concat(bboxes_reg_flatten_list, axis=0) centerness_flatten = paddle.concat(centerness_flatten_list, axis=0) tag_labels_flatten = paddle.concat(tag_labels_flatten_list, axis=0) tag_bboxes_flatten = paddle.concat(tag_bboxes_flatten_list, axis=0) tag_center_flatten = paddle.concat(tag_center_flatten_list, axis=0) tag_labels_flatten.stop_gradient = True tag_bboxes_flatten.stop_gradient = True tag_center_flatten.stop_gradient = True mask_positive_bool = tag_labels_flatten > 0 mask_positive_bool.stop_gradient = True mask_positive_float = paddle.cast(mask_positive_bool, dtype="float32") mask_positive_float.stop_gradient = True num_positive_fp32 = paddle.sum(mask_positive_float) num_positive_fp32.stop_gradient = True num_positive_int32 = paddle.cast(num_positive_fp32, dtype="int32") num_positive_int32 = num_positive_int32 * 0 + 1 num_positive_int32.stop_gradient = True normalize_sum = paddle.sum(tag_center_flatten * mask_positive_float) normalize_sum.stop_gradient = True # 1. cls_logits: sigmoid_focal_loss # expand onehot labels num_classes = cls_logits_flatten.shape[-1] tag_labels_flatten = paddle.squeeze(tag_labels_flatten, axis=-1) tag_labels_flatten_bin = F.one_hot( tag_labels_flatten, num_classes=1 + num_classes) tag_labels_flatten_bin = tag_labels_flatten_bin[:, 1:] # sigmoid_focal_loss cls_loss = F.sigmoid_focal_loss( cls_logits_flatten, tag_labels_flatten_bin) / num_positive_fp32 if self.quality == 'centerness': # 2. bboxes_reg: giou_loss mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1) tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1) reg_loss = self._iou_loss( bboxes_reg_flatten, tag_bboxes_flatten, mask_positive_float, weights=tag_center_flatten) reg_loss = reg_loss * mask_positive_float / normalize_sum # 3. centerness: sigmoid_cross_entropy_with_logits_loss centerness_flatten = paddle.squeeze(centerness_flatten, axis=-1) quality_loss = ops.sigmoid_cross_entropy_with_logits( centerness_flatten, tag_center_flatten) quality_loss = quality_loss * mask_positive_float / num_positive_fp32 elif self.quality == 'iou': # 2. 
bboxes_reg: giou_loss
            mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1)
            tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1)
            reg_loss = self._iou_loss(
                bboxes_reg_flatten,
                tag_bboxes_flatten,
                mask_positive_float,
                weights=None)
            reg_loss = reg_loss * mask_positive_float / num_positive_fp32
            # num_positive_fp32 is num_foreground

            # 3. centerness: sigmoid_cross_entropy_with_logits_loss
            centerness_flatten = paddle.squeeze(centerness_flatten, axis=-1)
            gt_ious = self._iou_loss(
                bboxes_reg_flatten,
                tag_bboxes_flatten,
                mask_positive_float,
                weights=None,
                return_iou=True)
            quality_loss = ops.sigmoid_cross_entropy_with_logits(
                centerness_flatten, gt_ious)
            quality_loss = quality_loss * mask_positive_float / num_positive_fp32
        else:
            raise Exception(f'Unknown quality type: {self.quality}')

        loss_all = {
            "loss_cls": paddle.sum(cls_loss),
            "loss_box": paddle.sum(reg_loss),
            "loss_quality": paddle.sum(quality_loss),
        }
        return loss_all


@register
class FCOSLossMILC(FCOSLoss):
    """
    FCOSLossMILC for ARSL in semi-supervised detection (SSOD)
    Args:
        loss_alpha (float): alpha in focal loss
        loss_gamma (float): gamma in focal loss
        iou_loss_type (str): location loss type, IoU/GIoU/LINEAR_IoU
        reg_weights (float): weight for location loss
    """

    def __init__(self,
                 loss_alpha=0.25,
                 loss_gamma=2.0,
                 iou_loss_type="giou",
                 reg_weights=1.0):
        super(FCOSLossMILC, self).__init__()
        self.loss_alpha = loss_alpha
        self.loss_gamma = loss_gamma
        self.iou_loss_type = iou_loss_type
        self.reg_weights = reg_weights

    def iou_loss(self, pred, targets, weights=None, avg_factor=None):
        """
        Calculate the loss for location prediction
        Args:
            pred (Tensor): bounding boxes prediction
            targets (Tensor): targets for positive samples
            weights (Tensor): weights for each positive sample
        Return:
            loss (Tensor): location loss
        """
        plw = pred[:, 0]
        pth = pred[:, 1]
        prw = pred[:, 2]
        pbh = pred[:, 3]

        tlw = targets[:, 0]
        tth = targets[:, 1]
        trw = targets[:, 2]
        tbh = targets[:, 3]
        tlw.stop_gradient = True
        trw.stop_gradient = True
        tth.stop_gradient = True
        tbh.stop_gradient = True

        ilw = paddle.minimum(plw, tlw)
        irw = paddle.minimum(prw, trw)
        ith = paddle.minimum(pth, tth)
        ibh = paddle.minimum(pbh, tbh)

        clw = paddle.maximum(plw, tlw)
        crw = paddle.maximum(prw, trw)
        cth = paddle.maximum(pth, tth)
        cbh = paddle.maximum(pbh, tbh)

        area_predict = (plw + prw) * (pth + pbh)
        area_target = (tlw + trw) * (tth + tbh)
        area_inter = (ilw + irw) * (ith + ibh)
        ious = (area_inter + 1.0) / (
            area_predict + area_target - area_inter + 1.0)

        if self.iou_loss_type.lower() == "linear_iou":
            loss = 1.0 - ious
        elif self.iou_loss_type.lower() == "giou":
            area_uniou = area_predict + area_target - area_inter
            area_circum = (clw + crw) * (cth + cbh) + 1e-7
            giou = ious - (area_circum - area_uniou) / area_circum
            loss = 1.0 - giou
        elif self.iou_loss_type.lower() == "iou":
            loss = 0.0 - paddle.log(ious)
        else:
            raise KeyError
        if weights is not None:
            loss = loss * weights
        loss = paddle.sum(loss)
        if avg_factor is not None:
            loss = loss / avg_factor
        return loss

    # temp function: calculate iou between bbox and target
    def _bbox_overlap_align(self, pred, targets):
        assert pred.shape[0] == targets.shape[0], \
            'the pred should be aligned with target.'
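        # pred and target offsets are measured from the same anchor point, so
        # the aligned IoU reduces to per-side min arithmetic (with the same
        # +1.0 smoothing as _iou_loss and no enclosing-box term).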
plw = pred[:, 0] pth = pred[:, 1] prw = pred[:, 2] pbh = pred[:, 3] tlw = targets[:, 0] tth = targets[:, 1] trw = targets[:, 2] tbh = targets[:, 3] ilw = paddle.minimum(plw, tlw) irw = paddle.minimum(prw, trw) ith = paddle.minimum(pth, tth) ibh = paddle.minimum(pbh, tbh) area_predict = (plw + prw) * (pth + pbh) area_target = (tlw + trw) * (tth + tbh) area_inter = (ilw + irw) * (ith + ibh) ious = (area_inter + 1.0) / ( area_predict + area_target - area_inter + 1.0) return ious def iou_based_soft_label_loss(self, pred, target, alpha=0.75, gamma=2.0, iou_weighted=False, implicit_iou=None, avg_factor=None): assert pred.shape == target.shape pred = F.sigmoid(pred) target = target.cast(pred.dtype) if implicit_iou is not None: pred = pred * implicit_iou if iou_weighted: focal_weight = (pred - target).abs().pow(gamma) * target * (target > 0.0).cast('float32') + \ alpha * (pred - target).abs().pow(gamma) * \ (target <= 0.0).cast('float32') else: focal_weight = (pred - target).abs().pow(gamma) * (target > 0.0).cast('float32') + \ alpha * (pred - target).abs().pow(gamma) * \ (target <= 0.0).cast('float32') # focal loss loss = F.binary_cross_entropy( pred, target, reduction='none') * focal_weight if avg_factor is not None: loss = loss / avg_factor return loss def forward(self, cls_logits, bboxes_reg, centerness, tag_labels, tag_bboxes, tag_center): """ Calculate the loss for classification, location and centerness Args: cls_logits (list): list of Tensor, which is predicted score for all anchor points with shape [N, M, C] bboxes_reg (list): list of Tensor, which is predicted offsets for all anchor points with shape [N, M, 4] centerness (list): list of Tensor, which is predicted centerness for all anchor points with shape [N, M, 1] tag_labels (list): list of Tensor, which is category targets for each anchor point tag_bboxes (list): list of Tensor, which is bounding boxes targets for positive samples tag_center (list): list of Tensor, which is centerness targets for positive samples Return: loss (dict): loss composed by classification loss, bounding box """ cls_logits_flatten_list = [] bboxes_reg_flatten_list = [] centerness_flatten_list = [] tag_labels_flatten_list = [] tag_bboxes_flatten_list = [] tag_center_flatten_list = [] num_lvl = len(cls_logits) for lvl in range(num_lvl): cls_logits_flatten_list.append( flatten_tensor(cls_logits[lvl], True)) bboxes_reg_flatten_list.append( flatten_tensor(bboxes_reg[lvl], True)) centerness_flatten_list.append( flatten_tensor(centerness[lvl], True)) tag_labels_flatten_list.append( flatten_tensor(tag_labels[lvl], False)) tag_bboxes_flatten_list.append( flatten_tensor(tag_bboxes[lvl], False)) tag_center_flatten_list.append( flatten_tensor(tag_center[lvl], False)) cls_logits_flatten = paddle.concat(cls_logits_flatten_list, axis=0) bboxes_reg_flatten = paddle.concat(bboxes_reg_flatten_list, axis=0) centerness_flatten = paddle.concat(centerness_flatten_list, axis=0) tag_labels_flatten = paddle.concat(tag_labels_flatten_list, axis=0) tag_bboxes_flatten = paddle.concat(tag_bboxes_flatten_list, axis=0) tag_center_flatten = paddle.concat(tag_center_flatten_list, axis=0) tag_labels_flatten.stop_gradient = True tag_bboxes_flatten.stop_gradient = True tag_center_flatten.stop_gradient = True # find positive index mask_positive_bool = tag_labels_flatten > 0 mask_positive_bool.stop_gradient = True mask_positive_float = paddle.cast(mask_positive_bool, dtype="float32") mask_positive_float.stop_gradient = True num_positive_fp32 = paddle.sum(mask_positive_float) 
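        # num_positive_fp32 counts foreground points across the batch; it
        # normalizes the classification and iou losses below, while
        # normalize_sum (the centerness-weighted positive count) normalizes
        # the box regression loss.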
num_positive_fp32.stop_gradient = True num_positive_int32 = paddle.cast(num_positive_fp32, dtype="int32") num_positive_int32 = num_positive_int32 * 0 + 1 num_positive_int32.stop_gradient = True # centerness target is used as reg weight normalize_sum = paddle.sum(tag_center_flatten * mask_positive_float) normalize_sum.stop_gradient = True # 1. IoU-Based soft label loss # calculate iou with paddle.no_grad(): pos_ind = paddle.nonzero( tag_labels_flatten.reshape([-1]) > 0).reshape([-1]) pos_pred = bboxes_reg_flatten[pos_ind] pos_target = tag_bboxes_flatten[pos_ind] bbox_iou = self._bbox_overlap_align(pos_pred, pos_target) # pos labels pos_labels = tag_labels_flatten[pos_ind].squeeze(1) cls_target = paddle.zeros(cls_logits_flatten.shape) cls_target[pos_ind, pos_labels - 1] = bbox_iou cls_loss = self.iou_based_soft_label_loss( cls_logits_flatten, cls_target, implicit_iou=F.sigmoid(centerness_flatten), avg_factor=num_positive_fp32) # 2. bboxes_reg: giou_loss mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1) tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1) reg_loss = self._iou_loss( bboxes_reg_flatten, tag_bboxes_flatten, mask_positive_float, weights=tag_center_flatten) reg_loss = reg_loss * mask_positive_float / normalize_sum # 3. iou loss pos_iou_pred = paddle.squeeze(centerness_flatten, axis=-1)[pos_ind] loss_iou = ops.sigmoid_cross_entropy_with_logits(pos_iou_pred, bbox_iou) loss_iou = loss_iou / num_positive_fp32 * 0.5 loss_all = { "loss_cls": paddle.sum(cls_loss), "loss_box": paddle.sum(reg_loss), 'loss_iou': paddle.sum(loss_iou), } return loss_all # Concat multi-level feature maps by image def levels_to_images(mlvl_tensor): batch_size = mlvl_tensor[0].shape[0] batch_list = [[] for _ in range(batch_size)] channels = mlvl_tensor[0].shape[1] for t in mlvl_tensor: t = t.transpose([0, 2, 3, 1]) t = t.reshape([batch_size, -1, channels]) for img in range(batch_size): batch_list[img].append(t[img]) return [paddle.concat(item, axis=0) for item in batch_list] def multi_apply(func, *args, **kwargs): """Apply function to a list of arguments. Note: This function applies the ``func`` to multiple inputs and map the multiple outputs of the ``func`` into different list. Each list contains the same type of outputs corresponding to different inputs. 
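    Example (illustrative, with a hypothetical toy function):
        >>> add_mul = lambda a, b: (a + b, a * b)
        >>> sums, prods = multi_apply(add_mul, [1, 2], [3, 4])
        >>> # sums == [4, 6], prods == [3, 8]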
    Args:
        func (Function): A function that will be applied to a list of
            arguments
    Returns:
        tuple(list): A tuple containing multiple list, each list contains \
            a kind of returned results by the function
    """
    pfunc = partial(func, **kwargs) if kwargs else func
    map_results = map(pfunc, *args)
    return tuple(map(list, zip(*map_results)))


@register
class FCOSLossCR(FCOSLossMILC):
    """
    FCOSLoss of Consistency Regularization
    """

    def __init__(self,
                 iou_loss_type="giou",
                 cls_weight=2.0,
                 reg_weight=2.0,
                 iou_weight=0.5,
                 hard_neg_mining_flag=True):
        super(FCOSLossCR, self).__init__()
        self.iou_loss_type = iou_loss_type
        self.cls_weight = cls_weight
        self.reg_weight = reg_weight
        self.iou_weight = iou_weight
        self.hard_neg_mining_flag = hard_neg_mining_flag

    def iou_loss(self, pred, targets, weights=None, avg_factor=None):
        """
        Calculate the loss for location prediction
        Args:
            pred (Tensor): bounding boxes prediction
            targets (Tensor): targets for positive samples
            weights (Tensor): weights for each positive samples
        Return:
            loss (Tensor): location loss
        """
        plw = pred[:, 0]
        pth = pred[:, 1]
        prw = pred[:, 2]
        pbh = pred[:, 3]

        tlw = targets[:, 0]
        tth = targets[:, 1]
        trw = targets[:, 2]
        tbh = targets[:, 3]
        tlw.stop_gradient = True
        trw.stop_gradient = True
        tth.stop_gradient = True
        tbh.stop_gradient = True

        ilw = paddle.minimum(plw, tlw)
        irw = paddle.minimum(prw, trw)
        ith = paddle.minimum(pth, tth)
        ibh = paddle.minimum(pbh, tbh)

        clw = paddle.maximum(plw, tlw)
        crw = paddle.maximum(prw, trw)
        cth = paddle.maximum(pth, tth)
        cbh = paddle.maximum(pbh, tbh)

        area_predict = (plw + prw) * (pth + pbh)
        area_target = (tlw + trw) * (tth + tbh)
        area_inter = (ilw + irw) * (ith + ibh)
        ious = (area_inter + 1.0) / (
            area_predict + area_target - area_inter + 1.0)

        if self.iou_loss_type.lower() == "linear_iou":
            loss = 1.0 - ious
        elif self.iou_loss_type.lower() == "giou":
            area_uniou = area_predict + area_target - area_inter
            area_circum = (clw + crw) * (cth + cbh) + 1e-7
            giou = ious - (area_circum - area_uniou) / area_circum
            loss = 1.0 - giou
        elif self.iou_loss_type.lower() == "iou":
            loss = 0.0 - paddle.log(ious)
        else:
            raise KeyError
        if weights is not None:
            loss = loss * weights
        loss = paddle.sum(loss)
        if avg_factor is not None:
            loss = loss / avg_factor
        return loss

    # calculate iou between bbox and target
    def bbox_overlap_align(self, pred, targets):
        assert pred.shape[0] == targets.shape[0], \
            'the pred should be aligned with target.'
        plw = pred[:, 0]
        pth = pred[:, 1]
        prw = pred[:, 2]
        pbh = pred[:, 3]

        tlw = targets[:, 0]
        tth = targets[:, 1]
        trw = targets[:, 2]
        tbh = targets[:, 3]

        ilw = paddle.minimum(plw, tlw)
        irw = paddle.minimum(prw, trw)
        ith = paddle.minimum(pth, tth)
        ibh = paddle.minimum(pbh, tbh)

        area_predict = (plw + prw) * (pth + pbh)
        area_target = (tlw + trw) * (tth + tbh)
        area_inter = (ilw + irw) * (ith + ibh)
        ious = (area_inter + 1.0) / (
            area_predict + area_target - area_inter + 1.0)
        return ious

    # cls loss: iou-based soft label with joint iou
    def quality_focal_loss(self,
                           stu_cls,
                           targets,
                           quality=None,
                           weights=None,
                           alpha=0.75,
                           gamma=2.0,
                           avg_factor=None):
        # NOTE: avg_factor originally defaulted to the string 'sum', which
        # would break `loss / avg_factor`; all callers pass a float
        # normalizer, so None is the safe default.
        stu_cls = F.sigmoid(stu_cls)
        if quality is not None:
            stu_cls = stu_cls * F.sigmoid(quality)
        focal_weight = (stu_cls - targets).abs().pow(gamma) * (targets > 0.0).cast('float32') + \
            alpha * (stu_cls - targets).abs().pow(gamma) * \
            (targets <= 0.0).cast('float32')
        loss = F.binary_cross_entropy(
            stu_cls, targets, reduction='none') * focal_weight
        if weights is not None:
            loss = loss * weights.reshape([-1, 1])
        loss = paddle.sum(loss)
        if avg_factor is not None:
            loss = loss / avg_factor
        return loss

    # generate points according to feature maps
    def compute_locations_by_level(self, fpn_stride, h, w):
        """
        Compute locations of anchor points of each FPN layer
        Return:
            Anchor points locations of current FPN feature map
        """
        shift_x = paddle.arange(0, w * fpn_stride, fpn_stride)
        shift_y = paddle.arange(0, h * fpn_stride, fpn_stride)
        shift_x = paddle.unsqueeze(shift_x, axis=0)
        shift_y = paddle.unsqueeze(shift_y, axis=1)
        shift_x = paddle.expand(shift_x, shape=[h, w])
        shift_y = paddle.expand(shift_y, shape=[h, w])
        shift_x = paddle.reshape(shift_x, shape=[-1])
        shift_y = paddle.reshape(shift_y, shape=[-1])
        location = paddle.stack(
            [shift_x, shift_y], axis=-1) + float(fpn_stride) / 2
        return location

    # decode bbox from ltrb to x1y1x2y2
    def decode_bbox(self, ltrb, points):
        assert ltrb.shape[0] == points.shape[0], \
            "When decoding bbox in one image, the num of loc should be same with points."
        bbox_decoding = paddle.stack(
            [
                points[:, 0] - ltrb[:, 0], points[:, 1] - ltrb[:, 1],
                points[:, 0] + ltrb[:, 2], points[:, 1] + ltrb[:, 3]
            ],
            axis=1)
        return bbox_decoding

    # encode bbox from x1y1x2y2 to ltrb
    def encode_bbox(self, bbox, points):
        assert bbox.shape[0] == points.shape[0], \
            "When encoding bbox in one image, the num of bbox should be same with points."
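        # ltrb offsets are measured from each anchor point out to the four
        # box sides, mirroring decode_bbox above.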
bbox_encoding = paddle.stack( [ points[:, 0] - bbox[:, 0], points[:, 1] - bbox[:, 1], bbox[:, 2] - points[:, 0], bbox[:, 3] - points[:, 1] ], axis=1) return bbox_encoding def calcualate_iou(self, gt_bbox, predict_bbox): # bbox area gt_area = (gt_bbox[:, 2] - gt_bbox[:, 0]) * \ (gt_bbox[:, 3] - gt_bbox[:, 1]) predict_area = (predict_bbox[:, 2] - predict_bbox[:, 0]) * \ (predict_bbox[:, 3] - predict_bbox[:, 1]) # overlop area lt = paddle.fmax(gt_bbox[:, None, :2], predict_bbox[None, :, :2]) rb = paddle.fmin(gt_bbox[:, None, 2:], predict_bbox[None, :, 2:]) wh = paddle.clip(rb - lt, min=0) overlap = wh[..., 0] * wh[..., 1] # iou iou = overlap / (gt_area[:, None] + predict_area[None, :] - overlap) return iou # select potential positives from hard negatives def hard_neg_mining(self, cls_score, loc_ltrb, quality, pos_ind, hard_neg_ind, loc_mask, loc_targets, iou_thresh=0.6): # get points locations and strides points_list = [] strides_list = [] scale_list = [] scale = [0, 1, 2, 3, 4] for fpn_scale, fpn_stride, HW in zip(scale, self.fpn_stride, self.lvl_hw): h, w = HW lvl_points = self.compute_locations_by_level(fpn_stride, h, w) points_list.append(lvl_points) lvl_strides = paddle.full([h * w, 1], fpn_stride) strides_list.append(lvl_strides) lvl_scales = paddle.full([h * w, 1], fpn_scale) scale_list.append(lvl_scales) points = paddle.concat(points_list, axis=0) strides = paddle.concat(strides_list, axis=0) scales = paddle.concat(scale_list, axis=0) # cls scores cls_vals = F.sigmoid(cls_score) * F.sigmoid(quality) max_vals = paddle.max(cls_vals, axis=-1) class_ind = paddle.argmax(cls_vals, axis=-1) ### calculate iou between positive and hard negative # decode pos bbox pos_cls = max_vals[pos_ind] pos_loc = loc_ltrb[pos_ind].reshape([-1, 4]) pos_strides = strides[pos_ind] pos_points = points[pos_ind].reshape([-1, 2]) pos_loc = pos_loc * pos_strides pos_bbox = self.decode_bbox(pos_loc, pos_points) pos_scales = scales[pos_ind] # decode hard negative bbox hard_neg_loc = loc_ltrb[hard_neg_ind].reshape([-1, 4]) hard_neg_strides = strides[hard_neg_ind] hard_neg_points = points[hard_neg_ind].reshape([-1, 2]) hard_neg_loc = hard_neg_loc * hard_neg_strides hard_neg_bbox = self.decode_bbox(hard_neg_loc, hard_neg_points) hard_neg_scales = scales[hard_neg_ind] # iou between pos bbox and hard negative bbox hard_neg_pos_iou = self.calcualate_iou(hard_neg_bbox, pos_bbox) ### select potential positives from hard negatives # scale flag scale_temp = paddle.abs( pos_scales.reshape([-1])[None, :] - hard_neg_scales.reshape([-1]) [:, None]) scale_flag = (scale_temp <= 1.) # iou flag iou_flag = (hard_neg_pos_iou >= iou_thresh) # same class flag pos_class = class_ind[pos_ind] hard_neg_class = class_ind[hard_neg_ind] class_flag = pos_class[None, :] - hard_neg_class[:, None] class_flag = (class_flag == 0) # hard negative point inside positive bbox flag ltrb_temp = paddle.stack( [ hard_neg_points[:, None, 0] - pos_bbox[None, :, 0], hard_neg_points[:, None, 1] - pos_bbox[None, :, 1], pos_bbox[None, :, 2] - hard_neg_points[:, None, 0], pos_bbox[None, :, 3] - hard_neg_points[:, None, 1] ], axis=-1) inside_flag = ltrb_temp.min(axis=-1) > 0 # reset iou valid_flag = (iou_flag & class_flag & inside_flag & scale_flag) invalid_iou = paddle.zeros_like(hard_neg_pos_iou) hard_neg_pos_iou = paddle.where(valid_flag, hard_neg_pos_iou, invalid_iou) pos_hard_neg_max_iou = hard_neg_pos_iou.max(axis=-1) # selece potential pos potential_pos_ind = (pos_hard_neg_max_iou > 0.) 
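        # hard negatives keep a non-zero IoU only if they pass all four
        # checks above (adjacent FPN scale, IoU >= thresh, same class, point
        # inside the positive box); any with max IoU > 0 become potential
        # positives for the localization branch.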
num_potential_pos = paddle.nonzero(potential_pos_ind).shape[0] if num_potential_pos == 0: return None ### calculate loc target:aggregate all matching bboxes as the bbox targets of potential pos # prepare data potential_points = hard_neg_points[potential_pos_ind].reshape([-1, 2]) potential_strides = hard_neg_strides[potential_pos_ind] potential_valid_flag = valid_flag[potential_pos_ind] potential_pos_ind = hard_neg_ind[potential_pos_ind] # get cls and box of matching positives pos_cls = max_vals[pos_ind] expand_pos_bbox = paddle.expand( pos_bbox, shape=[num_potential_pos, pos_bbox.shape[0], pos_bbox.shape[1]]) expand_pos_cls = paddle.expand( pos_cls, shape=[num_potential_pos, pos_cls.shape[0]]) invalid_cls = paddle.zeros_like(expand_pos_cls) expand_pos_cls = paddle.where(potential_valid_flag, expand_pos_cls, invalid_cls) expand_pos_cls = paddle.unsqueeze(expand_pos_cls, axis=-1) # aggregate box based on cls_score agg_bbox = (expand_pos_bbox * expand_pos_cls).sum(axis=1) \ / expand_pos_cls.sum(axis=1) agg_ltrb = self.encode_bbox(agg_bbox, potential_points) agg_ltrb = agg_ltrb / potential_strides # loc target for all pos loc_targets[potential_pos_ind] = agg_ltrb loc_mask[potential_pos_ind] = 1. return loc_mask, loc_targets # get training targets def get_targets_per_img(self, tea_cls, tea_loc, tea_iou, stu_cls, stu_loc, stu_iou): ### sample selection # prepare datas tea_cls_scores = F.sigmoid(tea_cls) * F.sigmoid(tea_iou) class_ind = paddle.argmax(tea_cls_scores, axis=-1) max_vals = paddle.max(tea_cls_scores, axis=-1) cls_mask = paddle.zeros_like( max_vals ) # set cls valid mask: pos is 1, hard_negative and negative are 0. num_pos, num_hard_neg = 0, 0 # mean-std selection # using nonzero to turn index from bool to int, because the index will be used to compose two-dim index in following. # using squeeze rather than reshape to avoid errors when no score is larger than thresh. candidate_ind = paddle.nonzero(max_vals >= 0.1).squeeze(axis=-1) num_candidate = candidate_ind.shape[0] if num_candidate > 0: # pos thresh = mean + std to select pos samples candidate_score = max_vals[candidate_ind] candidate_score_mean = candidate_score.mean() candidate_score_std = candidate_score.std() pos_thresh = (candidate_score_mean + candidate_score_std).clip( max=0.4) # select pos pos_ind = paddle.nonzero(max_vals >= pos_thresh).squeeze(axis=-1) num_pos = pos_ind.shape[0] # select hard negatives as potential pos hard_neg_ind = (max_vals >= 0.1) & (max_vals < pos_thresh) hard_neg_ind = paddle.nonzero(hard_neg_ind).squeeze(axis=-1) num_hard_neg = hard_neg_ind.shape[0] # if not positive, directly select top-10 as pos. if (num_pos == 0): num_pos = 10 _, pos_ind = paddle.topk(max_vals, k=num_pos) cls_mask[pos_ind] = 1. 
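        # Target construction: positives regress the teacher's joint
        # cls*iou score for their class, hard negatives are supervised with
        # the teacher's full soft score vector (classification only), and
        # box/iou targets are taken from the teacher at positive points.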
### Consistency Regularization Training targets # cls targets pos_class_ind = class_ind[pos_ind] cls_targets = paddle.zeros_like(tea_cls) cls_targets[pos_ind, pos_class_ind] = tea_cls_scores[pos_ind, pos_class_ind] # hard negative cls target if num_hard_neg != 0: cls_targets[hard_neg_ind] = tea_cls_scores[hard_neg_ind] # loc targets loc_targets = paddle.zeros_like(tea_loc) loc_targets[pos_ind] = tea_loc[pos_ind] # iou targets iou_targets = paddle.zeros( shape=[tea_iou.shape[0]], dtype=tea_iou.dtype) iou_targets[pos_ind] = F.sigmoid( paddle.squeeze( tea_iou, axis=-1)[pos_ind]) loc_mask = cls_mask.clone() # select potential positive from hard negatives for loc_task training if (num_hard_neg > 0) and self.hard_neg_mining_flag: results = self.hard_neg_mining(tea_cls, tea_loc, tea_iou, pos_ind, hard_neg_ind, loc_mask, loc_targets) if results is not None: loc_mask, loc_targets = results loc_pos_ind = paddle.nonzero(loc_mask > 0.).squeeze(axis=-1) iou_targets[loc_pos_ind] = F.sigmoid( paddle.squeeze( tea_iou, axis=-1)[loc_pos_ind]) return cls_mask, loc_mask, \ cls_targets, loc_targets, iou_targets def forward(self, student_prediction, teacher_prediction): stu_cls_lvl, stu_loc_lvl, stu_iou_lvl = student_prediction tea_cls_lvl, tea_loc_lvl, tea_iou_lvl, self.fpn_stride = teacher_prediction # H and W of level (used for aggregating targets) self.lvl_hw = [] for t in tea_cls_lvl: _, _, H, W = t.shape self.lvl_hw.append([H, W]) # levels to images stu_cls_img = levels_to_images(stu_cls_lvl) stu_loc_img = levels_to_images(stu_loc_lvl) stu_iou_img = levels_to_images(stu_iou_lvl) tea_cls_img = levels_to_images(tea_cls_lvl) tea_loc_img = levels_to_images(tea_loc_lvl) tea_iou_img = levels_to_images(tea_iou_lvl) with paddle.no_grad(): cls_mask, loc_mask, \ cls_targets, loc_targets, iou_targets = multi_apply( self.get_targets_per_img, tea_cls_img, tea_loc_img, tea_iou_img, stu_cls_img, stu_loc_img, stu_iou_img ) # flatten preditction stu_cls = paddle.concat(stu_cls_img, axis=0) stu_loc = paddle.concat(stu_loc_img, axis=0) stu_iou = paddle.concat(stu_iou_img, axis=0) # flatten targets cls_mask = paddle.concat(cls_mask, axis=0) loc_mask = paddle.concat(loc_mask, axis=0) cls_targets = paddle.concat(cls_targets, axis=0) loc_targets = paddle.concat(loc_targets, axis=0) iou_targets = paddle.concat(iou_targets, axis=0) ### Training Weights and avg factor # find positives cls_pos_ind = paddle.nonzero(cls_mask > 0.).squeeze(axis=-1) loc_pos_ind = paddle.nonzero(loc_mask > 0.).squeeze(axis=-1) # cls weight cls_sample_weights = paddle.ones([cls_targets.shape[0]]) cls_avg_factor = paddle.max(cls_targets[cls_pos_ind], axis=-1).sum().item() # loc weight loc_sample_weights = paddle.max(cls_targets[loc_pos_ind], axis=-1) loc_avg_factor = loc_sample_weights.sum().item() # iou weight iou_sample_weights = paddle.ones([loc_pos_ind.shape[0]]) iou_avg_factor = loc_pos_ind.shape[0] ### unsupervised loss # cls loss loss_cls = self.quality_focal_loss( stu_cls, cls_targets, quality=stu_iou, weights=cls_sample_weights, avg_factor=cls_avg_factor) * self.cls_weight # iou loss pos_stu_iou = paddle.squeeze(stu_iou, axis=-1)[loc_pos_ind] pos_iou_targets = iou_targets[loc_pos_ind] loss_iou = F.binary_cross_entropy( F.sigmoid(pos_stu_iou), pos_iou_targets, reduction='none') * iou_sample_weights loss_iou = loss_iou.sum() / iou_avg_factor * self.iou_weight # box loss pos_stu_loc = stu_loc[loc_pos_ind] pos_loc_targets = loc_targets[loc_pos_ind] loss_box = self.iou_loss( pos_stu_loc, pos_loc_targets, weights=loc_sample_weights, 
avg_factor=loc_avg_factor) loss_box = loss_box * self.reg_weight loss_all = { "loss_cls": loss_cls, "loss_box": loss_box, "loss_iou": loss_iou, } return loss_all ================================================ FILE: ppdet/modeling/losses/focal_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn.functional as F import paddle.nn as nn from ppdet.core.workspace import register __all__ = ['FocalLoss', 'Weighted_FocalLoss'] @register class FocalLoss(nn.Layer): """A wrapper around paddle.nn.functional.sigmoid_focal_loss. Args: use_sigmoid (bool): currently only support use_sigmoid=True alpha (float): parameter alpha in Focal Loss gamma (float): parameter gamma in Focal Loss loss_weight (float): final loss will be multiplied by this """ def __init__(self, use_sigmoid=True, alpha=0.25, gamma=2.0, loss_weight=1.0): super(FocalLoss, self).__init__() assert use_sigmoid == True, \ 'Focal Loss only supports sigmoid at the moment' self.use_sigmoid = use_sigmoid self.alpha = alpha self.gamma = gamma self.loss_weight = loss_weight def forward(self, pred, target, reduction='none'): """forward function. Args: pred (Tensor): logits of class prediction, of shape (N, num_classes) target (Tensor): target class label, of shape (N, ) reduction (str): the way to reduce loss, one of (none, sum, mean) """ num_classes = pred.shape[1] target = F.one_hot(target, num_classes+1).cast(pred.dtype) target = target[:, :-1].detach() loss = F.sigmoid_focal_loss( pred, target, alpha=self.alpha, gamma=self.gamma, reduction=reduction) return loss * self.loss_weight @register class Weighted_FocalLoss(FocalLoss): """A wrapper around paddle.nn.functional.sigmoid_focal_loss. Args: use_sigmoid (bool): currently only support use_sigmoid=True alpha (float): parameter alpha in Focal Loss gamma (float): parameter gamma in Focal Loss loss_weight (float): final loss will be multiplied by this """ def __init__(self, use_sigmoid=True, alpha=0.25, gamma=2.0, loss_weight=1.0, reduction="mean"): super(FocalLoss, self).__init__() assert use_sigmoid == True, \ 'Focal Loss only supports sigmoid at the moment' self.use_sigmoid = use_sigmoid self.alpha = alpha self.gamma = gamma self.loss_weight = loss_weight self.reduction = reduction def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): """forward function. 
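        Applies sigmoid focal loss and then optional per-prediction
        weighting and avg_factor-based normalization.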
Args: pred (Tensor): logits of class prediction, of shape (N, num_classes) target (Tensor): target class label, of shape (N, ) reduction (str): the way to reduce loss, one of (none, sum, mean) """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) num_classes = pred.shape[1] target = F.one_hot(target, num_classes + 1).astype(pred.dtype) target = target[:, :-1].detach() loss = F.sigmoid_focal_loss( pred, target, alpha=self.alpha, gamma=self.gamma, reduction='none') if weight is not None: if weight.shape != loss.shape: if weight.shape[0] == loss.shape[0]: # For most cases, weight is of shape (num_priors, ), # which means it does not have the second axis num_class weight = weight.reshape((-1, 1)) else: # Sometimes, weight per anchor per class is also needed. e.g. # in FSAF. But it may be flattened of shape # (num_priors x num_class, ), while loss is still of shape # (num_priors, num_class). assert weight.numel() == loss.numel() weight = weight.reshape((loss.shape[0], -1)) assert weight.ndim == loss.ndim loss = loss * weight # if avg_factor is not specified, just reduce the loss if avg_factor is None: if reduction == 'mean': loss = loss.mean() elif reduction == 'sum': loss = loss.sum() else: # if reduction is mean, then average the loss by avg_factor if reduction == 'mean': # Avoid causing ZeroDivisionError when avg_factor is 0.0, # i.e., all labels of an image belong to ignore index. eps = 1e-10 loss = loss.sum() / (avg_factor + eps) # if reduction is 'none', then do nothing, otherwise raise an error elif reduction != 'none': raise ValueError('avg_factor can not be used with reduction="sum"') return loss * self.loss_weight ================================================ FILE: ppdet/modeling/losses/gfocal_loss.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The code is based on: # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/gfocal_loss.py from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.modeling import ops __all__ = ['QualityFocalLoss', 'DistributionFocalLoss'] def quality_focal_loss(pred, target, beta=2.0, use_sigmoid=True): """ Quality Focal Loss (QFL) is from `Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection `_. Args: pred (Tensor): Predicted joint representation of classification and quality (IoU) estimation with shape (N, C), C is the number of classes. target (tuple([Tensor])): Target category label with shape (N,) and target quality label with shape (N,). beta (float): The beta parameter for calculating the modulating factor. Defaults to 2.0. Returns: Tensor: Loss tensor with shape (N,). 
""" assert len(target) == 2, """target for QFL must be a tuple of two elements, including category label and quality label, respectively""" # label denotes the category id, score denotes the quality score label, score = target if use_sigmoid: func = F.binary_cross_entropy_with_logits else: func = F.binary_cross_entropy # negatives are supervised by 0 quality score pred_sigmoid = F.sigmoid(pred) if use_sigmoid else pred scale_factor = pred_sigmoid zerolabel = paddle.zeros(pred.shape, dtype='float32') loss = func(pred, zerolabel, reduction='none') * scale_factor.pow(beta) # FG cat_id: [0, num_classes -1], BG cat_id: num_classes bg_class_ind = pred.shape[1] pos = paddle.logical_and((label >= 0), (label < bg_class_ind)).nonzero().squeeze(1) if pos.shape[0] == 0: return loss.sum(axis=1) pos_label = paddle.gather(label, pos, axis=0) pos_mask = np.zeros(pred.shape, dtype=np.int32) pos_mask[pos.numpy(), pos_label.numpy()] = 1 pos_mask = paddle.to_tensor(pos_mask, dtype='bool') score = score.unsqueeze(-1).expand([-1, pred.shape[1]]).cast('float32') # positives are supervised by bbox quality (IoU) score scale_factor_new = score - pred_sigmoid loss_pos = func( pred, score, reduction='none') * scale_factor_new.abs().pow(beta) loss = loss * paddle.logical_not(pos_mask).astype(loss.dtype) + loss_pos * pos_mask.astype(loss.dtype) loss = loss.sum(axis=1) return loss def distribution_focal_loss(pred, label): """Distribution Focal Loss (DFL) is from `Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection `_. Args: pred (Tensor): Predicted general distribution of bounding boxes (before softmax) with shape (N, n+1), n is the max value of the integral set `{0, ..., n}` in paper. label (Tensor): Target distance label for bounding boxes with shape (N,). Returns: Tensor: Loss tensor with shape (N,). """ dis_left = label.cast('int64') dis_right = dis_left + 1 weight_left = dis_right.cast('float32') - label weight_right = label - dis_left.cast('float32') loss = F.cross_entropy(pred, dis_left, reduction='none') * weight_left \ + F.cross_entropy(pred, dis_right, reduction='none') * weight_right return loss @register @serializable class QualityFocalLoss(nn.Layer): r"""Quality Focal Loss (QFL) is a variant of `Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection `_. Args: use_sigmoid (bool): Whether sigmoid operation is conducted in QFL. Defaults to True. beta (float): The beta parameter for calculating the modulating factor. Defaults to 2.0. reduction (str): Options are "none", "mean" and "sum". loss_weight (float): Loss weight of current loss. """ def __init__(self, use_sigmoid=True, beta=2.0, reduction='mean', loss_weight=1.0): super(QualityFocalLoss, self).__init__() self.use_sigmoid = use_sigmoid self.beta = beta assert reduction in ('none', 'mean', 'sum') self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, weight=None, avg_factor=None): """Forward function. Args: pred (Tensor): Predicted joint representation of classification and quality (IoU) estimation with shape (N, C), C is the number of classes. target (tuple([Tensor])): Target category label with shape (N,) and target quality label with shape (N,). weight (Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. 
""" loss = self.loss_weight * quality_focal_loss( pred, target, beta=self.beta, use_sigmoid=self.use_sigmoid) if weight is not None: loss = loss * weight if avg_factor is None: if self.reduction == 'none': return loss elif self.reduction == 'mean': return loss.mean() elif self.reduction == 'sum': return loss.sum() else: # if reduction is mean, then average the loss by avg_factor if self.reduction == 'mean': loss = loss.sum() / avg_factor # if reduction is 'none', then do nothing, otherwise raise an error elif self.reduction != 'none': raise ValueError( 'avg_factor can not be used with reduction="sum"') return loss @register @serializable class DistributionFocalLoss(nn.Layer): """Distribution Focal Loss (DFL) is a variant of `Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection `_. Args: reduction (str): Options are `'none'`, `'mean'` and `'sum'`. loss_weight (float): Loss weight of current loss. """ def __init__(self, reduction='mean', loss_weight=1.0): super(DistributionFocalLoss, self).__init__() assert reduction in ('none', 'mean', 'sum') self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, weight=None, avg_factor=None): """Forward function. Args: pred (Tensor): Predicted general distribution of bounding boxes (before softmax) with shape (N, n+1), n is the max value of the integral set `{0, ..., n}` in paper. target (Tensor): Target distance label for bounding boxes with shape (N,). weight (Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. """ loss = self.loss_weight * distribution_focal_loss(pred, target) if weight is not None: loss = loss * weight if avg_factor is None: if self.reduction == 'none': return loss elif self.reduction == 'mean': return loss.mean() elif self.reduction == 'sum': return loss.sum() else: # if reduction is mean, then average the loss by avg_factor if self.reduction == 'mean': loss = loss.sum() / avg_factor # if reduction is 'none', then do nothing, otherwise raise an error elif self.reduction != 'none': raise ValueError( 'avg_factor can not be used with reduction="sum"') return loss ================================================ FILE: ppdet/modeling/losses/iou_aware_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from .iou_loss import IouLoss from ..bbox_utils import bbox_iou @register @serializable class IouAwareLoss(IouLoss): """ iou aware loss, see https://arxiv.org/abs/1912.05992 Args: loss_weight (float): iou aware loss weight, default is 1.0 max_height (int): max height of input to support random shape input max_width (int): max width of input to support random shape input """ def __init__(self, loss_weight=1.0, giou=False, diou=False, ciou=False): super(IouAwareLoss, self).__init__( loss_weight=loss_weight, giou=giou, diou=diou, ciou=ciou) def __call__(self, ioup, pbox, gbox): iou = bbox_iou( pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou) iou.stop_gradient = True loss_iou_aware = F.binary_cross_entropy_with_logits( ioup, iou, reduction='none') loss_iou_aware = loss_iou_aware * self.loss_weight return loss_iou_aware ================================================ FILE: ppdet/modeling/losses/iou_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import math import paddle from ppdet.core.workspace import register, serializable from ..bbox_utils import bbox_iou __all__ = ['IouLoss', 'GIoULoss', 'DIouLoss', 'SIoULoss'] @register @serializable class IouLoss(object): """ iou loss, see https://arxiv.org/abs/1908.03851 loss = 1.0 - iou * iou Args: loss_weight (float): iou loss weight, default is 2.5 max_height (int): max height of input to support random shape input max_width (int): max width of input to support random shape input ciou_term (bool): whether to add ciou_term loss_square (bool): whether to square the iou term """ def __init__(self, loss_weight=2.5, giou=False, diou=False, ciou=False, loss_square=True): self.loss_weight = loss_weight self.giou = giou self.diou = diou self.ciou = ciou self.loss_square = loss_square def __call__(self, pbox, gbox): iou = bbox_iou( pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou) if self.loss_square: loss_iou = 1 - iou * iou else: loss_iou = 1 - iou loss_iou = loss_iou * self.loss_weight return loss_iou @register @serializable class GIoULoss(object): """ Generalized Intersection over Union, see https://arxiv.org/abs/1902.09630 Args: loss_weight (float): giou loss weight, default as 1 eps (float): epsilon to avoid divide by zero, default as 1e-10 reduction (string): Options are "none", "mean" and "sum". 
default as none """ def __init__(self, loss_weight=1., eps=1e-10, reduction='none'): self.loss_weight = loss_weight self.eps = eps assert reduction in ('none', 'mean', 'sum') self.reduction = reduction def bbox_overlap(self, box1, box2, eps=1e-10): """calculate the iou of box1 and box2 Args: box1 (Tensor): box1 with the shape (..., 4) box2 (Tensor): box1 with the shape (..., 4) eps (float): epsilon to avoid divide by zero Return: iou (Tensor): iou of box1 and box2 overlap (Tensor): overlap of box1 and box2 union (Tensor): union of box1 and box2 """ x1, y1, x2, y2 = box1 x1g, y1g, x2g, y2g = box2 xkis1 = paddle.maximum(x1, x1g) ykis1 = paddle.maximum(y1, y1g) xkis2 = paddle.minimum(x2, x2g) ykis2 = paddle.minimum(y2, y2g) w_inter = (xkis2 - xkis1).clip(0) h_inter = (ykis2 - ykis1).clip(0) overlap = w_inter * h_inter area1 = (x2 - x1) * (y2 - y1) area2 = (x2g - x1g) * (y2g - y1g) union = area1 + area2 - overlap + eps iou = overlap / union return iou, overlap, union def __call__(self, pbox, gbox, iou_weight=1., loc_reweight=None): x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) box1 = [x1, y1, x2, y2] box2 = [x1g, y1g, x2g, y2g] iou, overlap, union = self.bbox_overlap(box1, box2, self.eps) xc1 = paddle.minimum(x1, x1g) yc1 = paddle.minimum(y1, y1g) xc2 = paddle.maximum(x2, x2g) yc2 = paddle.maximum(y2, y2g) area_c = (xc2 - xc1) * (yc2 - yc1) + self.eps miou = iou - ((area_c - union) / area_c) if loc_reweight is not None: loc_reweight = paddle.reshape(loc_reweight, shape=(-1, 1)) loc_thresh = 0.9 giou = 1 - (1 - loc_thresh ) * miou - loc_thresh * miou * loc_reweight else: giou = 1 - miou if self.reduction == 'none': loss = giou elif self.reduction == 'sum': loss = paddle.sum(giou * iou_weight) else: loss = paddle.mean(giou * iou_weight) return loss * self.loss_weight @register @serializable class DIouLoss(GIoULoss): """ Distance-IoU Loss, see https://arxiv.org/abs/1911.08287 Args: loss_weight (float): giou loss weight, default as 1 eps (float): epsilon to avoid divide by zero, default as 1e-10 use_complete_iou_loss (bool): whether to use complete iou loss """ def __init__(self, loss_weight=1., eps=1e-10, use_complete_iou_loss=True): super(DIouLoss, self).__init__(loss_weight=loss_weight, eps=eps) self.use_complete_iou_loss = use_complete_iou_loss def __call__(self, pbox, gbox, iou_weight=1.): x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) cx = (x1 + x2) / 2 cy = (y1 + y2) / 2 w = x2 - x1 h = y2 - y1 cxg = (x1g + x2g) / 2 cyg = (y1g + y2g) / 2 wg = x2g - x1g hg = y2g - y1g x2 = paddle.maximum(x1, x2) y2 = paddle.maximum(y1, y2) # A and B xkis1 = paddle.maximum(x1, x1g) ykis1 = paddle.maximum(y1, y1g) xkis2 = paddle.minimum(x2, x2g) ykis2 = paddle.minimum(y2, y2g) # A or B xc1 = paddle.minimum(x1, x1g) yc1 = paddle.minimum(y1, y1g) xc2 = paddle.maximum(x2, x2g) yc2 = paddle.maximum(y2, y2g) intsctk = (xkis2 - xkis1) * (ykis2 - ykis1) intsctk = intsctk * paddle.greater_than( xkis2, xkis1).astype(intsctk.dtype) * paddle.greater_than(ykis2, ykis1).astype(intsctk.dtype) unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g ) - intsctk + self.eps iouk = intsctk / unionk # DIOU term dist_intersection = (cx - cxg) * (cx - cxg) + (cy - cyg) * (cy - cyg) dist_union = (xc2 - xc1) * (xc2 - xc1) + (yc2 - yc1) * (yc2 - yc1) diou_term = (dist_intersection + self.eps) / (dist_union + self.eps) # CIOU term ciou_term = 0 if 
self.use_complete_iou_loss: ar_gt = wg / hg ar_pred = w / h arctan = paddle.atan(ar_gt) - paddle.atan(ar_pred) ar_loss = 4. / np.pi / np.pi * arctan * arctan alpha = ar_loss / (1 - iouk + ar_loss + self.eps) alpha.stop_gradient = True ciou_term = alpha * ar_loss diou = paddle.mean((1 - iouk + ciou_term + diou_term) * iou_weight) return diou * self.loss_weight @register @serializable class SIoULoss(GIoULoss): """ see https://arxiv.org/pdf/2205.12740.pdf Args: loss_weight (float): siou loss weight, default as 1 eps (float): epsilon to avoid divide by zero, default as 1e-10 theta (float): default as 4 reduction (str): Options are "none", "mean" and "sum". default as none """ def __init__(self, loss_weight=1., eps=1e-10, theta=4., reduction='none'): super(SIoULoss, self).__init__(loss_weight=loss_weight, eps=eps) self.loss_weight = loss_weight self.eps = eps self.theta = theta self.reduction = reduction def __call__(self, pbox, gbox): x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) box1 = [x1, y1, x2, y2] box2 = [x1g, y1g, x2g, y2g] iou = bbox_iou(box1, box2) cx = (x1 + x2) / 2 cy = (y1 + y2) / 2 w = x2 - x1 + self.eps h = y2 - y1 + self.eps cxg = (x1g + x2g) / 2 cyg = (y1g + y2g) / 2 wg = x2g - x1g + self.eps hg = y2g - y1g + self.eps x2 = paddle.maximum(x1, x2) y2 = paddle.maximum(y1, y2) # A or B xc1 = paddle.minimum(x1, x1g) yc1 = paddle.minimum(y1, y1g) xc2 = paddle.maximum(x2, x2g) yc2 = paddle.maximum(y2, y2g) cw_out = xc2 - xc1 ch_out = yc2 - yc1 ch = paddle.maximum(cy, cyg) - paddle.minimum(cy, cyg) cw = paddle.maximum(cx, cxg) - paddle.minimum(cx, cxg) # angle cost dist_intersection = paddle.sqrt((cx - cxg)**2 + (cy - cyg)**2) sin_angle_alpha = ch / dist_intersection sin_angle_beta = cw / dist_intersection thred = paddle.pow(paddle.to_tensor(2), 0.5) / 2 thred.stop_gradient = True sin_alpha = paddle.where(sin_angle_alpha > thred, sin_angle_beta, sin_angle_alpha) angle_cost = paddle.cos(paddle.asin(sin_alpha) * 2 - math.pi / 2) # distance cost gamma = 2 - angle_cost # gamma.stop_gradient = True beta_x = ((cxg - cx) / cw_out)**2 beta_y = ((cyg - cy) / ch_out)**2 dist_cost = 1 - paddle.exp(-gamma * beta_x) + 1 - paddle.exp(-gamma * beta_y) # shape cost omega_w = paddle.abs(w - wg) / paddle.maximum(w, wg) omega_h = paddle.abs(hg - h) / paddle.maximum(h, hg) omega = (1 - paddle.exp(-omega_w))**self.theta + ( 1 - paddle.exp(-omega_h))**self.theta siou_loss = 1 - iou + (omega + dist_cost) / 2 if self.reduction == 'mean': siou_loss = paddle.mean(siou_loss) elif self.reduction == 'sum': siou_loss = paddle.sum(siou_loss) return siou_loss * self.loss_weight ================================================ FILE: ppdet/modeling/losses/jde_loss.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register __all__ = ['JDEDetectionLoss', 'JDEEmbeddingLoss', 'JDELoss'] @register class JDEDetectionLoss(nn.Layer): __shared__ = ['num_classes'] def __init__(self, num_classes=1, for_mot=True): super(JDEDetectionLoss, self).__init__() self.num_classes = num_classes self.for_mot = for_mot def det_loss(self, p_det, anchor, t_conf, t_box): pshape = paddle.shape(p_det) pshape.stop_gradient = True nB, nGh, nGw = pshape[0], pshape[-2], pshape[-1] nA = len(anchor) p_det = paddle.reshape( p_det, [nB, nA, self.num_classes + 5, nGh, nGw]).transpose( (0, 1, 3, 4, 2)) # 1. loss_conf: cross_entropy p_conf = p_det[:, :, :, :, 4:6] p_conf_flatten = paddle.reshape(p_conf, [-1, 2]) t_conf_flatten = t_conf.flatten() t_conf_flatten = paddle.cast(t_conf_flatten, dtype="int64") t_conf_flatten.stop_gradient = True loss_conf = F.cross_entropy( p_conf_flatten, t_conf_flatten, ignore_index=-1, reduction='mean') loss_conf.stop_gradient = False # 2. loss_box: smooth_l1_loss p_box = p_det[:, :, :, :, :4] p_box_flatten = paddle.reshape(p_box, [-1, 4]) t_box_flatten = paddle.reshape(t_box, [-1, 4]) fg_inds = paddle.nonzero(t_conf_flatten > 0).flatten() if fg_inds.numel() > 0: reg_delta = paddle.gather(p_box_flatten, fg_inds) reg_target = paddle.gather(t_box_flatten, fg_inds) else: reg_delta = paddle.to_tensor([0, 0, 0, 0], dtype='float32') reg_delta.stop_gradient = False reg_target = paddle.to_tensor([0, 0, 0, 0], dtype='float32') reg_target.stop_gradient = True loss_box = F.smooth_l1_loss( reg_delta, reg_target, reduction='mean', delta=1.0) loss_box.stop_gradient = False return loss_conf, loss_box def forward(self, det_outs, targets, anchors): """ Args: det_outs (list[Tensor]): output from detection head, each one is a 4-D Tensor with shape [N, C, H, W]. targets (dict): contains 'im_id', 'gt_bbox', 'gt_ide', 'image', 'im_shape', 'scale_factor' and 'tbox', 'tconf', 'tide' of each FPN level. anchors (list[list]): anchor setting of JDE model, N row M col, N is the anchor levels(FPN levels), M is the anchor scales each level. 
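        Returns:
            dict: lists of per-level 'loss_confs' and 'loss_boxes' when
                `for_mot` is True, otherwise the summed 'loss_conf',
                'loss_box' and total 'loss'.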
""" assert len(det_outs) == len(anchors) loss_confs = [] loss_boxes = [] for i, (p_det, anchor) in enumerate(zip(det_outs, anchors)): t_conf = targets['tconf{}'.format(i)] t_box = targets['tbox{}'.format(i)] loss_conf, loss_box = self.det_loss(p_det, anchor, t_conf, t_box) loss_confs.append(loss_conf) loss_boxes.append(loss_box) if self.for_mot: return {'loss_confs': loss_confs, 'loss_boxes': loss_boxes} else: jde_conf_losses = sum(loss_confs) jde_box_losses = sum(loss_boxes) jde_det_losses = { "loss_conf": jde_conf_losses, "loss_box": jde_box_losses, "loss": jde_conf_losses + jde_box_losses, } return jde_det_losses @register class JDEEmbeddingLoss(nn.Layer): def __init__(self, ): super(JDEEmbeddingLoss, self).__init__() self.phony = self.create_parameter(shape=[1], dtype="float32") def emb_loss(self, p_ide, t_conf, t_ide, emb_scale, classifier): emb_dim = p_ide.shape[1] p_ide = p_ide.transpose((0, 2, 3, 1)) p_ide_flatten = paddle.reshape(p_ide, [-1, emb_dim]) mask = t_conf > 0 mask = paddle.cast(mask, dtype="int64") mask.stop_gradient = True emb_mask = mask.max(1).flatten() emb_mask_inds = paddle.nonzero(emb_mask > 0).flatten() emb_mask_inds.stop_gradient = True # use max(1) to decide the id, TODO: more reseanable strategy t_ide_flatten = t_ide.max(1).flatten() t_ide_flatten = paddle.cast(t_ide_flatten, dtype="int64") valid_inds = paddle.nonzero(t_ide_flatten != -1).flatten() if emb_mask_inds.numel() == 0 or valid_inds.numel() == 0: # loss_ide = paddle.to_tensor([0]) # will be error in gradient backward loss_ide = self.phony * 0 # todo else: embedding = paddle.gather(p_ide_flatten, emb_mask_inds) embedding = emb_scale * F.normalize(embedding) logits = classifier(embedding) ide_target = paddle.gather(t_ide_flatten, emb_mask_inds) loss_ide = F.cross_entropy( logits, ide_target, ignore_index=-1, reduction='mean') loss_ide.stop_gradient = False return loss_ide def forward(self, ide_outs, targets, emb_scale, classifier): loss_ides = [] for i, p_ide in enumerate(ide_outs): t_conf = targets['tconf{}'.format(i)] t_ide = targets['tide{}'.format(i)] loss_ide = self.emb_loss(p_ide, t_conf, t_ide, emb_scale, classifier) loss_ides.append(loss_ide) return loss_ides @register class JDELoss(nn.Layer): def __init__(self): super(JDELoss, self).__init__() def forward(self, loss_confs, loss_boxes, loss_ides, loss_params_cls, loss_params_reg, loss_params_ide, targets): assert len(loss_confs) == len(loss_boxes) == len(loss_ides) assert len(loss_params_cls) == len(loss_params_reg) == len( loss_params_ide) assert len(loss_confs) == len(loss_params_cls) batchsize = targets['gt_bbox'].shape[0] nTargets = paddle.nonzero(paddle.sum(targets['gt_bbox'], axis=2)).shape[ 0] / batchsize nTargets = paddle.to_tensor(nTargets, dtype='float32') nTargets.stop_gradient = True jde_losses = [] for i, (loss_conf, loss_box, loss_ide, l_conf_p, l_box_p, l_ide_p) in enumerate( zip(loss_confs, loss_boxes, loss_ides, loss_params_cls, loss_params_reg, loss_params_ide)): jde_loss = l_conf_p(loss_conf) + l_box_p(loss_box) + l_ide_p( loss_ide) jde_losses.append(jde_loss) loss_all = { "loss_conf": sum(loss_confs), "loss_box": sum(loss_boxes), "loss_ide": sum(loss_ides), "loss": sum(jde_losses), "nTargets": nTargets, } return loss_all ================================================ FILE: ppdet/modeling/losses/keypoint_loss.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from itertools import cycle, islice from collections import abc import numpy as np import paddle import paddle.nn as nn from ppdet.core.workspace import register, serializable __all__ = ['HrHRNetLoss', 'KeyPointMSELoss', 'OKSLoss', 'CenterFocalLoss', 'L1Loss'] @register @serializable class KeyPointMSELoss(nn.Layer): def __init__(self, use_target_weight=True, loss_scale=0.5): """ KeyPointMSELoss layer Args: use_target_weight (bool): whether to use target weight """ super(KeyPointMSELoss, self).__init__() self.criterion = nn.MSELoss(reduction='mean') self.use_target_weight = use_target_weight self.loss_scale = loss_scale def forward(self, output, records): target = records['target'] target_weight = records['target_weight'] batch_size = output.shape[0] num_joints = output.shape[1] heatmaps_pred = output.reshape( (batch_size, num_joints, -1)).split(num_joints, 1) heatmaps_gt = target.reshape( (batch_size, num_joints, -1)).split(num_joints, 1) loss = 0 for idx in range(num_joints): heatmap_pred = heatmaps_pred[idx].squeeze() heatmap_gt = heatmaps_gt[idx].squeeze() if self.use_target_weight: loss += self.loss_scale * self.criterion( heatmap_pred.multiply(target_weight[:, idx]), heatmap_gt.multiply(target_weight[:, idx])) else: loss += self.loss_scale * self.criterion(heatmap_pred, heatmap_gt) keypoint_losses = dict() keypoint_losses['loss'] = loss / num_joints return keypoint_losses @register @serializable class HrHRNetLoss(nn.Layer): def __init__(self, num_joints, swahr): """ HrHRNetLoss layer Args: num_joints (int): number of keypoints """ super(HrHRNetLoss, self).__init__() if swahr: self.heatmaploss = HeatMapSWAHRLoss(num_joints) else: self.heatmaploss = HeatMapLoss() self.aeloss = AELoss() self.ziploss = ZipLoss( [self.heatmaploss, self.heatmaploss, self.aeloss]) def forward(self, inputs, records): targets = [] targets.append([records['heatmap_gt1x'], records['mask_1x']]) targets.append([records['heatmap_gt2x'], records['mask_2x']]) targets.append(records['tagmap']) keypoint_losses = dict() loss = self.ziploss(inputs, targets) keypoint_losses['heatmap_loss'] = loss[0] + loss[1] keypoint_losses['pull_loss'] = loss[2][0] keypoint_losses['push_loss'] = loss[2][1] keypoint_losses['loss'] = recursive_sum(loss) return keypoint_losses class HeatMapLoss(object): def __init__(self, loss_factor=1.0): super(HeatMapLoss, self).__init__() self.loss_factor = loss_factor def __call__(self, preds, targets): heatmap, mask = targets loss = ((preds - heatmap)**2 * mask.cast('float').unsqueeze(1)) loss = paddle.clip(loss, min=0, max=2).mean() loss *= self.loss_factor return loss class HeatMapSWAHRLoss(object): def __init__(self, num_joints, loss_factor=1.0): super(HeatMapSWAHRLoss, self).__init__() self.loss_factor = loss_factor self.num_joints = num_joints def __call__(self, preds, targets): heatmaps_gt, mask = targets heatmaps_pred = preds[0] scalemaps_pred = preds[1] 
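        # SWAHR: rescale the ground-truth heatmaps with the predicted scale
        # maps before the squared-error term, and regularize the scale maps
        # toward 1 on positive regions so the rescaling stays near identity.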
heatmaps_scaled_gt = paddle.where(heatmaps_gt > 0, 0.5 * heatmaps_gt * ( 1 + (1 + (scalemaps_pred - 1.) * paddle.log(heatmaps_gt + 1e-10))**2), heatmaps_gt) regularizer_loss = paddle.mean( paddle.pow((scalemaps_pred - 1.) * (heatmaps_gt > 0).astype(float), 2)) omiga = 0.01 # thres = 2**(-1/omiga), threshold for positive weight hm_weight = heatmaps_scaled_gt**( omiga ) * paddle.abs(1 - heatmaps_pred) + paddle.abs(heatmaps_pred) * ( 1 - heatmaps_scaled_gt**(omiga)) loss = (((heatmaps_pred - heatmaps_scaled_gt)**2) * mask.cast('float').unsqueeze(1)) * hm_weight loss = loss.mean() loss = self.loss_factor * (loss + 1.0 * regularizer_loss) return loss class AELoss(object): def __init__(self, pull_factor=0.001, push_factor=0.001): super(AELoss, self).__init__() self.pull_factor = pull_factor self.push_factor = push_factor def apply_single(self, pred, tagmap): if tagmap.numpy()[:, :, 3].sum() == 0: return (paddle.zeros([1]), paddle.zeros([1])) nonzero = paddle.nonzero(tagmap[:, :, 3] > 0) if nonzero.shape[0] == 0: return (paddle.zeros([1]), paddle.zeros([1])) p_inds = paddle.unique(nonzero[:, 0]) num_person = p_inds.shape[0] if num_person == 0: return (paddle.zeros([1]), paddle.zeros([1])) pull = 0 tagpull_num = 0 embs_all = [] person_unvalid = 0 for person_idx in p_inds.numpy(): valid_single = tagmap[person_idx.item()] validkpts = paddle.nonzero(valid_single[:, 3] > 0) valid_single = paddle.index_select(valid_single, validkpts) emb = paddle.gather_nd(pred, valid_single[:, :3]) if emb.shape[0] == 1: person_unvalid += 1 mean = paddle.mean(emb, axis=0) embs_all.append(mean) pull += paddle.mean(paddle.pow(emb - mean, 2), axis=0) tagpull_num += emb.shape[0] pull /= max(num_person - person_unvalid, 1) if num_person < 2: return pull, paddle.zeros([1]) embs_all = paddle.stack(embs_all) A = embs_all.expand([num_person, num_person]) B = A.transpose([1, 0]) diff = A - B diff = paddle.pow(diff, 2) push = paddle.exp(-diff) push = paddle.sum(push) - num_person push /= 2 * num_person * (num_person - 1) return pull, push def __call__(self, preds, tagmaps): bs = preds.shape[0] losses = [ self.apply_single(preds[i:i + 1].squeeze(), tagmaps[i:i + 1].squeeze()) for i in range(bs) ] pull = self.pull_factor * sum(loss[0] for loss in losses) / len(losses) push = self.push_factor * sum(loss[1] for loss in losses) / len(losses) return pull, push class ZipLoss(object): def __init__(self, loss_funcs): super(ZipLoss, self).__init__() self.loss_funcs = loss_funcs def __call__(self, inputs, targets): assert len(self.loss_funcs) == len(targets) >= len(inputs) def zip_repeat(*args): longest = max(map(len, args)) filled = [islice(cycle(x), longest) for x in args] return zip(*filled) return tuple( fn(x, y) for x, y, fn in zip_repeat(inputs, targets, self.loss_funcs)) def recursive_sum(inputs): if isinstance(inputs, abc.Sequence): return sum([recursive_sum(x) for x in inputs]) return inputs def oks_overlaps(kpt_preds, kpt_gts, kpt_valids, kpt_areas, sigmas): if not kpt_gts.astype('bool').any(): return kpt_preds.sum()*0 sigmas = paddle.to_tensor(sigmas, dtype=kpt_preds.dtype) variances = (sigmas * 2)**2 assert kpt_preds.shape[0] == kpt_gts.shape[0] kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1] // 2, 2)) kpt_gts = kpt_gts.reshape((-1, kpt_gts.shape[-1] // 2, 2)) squared_distance = (kpt_preds[:, :, 0] - kpt_gts[:, :, 0]) ** 2 + \ (kpt_preds[:, :, 1] - kpt_gts[:, :, 1]) ** 2 assert (kpt_valids.sum(-1) > 0).all() squared_distance0 = squared_distance / ( kpt_areas[:, None] * variances[None, :] * 2) squared_distance1 = 
paddle.exp(-squared_distance0) squared_distance1 = squared_distance1 * kpt_valids oks = squared_distance1.sum(axis=1) / kpt_valids.sum(axis=1) return oks def oks_loss(pred, target, weight, valid=None, area=None, linear=False, sigmas=None, eps=1e-6, avg_factor=None, reduction=None): """Oks loss. Computing the oks loss between a set of predicted poses and target poses. The loss is calculated as negative log of oks. Args: pred (Tensor): Predicted poses of format (x1, y1, x2, y2, ...), shape (n, K*2). target (Tensor): Corresponding gt poses, shape (n, K*2). linear (bool, optional): If True, use linear scale of loss instead of log scale. Default: False. eps (float): Eps to avoid log(0). Returns: Tensor: Loss tensor. """ oks = oks_overlaps(pred, target, valid, area, sigmas).clip(min=eps) if linear: loss = 1 - oks else: loss = -oks.log() if weight is not None: if weight.shape != loss.shape: if weight.shape[0] == loss.shape[0]: # For most cases, weight is of shape (num_priors, ), # which means it does not have the second axis num_class weight = weight.reshape((-1, 1)) else: # Sometimes, weight per anchor per class is also needed. e.g. # in FSAF. But it may be flattened of shape # (num_priors x num_class, ), while loss is still of shape # (num_priors, num_class). assert weight.numel() == loss.numel() weight = weight.reshape((loss.shape[0], -1)) assert weight.ndim == loss.ndim loss = loss * weight # if avg_factor is not specified, just reduce the loss if avg_factor is None: if reduction == 'mean': loss = loss.mean() elif reduction == 'sum': loss = loss.sum() else: # if reduction is mean, then average the loss by avg_factor if reduction == 'mean': # Avoid causing ZeroDivisionError when avg_factor is 0.0, # i.e., all labels of an image belong to ignore index. eps = 1e-10 loss = loss.sum() / (avg_factor + eps) # if reduction is 'none', then do nothing, otherwise raise an error elif reduction != 'none': raise ValueError('avg_factor can not be used with reduction="sum"') return loss @register @serializable class OKSLoss(nn.Layer): """OKSLoss. Computing the oks loss between a set of predicted poses and target poses. Args: linear (bool): If True, use linear scale of loss instead of log scale. Default: False. eps (float): Eps to avoid log(0). reduction (str): Options are "none", "mean" and "sum". loss_weight (float): Weight of loss. """ def __init__(self, linear=False, num_keypoints=17, eps=1e-6, reduction='mean', loss_weight=1.0): super(OKSLoss, self).__init__() self.linear = linear self.eps = eps self.reduction = reduction self.loss_weight = loss_weight if num_keypoints == 17: self.sigmas = np.array([ .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89 ], dtype=np.float32) / 10.0 elif num_keypoints == 14: self.sigmas = np.array([ .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, .79, .79 ]) / 10.0 else: raise ValueError(f'Unsupported keypoints number {num_keypoints}') def forward(self, pred, target, valid, area, weight=None, avg_factor=None, reduction_override=None, **kwargs): """Forward function. Args: pred (Tensor): The prediction. target (Tensor): The learning target of the prediction. valid (Tensor): The visible flag of the target pose. area (Tensor): The area of the target pose. weight (Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. 
reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. Options are "none", "mean" and "sum". """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) if (weight is not None) and (not paddle.any(weight > 0)) and ( reduction != 'none'): if pred.dim() == weight.dim() + 1: weight = weight.unsqueeze(1) return (pred * weight).sum() # 0 if weight is not None and weight.dim() > 1: # TODO: remove this in the future # reduce the weight of shape (n, 4) to (n,) to match the # iou_loss of shape (n,) assert weight.shape == pred.shape weight = weight.mean(-1) loss = self.loss_weight * oks_loss( pred, target, weight, valid=valid, area=area, linear=self.linear, sigmas=self.sigmas, eps=self.eps, reduction=reduction, avg_factor=avg_factor, **kwargs) return loss def center_focal_loss(pred, gt, weight=None, mask=None, avg_factor=None, reduction=None): """Modified focal loss. Exactly the same as CornerNet. Runs faster and costs a little bit more memory. Args: pred (Tensor): The prediction with shape [bs, c, h, w]. gt (Tensor): The learning target of the prediction in gaussian distribution, with shape [bs, c, h, w]. mask (Tensor): The valid mask. Defaults to None. """ if not gt.astype('bool').any(): return pred.sum()*0 pos_inds = gt.equal(1).astype('float32') if mask is None: neg_inds = gt.less_than(paddle.to_tensor([1], dtype='float32')).astype('float32') else: neg_inds = gt.less_than(paddle.to_tensor([1], dtype='float32')).astype('float32') * mask.equal(0).astype('float32') neg_weights = paddle.pow(1 - gt, 4) loss = 0 pos_loss = paddle.log(pred) * paddle.pow(1 - pred, 2) * pos_inds neg_loss = paddle.log(1 - pred) * paddle.pow(pred, 2) * neg_weights * \ neg_inds num_pos = pos_inds.astype('float32').sum() pos_loss = pos_loss.sum() neg_loss = neg_loss.sum() if num_pos == 0: loss = loss - neg_loss else: loss = loss - (pos_loss + neg_loss) / num_pos if weight is not None: if weight.shape != loss.shape: if weight.shape[0] == loss.shape[0]: # For most cases, weight is of shape (num_priors, ), # which means it does not have the second axis num_class weight = weight.reshape((-1, 1)) else: # Sometimes, weight per anchor per class is also needed. e.g. # in FSAF. But it may be flattened of shape # (num_priors x num_class, ), while loss is still of shape # (num_priors, num_class). assert weight.numel() == loss.numel() weight = weight.reshape((loss.shape[0], -1)) assert weight.ndim == loss.ndim loss = loss * weight # if avg_factor is not specified, just reduce the loss if avg_factor is None: if reduction == 'mean': loss = loss.mean() elif reduction == 'sum': loss = loss.sum() else: # if reduction is mean, then average the loss by avg_factor if reduction == 'mean': # Avoid causing ZeroDivisionError when avg_factor is 0.0, # i.e., all labels of an image belong to ignore index. eps = 1e-10 loss = loss.sum() / (avg_factor + eps) # if reduction is 'none', then do nothing, otherwise raise an error elif reduction != 'none': raise ValueError('avg_factor can not be used with reduction="sum"') return loss @register @serializable class CenterFocalLoss(nn.Layer): """CenterFocalLoss is a variant of focal loss. More details can be found in the `paper `_ Args: reduction (str): Options are "none", "mean" and "sum". loss_weight (float): Loss weight of current loss. 
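Note: following CornerNet, locations with gt == 1 contribute -log(pred) * (1 - pred)**2, while every other location acts as a negative whose term is down-weighted by (1 - gt)**4 (see center_focal_loss above).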
""" def __init__(self, reduction='none', loss_weight=1.0): super(CenterFocalLoss, self).__init__() self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, weight=None, mask=None, avg_factor=None, reduction_override=None): """Forward function. Args: pred (Tensor): The prediction. target (Tensor): The learning target of the prediction in gaussian distribution. weight (Tensor, optional): The weight of loss for each prediction. Defaults to None. mask (Tensor): The valid mask. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) loss_reg = self.loss_weight * center_focal_loss( pred, target, weight, mask=mask, reduction=reduction, avg_factor=avg_factor) return loss_reg def l1_loss(pred, target, weight=None, reduction='mean', avg_factor=None): """L1 loss. Args: pred (Tensor): The prediction. target (Tensor): The learning target of the prediction. Returns: Tensor: Calculated loss """ if not target.astype('bool').any(): return pred.sum() * 0 assert pred.shape == target.shape loss = paddle.abs(pred - target) if weight is not None: if weight.shape != loss.shape: if weight.shape[0] == loss.shape[0]: # For most cases, weight is of shape (num_priors, ), # which means it does not have the second axis num_class weight = weight.reshape((-1, 1)) else: # Sometimes, weight per anchor per class is also needed. e.g. # in FSAF. But it may be flattened of shape # (num_priors x num_class, ), while loss is still of shape # (num_priors, num_class). assert weight.numel() == loss.numel() weight = weight.reshape((loss.shape[0], -1)) assert weight.ndim == loss.ndim loss = loss * weight # if avg_factor is not specified, just reduce the loss if avg_factor is None: if reduction == 'mean': loss = loss.mean() elif reduction == 'sum': loss = loss.sum() else: # if reduction is mean, then average the loss by avg_factor if reduction == 'mean': # Avoid causing ZeroDivisionError when avg_factor is 0.0, # i.e., all labels of an image belong to ignore index. eps = 1e-10 loss = loss.sum() / (avg_factor + eps) # if reduction is 'none', then do nothing, otherwise raise an error elif reduction != 'none': raise ValueError('avg_factor can not be used with reduction="sum"') return loss @register @serializable class L1Loss(nn.Layer): """L1 loss. Args: reduction (str, optional): The method to reduce the loss. Options are "none", "mean" and "sum". loss_weight (float, optional): The weight of loss. """ def __init__(self, reduction='mean', loss_weight=1.0): super(L1Loss, self).__init__() self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): """Forward function. Args: pred (Tensor): The prediction. target (Tensor): The learning target of the prediction. weight (Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. 
""" assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) loss_bbox = self.loss_weight * l1_loss( pred, target, weight, reduction=reduction, avg_factor=avg_factor) return loss_bbox ================================================ FILE: ppdet/modeling/losses/pose3d_loss.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from itertools import cycle, islice from collections import abc import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.utils.logger import setup_logger logger = setup_logger('ppdet.engine') __all__ = ['Pose3DLoss'] @register @serializable class Pose3DLoss(nn.Layer): def __init__(self, weight_3d=1.0, weight_2d=0.0, reduction='none'): """ KeyPointMSELoss layer Args: weight_3d (float): weight of 3d loss weight_2d (float): weight of 2d loss reduction (bool): whether use reduction to loss """ super(Pose3DLoss, self).__init__() self.weight_3d = weight_3d self.weight_2d = weight_2d self.criterion_2dpose = nn.MSELoss(reduction=reduction) self.criterion_3dpose = nn.L1Loss(reduction=reduction) self.criterion_smoothl1 = nn.SmoothL1Loss( reduction=reduction, delta=1.0) self.criterion_vertices = nn.L1Loss() def forward(self, pred3d, pred2d, inputs): """ mpjpe: mpjpe loss between 3d joints keypoint_2d_loss: 2d joints loss compute by criterion_2dpose """ gt_3d_joints = inputs['joints_3d'] gt_2d_joints = inputs['joints_2d'] has_3d_joints = inputs['has_3d_joints'] has_2d_joints = inputs['has_2d_joints'] loss_3d = mpjpe_focal(pred3d, gt_3d_joints, has_3d_joints) loss = self.weight_3d * loss_3d epoch = inputs['epoch_id'] if self.weight_2d > 0: weight = self.weight_2d * pow(0.1, (epoch // 8)) if epoch > 8: weight = 0 loss_2d = keypoint_2d_loss(self.criterion_2dpose, pred2d, gt_2d_joints, has_2d_joints) loss += weight * loss_2d return loss def filter_3d_joints(pred, gt, has_3d_joints): """ filter 3d joints """ gt = gt[has_3d_joints == 1] gt = gt[:, :, :3] pred = pred[has_3d_joints == 1] gt_pelvis = (gt[:, 2, :] + gt[:, 3, :]) / 2 gt = gt - gt_pelvis[:, None, :] pred_pelvis = (pred[:, 2, :] + pred[:, 3, :]) / 2 pred = pred - pred_pelvis[:, None, :] return pred, gt def mpjpe(pred, gt, has_3d_joints): """ mPJPE loss """ pred, gt = filter_3d_joints(pred, gt, has_3d_joints) error = paddle.sqrt((paddle.minimum((pred - gt), paddle.to_tensor(1.2))**2 ).sum(axis=-1)).mean() return error def mpjpe_focal(pred, gt, has_3d_joints): """ mPJPE loss """ pred, gt = filter_3d_joints(pred, gt, has_3d_joints) mse_error = ((pred - gt)**2).sum(axis=-1) mpjpe_error = paddle.sqrt(mse_error) mean = mpjpe_error.mean() std = mpjpe_error.std() atte = 2 * F.sigmoid(6 * (mpjpe_error - mean) / std) mse_error *= atte return mse_error.mean() def mpjpe_mse(pred, gt, 
has_3d_joints, weight=1.): """ mPJPE loss """ pred, gt = filter_3d_joints(pred, gt, has_3d_joints) error = (((pred - gt)**2).sum(axis=-1)).mean() return error def mpjpe_criterion(pred, gt, has_3d_joints, criterion_pose3d): """ mPJPE loss with a self-defined criterion """ pred, gt = filter_3d_joints(pred, gt, has_3d_joints) error = paddle.sqrt(criterion_pose3d(pred, gt)).mean() return error @register @serializable def weighted_mpjpe(pred, gt, has_3d_joints): """ Weighted_mPJPE """ pred, gt = filter_3d_joints(pred, gt, has_3d_joints) weight = paddle.linalg.norm(pred, p=2, axis=-1) weight = paddle.to_tensor( [1.5, 1.3, 1.2, 1.2, 1.3, 1.5, 1.5, 1.3, 1.2, 1.2, 1.3, 1.5, 1., 1.]) error = (weight * paddle.linalg.norm(pred - gt, p=2, axis=-1)).mean() return error @register @serializable def normed_mpjpe(pred, gt, has_3d_joints): """ Normalized MPJPE (scale only), adapted from: https://github.com/hrhodin/UnsupervisedGeometryAwareRepresentationLearning/blob/master/losses/poses.py """ assert pred.shape == gt.shape pred, gt = filter_3d_joints(pred, gt, has_3d_joints) norm_predicted = paddle.mean( paddle.sum(pred**2, axis=-1, keepdim=True), axis=1, keepdim=True) norm_target = paddle.mean( paddle.sum(gt * pred, axis=-1, keepdim=True), axis=1, keepdim=True) scale = norm_target / norm_predicted # pred/gt are already filtered and pelvis-centered here, so an all-ones flag keeps every sample return mpjpe(scale * pred, gt, paddle.ones([pred.shape[0]])) @register @serializable def mpjpe_np(pred, gt, has_3d_joints): """ mPJPE_NP """ pred, gt = filter_3d_joints(pred, gt, has_3d_joints) error = np.sqrt(((pred - gt)**2).sum(axis=-1)).mean() return error @register @serializable def mean_per_vertex_error(pred, gt, has_smpl): """ Compute mPVE """ pred = pred[has_smpl == 1] gt = gt[has_smpl == 1] with paddle.no_grad(): error = paddle.sqrt(((pred - gt)**2).sum(axis=-1)).mean() return error @register @serializable def keypoint_2d_loss(criterion_keypoints, pred_keypoints_2d, gt_keypoints_2d, has_pose_2d): """ Compute 2D reprojection loss if 2D keypoint annotations are available. The confidence (conf) is binary and indicates whether the keypoints exist or not. """ conf = gt_keypoints_2d[:, :, -1].unsqueeze(-1).clone() loss = (conf * criterion_keypoints( pred_keypoints_2d, gt_keypoints_2d[:, :, :-1] * 0.001)).mean() return loss @register @serializable def keypoint_3d_loss(criterion_keypoints, pred_keypoints_3d, gt_keypoints_3d, has_pose_3d): """ Compute 3D keypoint loss if 3D keypoint annotations are available. """ conf = gt_keypoints_3d[:, :, -1].unsqueeze(-1).clone() gt_keypoints_3d = gt_keypoints_3d[:, :, :-1].clone() gt_keypoints_3d = gt_keypoints_3d[has_pose_3d == 1] conf = conf[has_pose_3d == 1] pred_keypoints_3d = pred_keypoints_3d[has_pose_3d == 1] if len(gt_keypoints_3d) > 0: gt_pelvis = (gt_keypoints_3d[:, 2, :] + gt_keypoints_3d[:, 3, :]) / 2 gt_keypoints_3d = gt_keypoints_3d - gt_pelvis[:, None, :] pred_pelvis = ( pred_keypoints_3d[:, 2, :] + pred_keypoints_3d[:, 3, :]) / 2 pred_keypoints_3d = pred_keypoints_3d - pred_pelvis[:, None, :] return (conf * criterion_keypoints(pred_keypoints_3d, gt_keypoints_3d)).mean() else: return paddle.to_tensor([1.]).fill_(0.) @register @serializable def vertices_loss(criterion_vertices, pred_vertices, gt_vertices, has_smpl): """ Compute per-vertex loss if vertex annotations are available. """ pred_vertices_with_shape = pred_vertices[has_smpl == 1] gt_vertices_with_shape = gt_vertices[has_smpl == 1] if len(gt_vertices_with_shape) > 0: return criterion_vertices(pred_vertices_with_shape, gt_vertices_with_shape) else: return paddle.to_tensor([1.]).fill_(0.)
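# --- Illustrative usage sketch (not part of the original file). Shapes are
# assumptions inferred from filter_3d_joints: predictions are [N, K, 3]
# joints, ground truth is [N, K, 4] with a confidence flag in the last
# channel, and has_3d_joints marks samples that carry 3D annotations. ---
if __name__ == '__main__':
    paddle.seed(0)
    pred3d = paddle.rand([2, 14, 3])  # predicted 3d joints
    gt3d = paddle.rand([2, 14, 4])  # gt joints, last channel is confidence
    has_3d = paddle.to_tensor([1, 1])  # both samples carry 3d labels
    print('mpjpe:', float(mpjpe(pred3d, gt3d, has_3d)))
    print('mpjpe_focal:', float(mpjpe_focal(pred3d, gt3d, has_3d)))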
@register @serializable def rectify_pose(pose): pose = pose.copy() R_mod = cv2.Rodrigues(np.array([np.pi, 0, 0]))[0] R_root = cv2.Rodrigues(pose[:3])[0] new_root = R_root.dot(R_mod) pose[:3] = cv2.Rodrigues(new_root)[0].reshape(3) return pose ================================================ FILE: ppdet/modeling/losses/probiou_loss.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import paddle import paddle.nn.functional as F from ppdet.core.workspace import register, serializable __all__ = ['ProbIoULoss'] def gbb_form(boxes): xy, wh, angle = paddle.split(boxes, [2, 2, 1], axis=-1) return paddle.concat([xy, wh.pow(2) / 12., angle], axis=-1) def rotated_form(a_, b_, angles): cos_a = paddle.cos(angles) sin_a = paddle.sin(angles) a = a_ * paddle.pow(cos_a, 2) + b_ * paddle.pow(sin_a, 2) b = a_ * paddle.pow(sin_a, 2) + b_ * paddle.pow(cos_a, 2) c = (a_ - b_) * cos_a * sin_a return a, b, c def probiou_loss(pred, target, eps=1e-3, mode='l1'): """ pred -> a matrix [N,5] (x, y, w, h, angle - in radians) containing our predicted boxes; in case of HBB, angle == 0 target -> a matrix [N,5] (x, y, w, h, angle - in radians) containing our target boxes; in case of HBB, angle == 0 eps -> threshold to avoid infinite values mode -> ('l1' in [0,1] or 'l2' in [0,inf]) metrics according to our paper """ gbboxes1 = gbb_form(pred) gbboxes2 = gbb_form(target) x1, y1, a1_, b1_, c1_ = gbboxes1[:, 0], gbboxes1[:, 1], gbboxes1[:, 2], gbboxes1[:, 3], gbboxes1[:, 4] x2, y2, a2_, b2_, c2_ = gbboxes2[:, 0], gbboxes2[:, 1], gbboxes2[:, 2], gbboxes2[:, 3], gbboxes2[:, 4] a1, b1, c1 = rotated_form(a1_, b1_, c1_) a2, b2, c2 = rotated_form(a2_, b2_, c2_) t1 = 0.25 * ((a1 + a2) * (paddle.pow(y1 - y2, 2)) + (b1 + b2) * (paddle.pow(x1 - x2, 2))) + \ 0.5 * ((c1+c2)*(x2-x1)*(y1-y2)) t2 = (a1 + a2) * (b1 + b2) - paddle.pow(c1 + c2, 2) t3_ = (a1 * b1 - c1 * c1) * (a2 * b2 - c2 * c2) t3 = 0.5 * paddle.log(t2 / (4 * paddle.sqrt(F.relu(t3_)) + eps)) B_d = (t1 / t2) + t3 # B_d = t1 + t2 + t3 B_d = paddle.clip(B_d, min=eps, max=100.0) l1 = paddle.sqrt(1.0 - paddle.exp(-B_d) + eps) l_i = paddle.pow(l1, 2.0) l2 = -paddle.log(1.0 - l_i + eps) if mode == 'l1': probiou = l1 if mode == 'l2': probiou = l2 return probiou @serializable @register class ProbIoULoss(object): """ ProbIoU Loss, refer to https://arxiv.org/abs/2106.06072 for details """ def __init__(self, mode='l1', eps=1e-3): super(ProbIoULoss, self).__init__() self.mode = mode self.eps = eps def __call__(self, pred_rboxes, assigned_rboxes): return probiou_loss(pred_rboxes, assigned_rboxes, self.eps, self.mode) ================================================ FILE: ppdet/modeling/losses/queryinst_loss.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling.losses.iou_loss import GIoULoss from .sparsercnn_loss import HungarianMatcher __all__ = ['QueryInstLoss'] @register class QueryInstLoss(object): __shared__ = ['num_classes'] def __init__(self, num_classes=80, focal_loss_alpha=0.25, focal_loss_gamma=2.0, class_weight=2.0, l1_weight=5.0, giou_weight=2.0, mask_weight=8.0): super(QueryInstLoss, self).__init__() self.num_classes = num_classes self.focal_loss_alpha = focal_loss_alpha self.focal_loss_gamma = focal_loss_gamma self.loss_weights = { "loss_cls": class_weight, "loss_bbox": l1_weight, "loss_giou": giou_weight, "loss_mask": mask_weight } self.giou_loss = GIoULoss(eps=1e-6, reduction='sum') self.matcher = HungarianMatcher(focal_loss_alpha, focal_loss_gamma, class_weight, l1_weight, giou_weight) def loss_classes(self, class_logits, targets, indices, avg_factor): tgt_labels = paddle.full( class_logits.shape[:2], self.num_classes, dtype='int32') if sum(len(v['labels']) for v in targets) > 0: tgt_classes = paddle.concat([ paddle.gather( tgt['labels'], tgt_idx, axis=0) for tgt, (_, tgt_idx) in zip(targets, indices) ]) batch_idx, src_idx = self._get_src_permutation_idx(indices) for i, (batch_i, src_i) in enumerate(zip(batch_idx, src_idx)): tgt_labels[int(batch_i), int(src_i)] = tgt_classes[i] tgt_labels = tgt_labels.flatten(0, 1).unsqueeze(-1) tgt_labels_onehot = paddle.cast( tgt_labels == paddle.arange(0, self.num_classes), dtype='float32') tgt_labels_onehot.stop_gradient = True src_logits = class_logits.flatten(0, 1) loss_cls = F.sigmoid_focal_loss( src_logits, tgt_labels_onehot, alpha=self.focal_loss_alpha, gamma=self.focal_loss_gamma, reduction='sum') / avg_factor losses = {'loss_cls': loss_cls * self.loss_weights['loss_cls']} return losses def loss_bboxes(self, bbox_pred, targets, indices, avg_factor): bboxes = paddle.concat([ paddle.gather( src, src_idx, axis=0) for src, (src_idx, _) in zip(bbox_pred, indices) ]) tgt_bboxes = paddle.concat([ paddle.gather( tgt['boxes'], tgt_idx, axis=0) for tgt, (_, tgt_idx) in zip(targets, indices) ]) tgt_bboxes.stop_gradient = True im_shapes = paddle.concat([tgt['img_whwh_tgt'] for tgt in targets]) bboxes_norm = bboxes / im_shapes tgt_bboxes_norm = tgt_bboxes / im_shapes loss_giou = self.giou_loss(bboxes, tgt_bboxes) / avg_factor loss_bbox = F.l1_loss( bboxes_norm, tgt_bboxes_norm, reduction='sum') / avg_factor losses = { 'loss_bbox': loss_bbox * self.loss_weights['loss_bbox'], 'loss_giou': loss_giou * self.loss_weights['loss_giou'] } return losses def loss_masks(self, pos_bbox_pred, mask_logits, targets, indices, avg_factor): tgt_segm = [ paddle.gather( tgt['gt_segm'], tgt_idx, axis=0) for tgt, (_, tgt_idx) in zip(targets, indices) ] tgt_masks = [] for i in range(len(indices)): gt_segm = tgt_segm[i].unsqueeze(1) if len(gt_segm) == 0: continue boxes = 
pos_bbox_pred[i] boxes[:, 0::2] = paddle.clip( boxes[:, 0::2], min=0, max=gt_segm.shape[3]) boxes[:, 1::2] = paddle.clip( boxes[:, 1::2], min=0, max=gt_segm.shape[2]) boxes_num = paddle.to_tensor([1] * len(boxes), dtype='int32') gt_mask = paddle.vision.ops.roi_align( gt_segm, boxes, boxes_num, output_size=mask_logits.shape[-2:], aligned=True) tgt_masks.append(gt_mask) tgt_masks = paddle.concat(tgt_masks).squeeze(1) tgt_masks = paddle.cast(tgt_masks >= 0.5, dtype='float32') tgt_masks.stop_gradient = True tgt_labels = paddle.concat([ paddle.gather( tgt['labels'], tgt_idx, axis=0) for tgt, (_, tgt_idx) in zip(targets, indices) ]) mask_label = F.one_hot(tgt_labels, self.num_classes).unsqueeze([2, 3]) mask_label = paddle.expand_as(mask_label, mask_logits) mask_label.stop_gradient = True src_masks = paddle.gather_nd(mask_logits, paddle.nonzero(mask_label)) shape = mask_logits.shape src_masks = paddle.reshape(src_masks, [shape[0], shape[2], shape[3]]) src_masks = F.sigmoid(src_masks) X = src_masks.flatten(1) Y = tgt_masks.flatten(1) inter = paddle.sum(X * Y, 1) union = paddle.sum(X * X, 1) + paddle.sum(Y * Y, 1) dice = (2 * inter) / (union + 2e-5) loss_mask = (1 - dice).sum() / avg_factor losses = {'loss_mask': loss_mask * self.loss_weights['loss_mask']} return losses @staticmethod def _get_src_permutation_idx(indices): batch_idx = paddle.concat( [paddle.full_like(src, i) for i, (src, _) in enumerate(indices)]) src_idx = paddle.concat([src for (src, _) in indices]) return batch_idx, src_idx ================================================ FILE: ppdet/modeling/losses/smooth_l1_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register __all__ = ['SmoothL1Loss'] @register class SmoothL1Loss(nn.Layer): """Smooth L1 Loss. Args: beta (float): controls smooth region, it becomes L1 Loss when beta=0.0 loss_weight (float): the final loss will be multiplied by this """ def __init__(self, beta=1.0, loss_weight=1.0): super(SmoothL1Loss, self).__init__() assert beta >= 0 self.beta = beta self.loss_weight = loss_weight def forward(self, pred, target, reduction='none'): """forward function, based on fvcore. 
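It computes 0.5 * n ** 2 / beta for n = |pred - target| < beta and n - 0.5 * beta otherwise; when beta is below 1e-5 it falls back to a plain L1 loss.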
Args: pred (Tensor): prediction tensor target (Tensor): target tensor, pred.shape must be the same as target.shape reduction (str): the way to reduce loss, one of (none, sum, mean) """ assert reduction in ('none', 'sum', 'mean') target = target.detach() if self.beta < 1e-5: loss = paddle.abs(pred - target) else: n = paddle.abs(pred - target) cond = n < self.beta loss = paddle.where(cond, 0.5 * n ** 2 / self.beta, n - 0.5 * self.beta) if reduction == 'mean': loss = loss.mean() if loss.size > 0 else 0.0 * loss.sum() elif reduction == 'sum': loss = loss.sum() return loss * self.loss_weight ================================================ FILE: ppdet/modeling/losses/solov2_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn.functional as F from ppdet.core.workspace import register, serializable __all__ = ['SOLOv2Loss'] @register @serializable class SOLOv2Loss(object): """ SOLOv2Loss Args: ins_loss_weight (float): Weight of instance loss. focal_loss_gamma (float): Gamma parameter for focal loss. focal_loss_alpha (float): Alpha parameter for focal loss. """ def __init__(self, ins_loss_weight=3.0, focal_loss_gamma=2.0, focal_loss_alpha=0.25): self.ins_loss_weight = ins_loss_weight self.focal_loss_gamma = focal_loss_gamma self.focal_loss_alpha = focal_loss_alpha def _dice_loss(self, input, target): input = paddle.reshape(input, shape=(input.shape[0], -1)) target = paddle.reshape(target, shape=(target.shape[0], -1)) a = paddle.sum(input * target, axis=1) b = paddle.sum(input * input, axis=1) + 0.001 c = paddle.sum(target * target, axis=1) + 0.001 d = (2 * a) / (b + c) return 1 - d def __call__(self, ins_pred_list, ins_label_list, cate_preds, cate_labels, num_ins): """ Get loss of network of SOLOv2. Args: ins_pred_list (list): Variable list of instance branch output. ins_label_list (list): List of instance labels per batch. cate_preds (list): Concat Variable list of category branch output. cate_labels (list): Concat list of category labels per batch. num_ins (int): Number of positive samples in a mini-batch. Returns: loss_ins (Variable): The instance loss Variable of SOLOv2 network. loss_cate (Variable): The category loss Variable of SOLOv2 network. """ #1. Use dice_loss to calculate instance loss
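# For each grid cell, _dice_loss flattens the masks and computes
# dice = 2 * sum(p * t) / (sum(p * p) + sum(t * t)); cells whose target
# mask is empty receive zero weight below, so they do not contribute.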
loss_ins = [] total_weights = paddle.zeros(shape=[1], dtype='float32') for input, target in zip(ins_pred_list, ins_label_list): if input is None: continue target = paddle.cast(target, 'float32') target = paddle.reshape( target, shape=[-1, input.shape[-2], input.shape[-1]]) weights = paddle.cast( paddle.sum(target, axis=[1, 2]) > 0, 'float32') input = F.sigmoid(input) dice_out = paddle.multiply(self._dice_loss(input, target), weights) total_weights += paddle.sum(weights) loss_ins.append(dice_out) loss_ins = paddle.sum(paddle.concat(loss_ins)) / total_weights loss_ins = loss_ins * self.ins_loss_weight #2. Use sigmoid_focal_loss to calculate category loss # expand onehot labels num_classes = cate_preds.shape[-1] cate_labels_bin = F.one_hot(cate_labels, num_classes=num_classes + 1) cate_labels_bin = cate_labels_bin[:, 1:] loss_cate = F.sigmoid_focal_loss( cate_preds, label=cate_labels_bin, normalizer=num_ins + 1., gamma=self.focal_loss_gamma, alpha=self.focal_loss_alpha) return loss_ins, loss_cate ================================================ FILE: ppdet/modeling/losses/sparsercnn_loss.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/PeizeSun/SparseR-CNN/blob/main/projects/SparseRCNN/sparsercnn/loss.py The copyright of PeizeSun/SparseR-CNN is as follows: MIT License [see LICENSE for details] """ from __future__ import absolute_import from __future__ import division from __future__ import print_function from scipy.optimize import linear_sum_assignment import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.metric import accuracy from ppdet.core.workspace import register from ppdet.modeling.losses.iou_loss import GIoULoss __all__ = ["SparseRCNNLoss"] @register class SparseRCNNLoss(nn.Layer): """ This class computes the loss for SparseRCNN. The process happens in two steps: 1) we compute Hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) """ __shared__ = ['num_classes'] def __init__(self, losses, focal_loss_alpha, focal_loss_gamma, num_classes=80, class_weight=2., l1_weight=5., giou_weight=2.): """ Create the criterion. Parameters: num_classes: number of object categories, omitting the special no-object category weight_dict: dict containing as key the names of the losses and as values their relative weight. losses: list of all the losses to be applied. See get_loss for list of available losses.
matcher: module able to compute a matching between targets and proposals """ super().__init__() self.num_classes = num_classes weight_dict = { "loss_ce": class_weight, "loss_bbox": l1_weight, "loss_giou": giou_weight } self.weight_dict = weight_dict self.losses = losses self.giou_loss = GIoULoss(reduction="sum") self.focal_loss_alpha = focal_loss_alpha self.focal_loss_gamma = focal_loss_gamma self.matcher = HungarianMatcher(focal_loss_alpha, focal_loss_gamma, class_weight, l1_weight, giou_weight) def loss_labels(self, outputs, targets, indices, num_boxes, log=True): """Classification loss (NLL) targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] """ assert 'pred_logits' in outputs src_logits = outputs['pred_logits'] idx = self._get_src_permutation_idx(indices) target_classes_o = paddle.concat([ paddle.gather( t["labels"], J, axis=0) for t, (_, J) in zip(targets, indices) ]) target_classes = paddle.full( src_logits.shape[:2], self.num_classes, dtype="int32") for i, ind in enumerate(zip(idx[0], idx[1])): target_classes[int(ind[0]), int(ind[1])] = target_classes_o[i] target_classes.stop_gradient = True src_logits = src_logits.flatten(start_axis=0, stop_axis=1) # prepare one_hot target. target_classes = target_classes.flatten(start_axis=0, stop_axis=1) class_ids = paddle.arange(0, self.num_classes) labels = (target_classes.unsqueeze(-1) == class_ids.astype(target_classes.dtype)).astype("float32") labels.stop_gradient = True # comp focal loss. class_loss = sigmoid_focal_loss( src_logits, labels, alpha=self.focal_loss_alpha, gamma=self.focal_loss_gamma, reduction="sum", ) / num_boxes losses = {'loss_ce': class_loss} if log: label_acc = target_classes_o.unsqueeze(-1) src_idx = [src for (src, _) in indices] pred_list = [] for i in range(outputs["pred_logits"].shape[0]): pred_list.append( paddle.gather( outputs["pred_logits"][i], src_idx[i], axis=0)) pred = F.sigmoid(paddle.concat(pred_list, axis=0)) acc = accuracy(pred, label_acc.astype("int64")) losses["acc"] = acc return losses def loss_boxes(self, outputs, targets, indices, num_boxes): """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. 
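The L1 term is computed on boxes normalized by img_whwh_tgt, while the GIoU term uses the absolute coordinates.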
""" assert 'pred_boxes' in outputs # [batch_size, num_proposals, 4] src_idx = [src for (src, _) in indices] src_boxes_list = [] for i in range(outputs["pred_boxes"].shape[0]): src_boxes_list.append( paddle.gather( outputs["pred_boxes"][i], src_idx[i], axis=0)) src_boxes = paddle.concat(src_boxes_list, axis=0) target_boxes = paddle.concat( [ paddle.gather( t['boxes'], I, axis=0) for t, (_, I) in zip(targets, indices) ], axis=0) target_boxes.stop_gradient = True losses = {} losses['loss_giou'] = self.giou_loss(src_boxes, target_boxes) / num_boxes image_size = paddle.concat([v["img_whwh_tgt"] for v in targets]) src_boxes_ = src_boxes / image_size target_boxes_ = target_boxes / image_size loss_bbox = F.l1_loss(src_boxes_, target_boxes_, reduction='sum') losses['loss_bbox'] = loss_bbox / num_boxes return losses def _get_src_permutation_idx(self, indices): # permute predictions following indices batch_idx = paddle.concat( [paddle.full_like(src, i) for i, (src, _) in enumerate(indices)]) src_idx = paddle.concat([src for (src, _) in indices]) return batch_idx, src_idx def _get_tgt_permutation_idx(self, indices): # permute targets following indices batch_idx = paddle.concat( [paddle.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) tgt_idx = paddle.concat([tgt for (_, tgt) in indices]) return batch_idx, tgt_idx def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): loss_map = { 'labels': self.loss_labels, 'boxes': self.loss_boxes, } assert loss in loss_map, f'do you really want to compute {loss} loss?' return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) def forward(self, outputs, targets): """ This performs the loss computation. Parameters: outputs: dict of tensors, see the output specification of the model for the format targets: list of dicts, such that len(targets) == batch_size. The expected keys in each dict depends on the losses applied, see each loss' doc """ outputs_without_aux = { k: v for k, v in outputs.items() if k != 'aux_outputs' } # Retrieve the matching between the outputs of the last layer and the targets indices = self.matcher(outputs_without_aux, targets) # Compute the average number of target boxes across all nodes, for normalization purposes num_boxes = sum(len(t["labels"]) for t in targets) num_boxes = paddle.to_tensor( [num_boxes], dtype="float32", place=next(iter(outputs.values())).place) # Compute all the requested losses losses = {} for loss in self.losses: losses.update( self.get_loss(loss, outputs, targets, indices, num_boxes)) # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. if 'aux_outputs' in outputs: for i, aux_outputs in enumerate(outputs['aux_outputs']): indices = self.matcher(aux_outputs, targets) for loss in self.losses: kwargs = {} if loss == 'labels': # Logging is enabled only for the last layer kwargs = {'log': False} l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs) w_dict = {} for k in l_dict.keys(): if k in self.weight_dict: w_dict[k + f'_{i}'] = l_dict[k] * self.weight_dict[ k] else: w_dict[k + f'_{i}'] = l_dict[k] losses.update(w_dict) return losses class HungarianMatcher(nn.Layer): """This class computes an assignment between the targets and the predictions of the network For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are un-matched (and thus treated as non-objects). 
""" def __init__(self, focal_loss_alpha, focal_loss_gamma, cost_class: float=1, cost_bbox: float=1, cost_giou: float=1): """Creates the matcher Params: cost_class: This is the relative weight of the classification error in the matching cost cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost """ super().__init__() self.cost_class = cost_class self.cost_bbox = cost_bbox self.cost_giou = cost_giou self.focal_loss_alpha = focal_loss_alpha self.focal_loss_gamma = focal_loss_gamma assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" @paddle.no_grad() def forward(self, outputs, targets): """ Performs the matching Args: outputs: This is a dict that contains at least these entries: "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates eg. outputs = {"pred_logits": pred_logits, "pred_boxes": pred_boxes} targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth objects in the target) containing the class labels "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates eg. targets = [{"labels":labels, "boxes": boxes}, ...,{"labels":labels, "boxes": boxes}] Returns: A list of size batch_size, containing tuples of (index_i, index_j) where: - index_i is the indices of the selected predictions (in order) - index_j is the indices of the corresponding selected targets (in order) For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) """ bs, num_queries = outputs["pred_logits"].shape[:2] if sum(len(v["labels"]) for v in targets) == 0: return [(paddle.to_tensor( [], dtype=paddle.int64), paddle.to_tensor( [], dtype=paddle.int64)) for _ in range(bs)] # We flatten to compute the cost matrices in a batch out_prob = F.sigmoid(outputs["pred_logits"].flatten( start_axis=0, stop_axis=1)) out_bbox = outputs["pred_boxes"].flatten(start_axis=0, stop_axis=1) # Also concat the target labels and boxes tgt_ids = paddle.concat([v["labels"] for v in targets]) assert (tgt_ids > -1).all() tgt_bbox = paddle.concat([v["boxes"] for v in targets]) # Compute the classification cost. Contrary to the loss, we don't use the NLL, # but approximate it in 1 - proba[target class]. # The 1 is a constant that doesn't change the matching, it can be ommitted. # Compute the classification cost. 
alpha = self.focal_loss_alpha gamma = self.focal_loss_gamma neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-( 1 - out_prob + 1e-8).log()) pos_cost_class = alpha * ((1 - out_prob) **gamma) * (-(out_prob + 1e-8).log()) cost_class = paddle.gather( pos_cost_class, tgt_ids, axis=1) - paddle.gather( neg_cost_class, tgt_ids, axis=1) # Compute the L1 cost between boxes image_size_out = paddle.concat( [v["img_whwh"].unsqueeze(0) for v in targets]) image_size_out = image_size_out.unsqueeze(1).tile( [1, num_queries, 1]).flatten( start_axis=0, stop_axis=1) image_size_tgt = paddle.concat([v["img_whwh_tgt"] for v in targets]) out_bbox_ = out_bbox / image_size_out tgt_bbox_ = tgt_bbox / image_size_tgt cost_bbox = F.l1_loss( out_bbox_.unsqueeze(-2), tgt_bbox_, reduction='none').sum(-1) # [batch_size * num_queries, num_tgts] # Compute the giou cost between boxes cost_giou = -get_bboxes_giou(out_bbox, tgt_bbox) # Final cost matrix C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou C = C.reshape([bs, num_queries, -1]) sizes = [len(v["boxes"]) for v in targets] indices = [ linear_sum_assignment(c[i].numpy()) for i, c in enumerate(C.split(sizes, -1)) ] return [(paddle.to_tensor( i, dtype="int32"), paddle.to_tensor( j, dtype="int32")) for i, j in indices] def box_area(boxes): assert (boxes[:, 2:] >= boxes[:, :2]).all() wh = boxes[:, 2:] - boxes[:, :2] return wh[:, 0] * wh[:, 1] def boxes_iou(boxes1, boxes2): ''' Compute iou Args: boxes1 (paddle.tensor) shape (N, 4) boxes2 (paddle.tensor) shape (M, 4) Return: (paddle.tensor) shape (N, M) ''' area1 = box_area(boxes1) area2 = box_area(boxes2) lt = paddle.maximum(boxes1.unsqueeze(-2)[:, :, :2], boxes2[:, :2]) rb = paddle.minimum(boxes1.unsqueeze(-2)[:, :, 2:], boxes2[:, 2:]) wh = (rb - lt).astype("float32").clip(min=1e-9) inter = wh[:, :, 0] * wh[:, :, 1] union = area1.unsqueeze(-1) + area2 - inter + 1e-9 iou = inter / union return iou, union def get_bboxes_giou(boxes1, boxes2, eps=1e-9): """calculate the GIoU of boxes1 and boxes2 Args: boxes1 (Tensor): shape [N, 4] boxes2 (Tensor): shape [M, 4] eps (float): epsilon to avoid divide by zero Return: gious (Tensor): GIoU values of boxes1 and boxes2, with the shape [N, M] """ assert (boxes1[:, 2:] >= boxes1[:, :2]).all() assert (boxes2[:, 2:] >= boxes2[:, :2]).all() iou, union = boxes_iou(boxes1, boxes2) lt = paddle.minimum(boxes1.unsqueeze(-2)[:, :, :2], boxes2[:, :2]) rb = paddle.maximum(boxes1.unsqueeze(-2)[:, :, 2:], boxes2[:, 2:]) wh = (rb - lt).astype("float32").clip(min=eps) enclose_area = wh[:, :, 0] * wh[:, :, 1] giou = iou - (enclose_area - union) / enclose_area return giou def sigmoid_focal_loss(inputs, targets, alpha, gamma, reduction="sum"): assert reduction in ["sum", "mean" ], f'unsupported reduction: {reduction}' p = F.sigmoid(inputs) ce_loss = F.binary_cross_entropy_with_logits( inputs, targets, reduction="none") p_t = p * targets + (1 - p) * (1 - targets) loss = ce_loss * ((1 - p_t)**gamma) if alpha >= 0: alpha_t = alpha * targets + (1 - alpha) * (1 - targets) loss = alpha_t * loss if reduction == "mean": loss = loss.mean() elif reduction == "sum": loss = loss.sum() return loss ================================================ FILE: ppdet/modeling/losses/ssd_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..bbox_utils import iou_similarity, bbox2delta __all__ = ['SSDLoss'] @register class SSDLoss(nn.Layer): """ SSDLoss Args: overlap_threshold (float32, optional): IoU threshold for negative bboxes and positive bboxes, 0.5 by default. neg_pos_ratio (float): The ratio of negative samples / positive samples. loc_loss_weight (float): The weight of loc_loss. conf_loss_weight (float): The weight of conf_loss. prior_box_var (list): Variances corresponding to prior box coord, [0.1, 0.1, 0.2, 0.2] by default. """ def __init__(self, overlap_threshold=0.5, neg_pos_ratio=3.0, loc_loss_weight=1.0, conf_loss_weight=1.0, prior_box_var=[0.1, 0.1, 0.2, 0.2]): super(SSDLoss, self).__init__() self.overlap_threshold = overlap_threshold self.neg_pos_ratio = neg_pos_ratio self.loc_loss_weight = loc_loss_weight self.conf_loss_weight = conf_loss_weight self.prior_box_var = [1. / a for a in prior_box_var] def _bipartite_match_for_batch(self, gt_bbox, gt_label, prior_boxes, bg_index): """ Args: gt_bbox (Tensor): [B, N, 4] gt_label (Tensor): [B, N, 1] prior_boxes (Tensor): [A, 4] bg_index (int): Background class index """ batch_size, num_priors = gt_bbox.shape[0], prior_boxes.shape[0] ious = iou_similarity(gt_bbox.reshape((-1, 4)), prior_boxes).reshape( (batch_size, -1, num_priors)) # For each prior box, get the max IoU of all GTs. prior_max_iou, prior_argmax_iou = ious.max(axis=1), ious.argmax(axis=1) # For each GT, get the max IoU of all prior boxes. gt_max_iou, gt_argmax_iou = ious.max(axis=2), ious.argmax(axis=2) # Gather target bbox and label according to 'prior_argmax_iou' index. batch_ind = paddle.arange(end=batch_size, dtype='int64').unsqueeze(-1) prior_argmax_iou = paddle.stack( [batch_ind.tile([1, num_priors]), prior_argmax_iou], axis=-1) targets_bbox = paddle.gather_nd(gt_bbox, prior_argmax_iou) targets_label = paddle.gather_nd(gt_label, prior_argmax_iou) # Assign negative bg_index_tensor = paddle.full([batch_size, num_priors, 1], bg_index, 'int64') targets_label = paddle.where( prior_max_iou.unsqueeze(-1) < self.overlap_threshold, bg_index_tensor, targets_label) # Ensure each GT can match the max IoU prior box. 
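# The (batch, prior) pairs are flattened into a single index so that
# paddle.scatter can overwrite, for every GT, the row of its highest-IoU
# prior with that GT's box and label; this guarantees each GT keeps at
# least one positive prior even if all its IoUs fall below overlap_threshold.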
batch_ind = (batch_ind * num_priors + gt_argmax_iou).flatten() targets_bbox = paddle.scatter( targets_bbox.reshape([-1, 4]), batch_ind, gt_bbox.reshape([-1, 4])).reshape([batch_size, -1, 4]) targets_label = paddle.scatter( targets_label.reshape([-1, 1]), batch_ind, gt_label.reshape([-1, 1])).reshape([batch_size, -1, 1]) targets_label[:, :1] = bg_index # Encode box prior_boxes = prior_boxes.unsqueeze(0).tile([batch_size, 1, 1]) targets_bbox = bbox2delta( prior_boxes.reshape([-1, 4]), targets_bbox.reshape([-1, 4]), self.prior_box_var) targets_bbox = targets_bbox.reshape([batch_size, -1, 4]) return targets_bbox, targets_label def _mine_hard_example(self, conf_loss, targets_label, bg_index, mine_neg_ratio=0.01): pos = (targets_label != bg_index).astype(conf_loss.dtype) num_pos = pos.sum(axis=1, keepdim=True) neg = (targets_label == bg_index).astype(conf_loss.dtype) conf_loss = conf_loss.detach() * neg loss_idx = conf_loss.argsort(axis=1, descending=True) idx_rank = loss_idx.argsort(axis=1) num_negs = [] for i in range(conf_loss.shape[0]): cur_num_pos = num_pos[i] num_neg = paddle.clip( cur_num_pos * self.neg_pos_ratio, max=pos.shape[1]) num_neg = num_neg if num_neg > 0 else paddle.to_tensor( [pos.shape[1] * mine_neg_ratio]) num_negs.append(num_neg) num_negs = paddle.stack(num_negs).expand_as(idx_rank) neg_mask = (idx_rank.astype(num_negs.dtype) < num_negs).astype(conf_loss.dtype) return (neg_mask + pos).astype('bool') def forward(self, boxes, scores, gt_bbox, gt_label, prior_boxes): boxes = paddle.concat(boxes, axis=1) scores = paddle.concat(scores, axis=1) gt_label = gt_label.unsqueeze(-1).astype('int64') prior_boxes = paddle.concat(prior_boxes, axis=0) bg_index = scores.shape[-1] - 1 # Match bbox and get targets. targets_bbox, targets_label = \ self._bipartite_match_for_batch(gt_bbox, gt_label, prior_boxes, bg_index) targets_bbox.stop_gradient = True targets_label.stop_gradient = True # Compute regression loss. # Select positive samples. bbox_mask = paddle.tile(targets_label != bg_index, [1, 1, 4]) if bbox_mask.astype(boxes.dtype).sum() > 0: location = paddle.masked_select(boxes, bbox_mask) targets_bbox_tmp = paddle.masked_select(targets_bbox, bbox_mask) loc_loss = F.smooth_l1_loss(location, targets_bbox_tmp, reduction='sum') loc_loss = loc_loss * self.loc_loss_weight else: loc_loss = paddle.zeros([]) # Compute confidence loss. conf_loss = F.cross_entropy(scores, targets_label, reduction="none") # Mining hard examples. label_mask = self._mine_hard_example( conf_loss.squeeze(-1), targets_label.squeeze(-1), bg_index) conf_loss = paddle.masked_select(conf_loss, label_mask.unsqueeze(-1)) conf_loss = conf_loss.sum() * self.conf_loss_weight # Compute overall weighted loss. normalizer = (targets_label != bg_index).astype('float32').sum().clip( min=1) loss = (conf_loss + loc_loss) / normalizer return loss ================================================ FILE: ppdet/modeling/losses/supcontrast.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F import random from ppdet.core.workspace import register __all__ = ['SupContrast'] @register class SupContrast(nn.Layer): __shared__ = [ 'num_classes' ] def __init__(self, num_classes=80, temperature=2.5, sample_num=4096, thresh=0.75): super(SupContrast, self).__init__() self.num_classes = num_classes self.temperature = temperature self.sample_num = sample_num self.thresh = thresh def forward(self, features, labels, scores): assert features.shape[0] == labels.shape[0] == scores.shape[0] positive_mask = (labels < self.num_classes) positive_features, positive_labels, positive_scores = features[positive_mask], labels[positive_mask], \ scores[positive_mask] negative_mask = (labels == self.num_classes) negative_features, negative_labels, negative_scores = features[negative_mask], labels[negative_mask], \ scores[negative_mask] N = negative_features.shape[0] S = self.sample_num - positive_mask.sum() index = paddle.to_tensor(random.sample(range(N), int(S)), dtype='int32') negative_features = paddle.index_select(x=negative_features, index=index, axis=0) negative_labels = paddle.index_select(x=negative_labels, index=index, axis=0) negative_scores = paddle.index_select(x=negative_scores, index=index, axis=0) features = paddle.concat([positive_features, negative_features], 0) labels = paddle.concat([positive_labels, negative_labels], 0) scores = paddle.concat([positive_scores, negative_scores], 0) if len(labels.shape) == 1: labels = labels.reshape([-1, 1]) label_mask = paddle.equal(labels, labels.T).detach() similarity = (paddle.matmul(features, features.T) / self.temperature) sim_row_max = paddle.max(similarity, axis=1, keepdim=True) similarity = similarity - sim_row_max logits_mask = paddle.ones_like(similarity).detach() logits_mask.fill_diagonal_(0) exp_sim = paddle.exp(similarity) * logits_mask log_prob = similarity - paddle.log(exp_sim.sum(axis=1, keepdim=True)) per_label_log_prob = (log_prob * logits_mask * label_mask).sum(1) / label_mask.sum(1) keep = scores > self.thresh per_label_log_prob = per_label_log_prob[keep] loss = -per_label_log_prob return loss.mean() ================================================ FILE: ppdet/modeling/losses/varifocal_loss.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# The code is based on: # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/varifocal_loss.py from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.modeling import ops # from paddle.base.framework import in_dygraph_mode __all__ = ['VarifocalLoss'] def varifocal_loss(pred, target, alpha=0.75, gamma=2.0, iou_weighted=True, use_sigmoid=True): """`Varifocal Loss `_ Args: pred (Tensor): The prediction with shape (N, C), C is the number of classes target (Tensor): The learning target of the iou-aware classification score with shape (N, C), C is the number of classes. alpha (float, optional): A balance factor for the negative part of Varifocal Loss, which is different from the alpha of Focal Loss. Defaults to 0.75. gamma (float, optional): The gamma for calculating the modulating factor. Defaults to 2.0. iou_weighted (bool, optional): Whether to weight the loss of the positive example with the iou target. Defaults to True. """ # pred and target should be of the same size assert len(pred.shape) == len(target.shape) # rank # if in_dygraph_mode(): # assert pred.shape == target.shape if use_sigmoid: pred_new = F.sigmoid(pred) else: pred_new = pred target = target.cast(pred.dtype) if iou_weighted: focal_weight = target * (target > 0.0).cast('float32') + \ alpha * (pred_new - target).abs().pow(gamma) * \ (target <= 0.0).cast('float32') else: focal_weight = (target > 0.0).cast('float32') + \ alpha * (pred_new - target).abs().pow(gamma) * \ (target <= 0.0).cast('float32') if use_sigmoid: loss = F.binary_cross_entropy_with_logits( pred, target, reduction='none') * focal_weight else: loss = F.binary_cross_entropy( pred, target, reduction='none') * focal_weight loss = loss.sum(axis=1) return loss @register @serializable class VarifocalLoss(nn.Layer): def __init__(self, use_sigmoid=True, alpha=0.75, gamma=2.0, iou_weighted=True, reduction='mean', loss_weight=1.0): """`Varifocal Loss `_ Args: use_sigmoid (bool, optional): Whether the prediction is used for sigmoid or softmax. Defaults to True. alpha (float, optional): A balance factor for the negative part of Varifocal Loss, which is different from the alpha of Focal Loss. Defaults to 0.75. gamma (float, optional): The gamma for calculating the modulating factor. Defaults to 2.0. iou_weighted (bool, optional): Whether to weight the loss of the positive examples with the iou target. Defaults to True. reduction (str, optional): The method used to reduce the loss into a scalar. Defaults to 'mean'. Options are "none", "mean" and "sum". loss_weight (float, optional): Weight of loss. Defaults to 1.0. """ super(VarifocalLoss, self).__init__() assert alpha >= 0.0 self.use_sigmoid = use_sigmoid self.alpha = alpha self.gamma = gamma self.iou_weighted = iou_weighted self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, weight=None, avg_factor=None): """Forward function. Args: pred (Tensor): The prediction. target (Tensor): The learning target of the prediction. weight (Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. 
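Note: avg_factor only takes effect when reduction is 'mean' (the loss is summed and divided by avg_factor); it is ignored when reduction is 'none' and raises a ValueError when reduction is 'sum'.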
Returns: Tensor: The calculated loss """ loss = self.loss_weight * varifocal_loss( pred, target, alpha=self.alpha, gamma=self.gamma, iou_weighted=self.iou_weighted, use_sigmoid=self.use_sigmoid) if weight is not None: loss = loss * weight if avg_factor is None: if self.reduction == 'none': return loss elif self.reduction == 'mean': return loss.mean() elif self.reduction == 'sum': return loss.sum() else: # if reduction is mean, then average the loss by avg_factor if self.reduction == 'mean': loss = loss.sum() / avg_factor # if reduction is 'none', then do nothing, otherwise raise an error elif self.reduction != 'none': raise ValueError( 'avg_factor can not be used with reduction="sum"') return loss ================================================ FILE: ppdet/modeling/losses/yolo_loss.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..bbox_utils import decode_yolo, xywh2xyxy, batch_iou_similarity __all__ = ['YOLOv3Loss'] def bbox_transform(pbox, anchor, downsample): pbox = decode_yolo(pbox, anchor, downsample) pbox = xywh2xyxy(pbox) return pbox @register class YOLOv3Loss(nn.Layer): __inject__ = ['iou_loss', 'iou_aware_loss'] __shared__ = ['num_classes'] def __init__(self, num_classes=80, ignore_thresh=0.7, label_smooth=False, downsample=[32, 16, 8], scale_x_y=1., iou_loss=None, iou_aware_loss=None): """ YOLOv3Loss layer Args: num_classes (int): number of foreground classes ignore_thresh (float): threshold to ignore confidence loss label_smooth (bool): whether to use label smoothing downsample (list): downsample ratio for each detection block scale_x_y (float): scale_x_y factor iou_loss (object): IoULoss instance iou_aware_loss (object): IouAwareLoss instance """ super(YOLOv3Loss, self).__init__() self.num_classes = num_classes self.ignore_thresh = ignore_thresh self.label_smooth = label_smooth self.downsample = downsample self.scale_x_y = scale_x_y self.iou_loss = iou_loss self.iou_aware_loss = iou_aware_loss self.distill_pairs = [] def obj_loss(self, pbox, gbox, pobj, tobj, anchor, downsample): # pbox pbox = decode_yolo(pbox, anchor, downsample) pbox = xywh2xyxy(pbox) pbox = paddle.concat(pbox, axis=-1) b = pbox.shape[0] pbox = pbox.reshape((b, -1, 4)) # gbox gxy = gbox[:, :, 0:2] - gbox[:, :, 2:4] * 0.5 gwh = gbox[:, :, 0:2] + gbox[:, :, 2:4] * 0.5 gbox = paddle.concat([gxy, gwh], axis=-1) iou = batch_iou_similarity(pbox, gbox) iou.stop_gradient = True iou_max = iou.max(2) # [N, M1] iou_mask = paddle.cast(iou_max <= self.ignore_thresh, dtype=pbox.dtype) iou_mask.stop_gradient = True pobj = pobj.reshape((b, -1)) tobj = tobj.reshape((b, -1)) obj_mask = paddle.cast(tobj > 0, dtype=pbox.dtype) obj_mask.stop_gradient = True loss_obj = F.binary_cross_entropy_with_logits( pobj,
obj_mask, reduction='none') loss_obj_pos = (loss_obj * tobj) loss_obj_neg = (loss_obj * (1 - obj_mask) * iou_mask) return loss_obj_pos + loss_obj_neg def cls_loss(self, pcls, tcls): if self.label_smooth: delta = min(1. / self.num_classes, 1. / 40) pos, neg = 1 - delta, delta # 1 for positive, 0 for negative tcls = pos * paddle.cast( tcls > 0., dtype=tcls.dtype) + neg * paddle.cast( tcls <= 0., dtype=tcls.dtype) loss_cls = F.binary_cross_entropy_with_logits( pcls, tcls, reduction='none') return loss_cls def yolov3_loss(self, p, t, gt_box, anchor, downsample, scale=1., eps=1e-10): na = len(anchor) b, c, h, w = p.shape if self.iou_aware_loss: ioup, p = p[:, 0:na, :, :], p[:, na:, :, :] ioup = ioup.unsqueeze(-1) p = p.reshape((b, na, -1, h, w)).transpose((0, 1, 3, 4, 2)) x, y = p[:, :, :, :, 0:1], p[:, :, :, :, 1:2] w, h = p[:, :, :, :, 2:3], p[:, :, :, :, 3:4] obj, pcls = p[:, :, :, :, 4:5], p[:, :, :, :, 5:] self.distill_pairs.append([x, y, w, h, obj, pcls]) t = t.transpose((0, 1, 3, 4, 2)) tx, ty = t[:, :, :, :, 0:1], t[:, :, :, :, 1:2] tw, th = t[:, :, :, :, 2:3], t[:, :, :, :, 3:4] tscale = t[:, :, :, :, 4:5] tobj, tcls = t[:, :, :, :, 5:6], t[:, :, :, :, 6:] tscale_obj = tscale * tobj loss = dict() x = scale * F.sigmoid(x) - 0.5 * (scale - 1.) y = scale * F.sigmoid(y) - 0.5 * (scale - 1.) if abs(scale - 1.) < eps: loss_x = F.binary_cross_entropy(x, tx, reduction='none') loss_y = F.binary_cross_entropy(y, ty, reduction='none') loss_xy = tscale_obj * (loss_x + loss_y) else: loss_x = paddle.abs(x - tx) loss_y = paddle.abs(y - ty) loss_xy = tscale_obj * (loss_x + loss_y) loss_xy = loss_xy.sum([1, 2, 3, 4]).mean() loss_w = paddle.abs(w - tw) loss_h = paddle.abs(h - th) loss_wh = tscale_obj * (loss_w + loss_h) loss_wh = loss_wh.sum([1, 2, 3, 4]).mean() loss['loss_xy'] = loss_xy loss['loss_wh'] = loss_wh if self.iou_loss is not None: # warn: do not modify x, y, w, h in place box, tbox = [x, y, w, h], [tx, ty, tw, th] pbox = bbox_transform(box, anchor, downsample) gbox = bbox_transform(tbox, anchor, downsample) loss_iou = self.iou_loss(pbox, gbox) loss_iou = loss_iou * tscale_obj loss_iou = loss_iou.sum([1, 2, 3, 4]).mean() loss['loss_iou'] = loss_iou if self.iou_aware_loss is not None: box, tbox = [x, y, w, h], [tx, ty, tw, th] pbox = bbox_transform(box, anchor, downsample) gbox = bbox_transform(tbox, anchor, downsample) loss_iou_aware = self.iou_aware_loss(ioup, pbox, gbox) loss_iou_aware = loss_iou_aware * tobj loss_iou_aware = loss_iou_aware.sum([1, 2, 3, 4]).mean() loss['loss_iou_aware'] = loss_iou_aware box = [x, y, w, h] loss_obj = self.obj_loss(box, gt_box, obj, tobj, anchor, downsample) loss_obj = loss_obj.sum(-1).mean() loss['loss_obj'] = loss_obj loss_cls = self.cls_loss(pcls, tcls) * tobj loss_cls = loss_cls.sum([1, 2, 3, 4]).mean() loss['loss_cls'] = loss_cls return loss def forward(self, inputs, targets, anchors): np = len(inputs) gt_targets = [targets['target{}'.format(i)] for i in range(np)] gt_box = targets['gt_bbox'] yolo_losses = dict() self.distill_pairs.clear() for x, t, anchor, downsample in zip(inputs, gt_targets, anchors, self.downsample): yolo_loss = self.yolov3_loss( x.astype('float32'), t, gt_box, anchor, downsample, self.scale_x_y) for k, v in yolo_loss.items(): if k in yolo_losses: yolo_losses[k] += v else: yolo_losses[k] = v loss = 0 for k, v in yolo_losses.items(): loss += v yolo_losses['loss'] = loss return yolo_losses ================================================ FILE: ppdet/modeling/mot/__init__.py ================================================ # Copyright 
(c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import matching from . import tracker from . import motion from . import visualization from . import utils from .matching import * from .tracker import * from .motion import * from .visualization import * from .utils import * ================================================ FILE: ppdet/modeling/mot/matching/__init__.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import jde_matching from . import deepsort_matching from . import ocsort_matching from .jde_matching import * from .deepsort_matching import * from .ocsort_matching import * ================================================ FILE: ppdet/modeling/mot/matching/deepsort_matching.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/nwojke/deep_sort/tree/master/deep_sort """ import numpy as np from scipy.optimize import linear_sum_assignment from ..motion import kalman_filter INFTY_COST = 1e+5 __all__ = [ 'iou_1toN', 'iou_cost', '_nn_euclidean_distance', '_nn_cosine_distance', 'NearestNeighborDistanceMetric', 'min_cost_matching', 'matching_cascade', 'gate_cost_matrix', ] def iou_1toN(bbox, candidates): """ Compute intersection over union (IoU) between one box and N candidates. Args: bbox (ndarray): A bounding box in format `(top left x, top left y, width, height)`. candidates (ndarray): A matrix of candidate bounding boxes (one per row) in the same format as `bbox`. Returns: ious (ndarray): The intersection over union in [0, 1] between the `bbox` and each candidate. A higher score means a larger fraction of the `bbox` is occluded by the candidate.
""" bbox_tl = bbox[:2] bbox_br = bbox[:2] + bbox[2:] candidates_tl = candidates[:, :2] candidates_br = candidates[:, :2] + candidates[:, 2:] tl = np.c_[np.maximum(bbox_tl[0], candidates_tl[:, 0])[:, np.newaxis], np.maximum(bbox_tl[1], candidates_tl[:, 1])[:, np.newaxis]] br = np.c_[np.minimum(bbox_br[0], candidates_br[:, 0])[:, np.newaxis], np.minimum(bbox_br[1], candidates_br[:, 1])[:, np.newaxis]] wh = np.maximum(0., br - tl) area_intersection = wh.prod(axis=1) area_bbox = bbox[2:].prod() area_candidates = candidates[:, 2:].prod(axis=1) ious = area_intersection / (area_bbox + area_candidates - area_intersection) return ious def iou_cost(tracks, detections, track_indices=None, detection_indices=None): """ IoU distance metric. Args: tracks (list[Track]): A list of tracks. detections (list[Detection]): A list of detections. track_indices (Optional[list[int]]): A list of indices to tracks that should be matched. Defaults to all `tracks`. detection_indices (Optional[list[int]]): A list of indices to detections that should be matched. Defaults to all `detections`. Returns: cost_matrix (ndarray): A cost matrix of shape len(track_indices), len(detection_indices) where entry (i, j) is `1 - iou(tracks[track_indices[i]], detections[detection_indices[j]])`. """ if track_indices is None: track_indices = np.arange(len(tracks)) if detection_indices is None: detection_indices = np.arange(len(detections)) cost_matrix = np.zeros((len(track_indices), len(detection_indices))) for row, track_idx in enumerate(track_indices): if tracks[track_idx].time_since_update > 1: cost_matrix[row, :] = 1e+5 continue bbox = tracks[track_idx].to_tlwh() candidates = np.asarray([detections[i].tlwh for i in detection_indices]) cost_matrix[row, :] = 1. - iou_1toN(bbox, candidates) return cost_matrix def _nn_euclidean_distance(s, q): """ Compute pair-wise squared (Euclidean) distance between points in `s` and `q`. Args: s (ndarray): Sample points: an NxM matrix of N samples of dimensionality M. q (ndarray): Query points: an LxM matrix of L samples of dimensionality M. Returns: distances (ndarray): A vector of length M that contains for each entry in `q` the smallest Euclidean distance to a sample in `s`. """ s, q = np.asarray(s), np.asarray(q) if len(s) == 0 or len(q) == 0: return np.zeros((len(s), len(q))) s2, q2 = np.square(s).sum(axis=1), np.square(q).sum(axis=1) distances = -2. * np.dot(s, q.T) + s2[:, None] + q2[None, :] distances = np.clip(distances, 0., float(np.inf)) return np.maximum(0.0, distances.min(axis=0)) def _nn_cosine_distance(s, q): """ Compute pair-wise cosine distance between points in `s` and `q`. Args: s (ndarray): Sample points: an NxM matrix of N samples of dimensionality M. q (ndarray): Query points: an LxM matrix of L samples of dimensionality M. Returns: distances (ndarray): A vector of length M that contains for each entry in `q` the smallest Euclidean distance to a sample in `s`. """ s = np.asarray(s) / np.linalg.norm(s, axis=1, keepdims=True) q = np.asarray(q) / np.linalg.norm(q, axis=1, keepdims=True) distances = 1. - np.dot(s, q.T) return distances.min(axis=0) class NearestNeighborDistanceMetric(object): """ A nearest neighbor distance metric that, for each target, returns the closest distance to any sample that has been observed so far. Args: metric (str): Either "euclidean" or "cosine". matching_threshold (float): The matching threshold. Samples with larger distance are considered an invalid match. budget (Optional[int]): If not None, fix samples per class to at most this number. 
Removes the oldest samples when the budget is reached. Attributes: samples (Dict[int -> List[ndarray]]): A dictionary that maps from target identities to the list of samples that have been observed so far. """ def __init__(self, metric, matching_threshold, budget=None): if metric == "euclidean": self._metric = _nn_euclidean_distance elif metric == "cosine": self._metric = _nn_cosine_distance else: raise ValueError( "Invalid metric; must be either 'euclidean' or 'cosine'") self.matching_threshold = matching_threshold self.budget = budget self.samples = {} def partial_fit(self, features, targets, active_targets): """ Update the distance metric with new data. Args: features (ndarray): An NxM matrix of N features of dimensionality M. targets (ndarray): An integer array of associated target identities. active_targets (List[int]): A list of targets that are currently present in the scene. """ for feature, target in zip(features, targets): self.samples.setdefault(target, []).append(feature) if self.budget is not None: self.samples[target] = self.samples[target][-self.budget:] self.samples = {k: self.samples[k] for k in active_targets} def distance(self, features, targets): """ Compute distance between features and targets. Args: features (ndarray): An NxM matrix of N features of dimensionality M. targets (list[int]): A list of targets to match the given `features` against. Returns: cost_matrix (ndarray): a cost matrix of shape len(targets), len(features), where element (i, j) contains the closest squared distance between `targets[i]` and `features[j]`. """ cost_matrix = np.zeros((len(targets), len(features))) for i, target in enumerate(targets): cost_matrix[i, :] = self._metric(self.samples[target], features) return cost_matrix def min_cost_matching(distance_metric, max_distance, tracks, detections, track_indices=None, detection_indices=None): """ Solve linear assignment problem. Args: distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray The distance metric is given a list of tracks and detections as well as a list of N track indices and M detection indices. The metric should return the NxM dimensional cost matrix, where element (i, j) is the association cost between the i-th track in the given track indices and the j-th detection in the given detection_indices. max_distance (float): Gating threshold. Associations with cost larger than this value are disregarded. tracks (list[Track]): A list of predicted tracks at the current time step. detections (list[Detection]): A list of detections at the current time step. track_indices (list[int]): List of track indices that maps rows in `cost_matrix` to tracks in `tracks`. detection_indices (List[int]): List of detection indices that maps columns in `cost_matrix` to detections in `detections`. Returns: A tuple (List[(int, int)], List[int], List[int]) with the following three entries: * A list of matched track and detection indices. * A list of unmatched track indices. * A list of unmatched detection indices. """ if track_indices is None: track_indices = np.arange(len(tracks)) if detection_indices is None: detection_indices = np.arange(len(detections)) if len(detection_indices) == 0 or len(track_indices) == 0: return [], track_indices, detection_indices # Nothing to match. 
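# --- Added note (commentary, not part of the original deep_sort code): the
# steps below build the cost matrix, clamp every entry above `max_distance`
# to `max_distance + 1e-5` so the solver still returns a complete assignment,
# and afterwards drop any matched pair whose cost exceeds the gate. A minimal
# sketch of the same gating idea with made-up numbers:
#     cost = np.array([[0.2, 0.9], [0.8, 0.3]])
#     cost[cost > 0.5] = 0.5 + 1e-5
#     rows, cols = linear_sum_assignment(cost)
#     kept = [(r, c) for r, c in zip(rows, cols) if cost[r, c] <= 0.5]
#     # kept == [(0, 0), (1, 1)]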
cost_matrix = distance_metric(tracks, detections, track_indices, detection_indices) cost_matrix[cost_matrix > max_distance] = max_distance + 1e-5 indices = linear_sum_assignment(cost_matrix) matches, unmatched_tracks, unmatched_detections = [], [], [] for col, detection_idx in enumerate(detection_indices): if col not in indices[1]: unmatched_detections.append(detection_idx) for row, track_idx in enumerate(track_indices): if row not in indices[0]: unmatched_tracks.append(track_idx) for row, col in zip(indices[0], indices[1]): track_idx = track_indices[row] detection_idx = detection_indices[col] if cost_matrix[row, col] > max_distance: unmatched_tracks.append(track_idx) unmatched_detections.append(detection_idx) else: matches.append((track_idx, detection_idx)) return matches, unmatched_tracks, unmatched_detections def matching_cascade(distance_metric, max_distance, cascade_depth, tracks, detections, track_indices=None, detection_indices=None): """ Run matching cascade. Args: distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray The distance metric is given a list of tracks and detections as well as a list of N track indices and M detection indices. The metric should return the NxM dimensional cost matrix, where element (i, j) is the association cost between the i-th track in the given track indices and the j-th detection in the given detection_indices. max_distance (float): Gating threshold. Associations with cost larger than this value are disregarded. cascade_depth (int): The cascade depth, should be set to the maximum track age. tracks (list[Track]): A list of predicted tracks at the current time step. detections (list[Detection]): A list of detections at the current time step. track_indices (list[int]): List of track indices that maps rows in `cost_matrix` to tracks in `tracks`. detection_indices (List[int]): List of detection indices that maps columns in `cost_matrix` to detections in `detections`. Returns: A tuple (List[(int, int)], List[int], List[int]) with the following three entries: * A list of matched track and detection indices. * A list of unmatched track indices. * A list of unmatched detection indices. """ if track_indices is None: track_indices = list(range(len(tracks))) if detection_indices is None: detection_indices = list(range(len(detections))) unmatched_detections = detection_indices matches = [] for level in range(cascade_depth): if len(unmatched_detections) == 0: # No detections left break track_indices_l = [ k for k in track_indices if tracks[k].time_since_update == 1 + level ] if len(track_indices_l) == 0: # Nothing to match at this level continue matches_l, _, unmatched_detections = \ min_cost_matching( distance_metric, max_distance, tracks, detections, track_indices_l, unmatched_detections) matches += matches_l unmatched_tracks = list(set(track_indices) - set(k for k, _ in matches)) return matches, unmatched_tracks, unmatched_detections def gate_cost_matrix(kf, cost_matrix, tracks, detections, track_indices, detection_indices, gated_cost=INFTY_COST, only_position=False): """ Invalidate infeasible entries in cost matrix based on the state distributions obtained by Kalman filtering. Args: kf (object): The Kalman filter. cost_matrix (ndarray): The NxM dimensional cost matrix, where N is the number of track indices and M is the number of detection indices, such that entry (i, j) is the association cost between `tracks[track_indices[i]]` and `detections[detection_indices[j]]`.
tracks (list[Track]): A list of predicted tracks at the current time step. detections (list[Detection]): A list of detections at the current time step. track_indices (List[int]): List of track indices that maps rows in `cost_matrix` to tracks in `tracks`. detection_indices (List[int]): List of detection indices that maps columns in `cost_matrix` to detections in `detections`. gated_cost (Optional[float]): Entries in the cost matrix corresponding to infeasible associations are set to this value. Defaults to a very large value. only_position (Optional[bool]): If True, only the x, y position of the state distribution is considered during gating. Default False. """ gating_dim = 2 if only_position else 4 gating_threshold = kalman_filter.chi2inv95[gating_dim] measurements = np.asarray( [detections[i].to_xyah() for i in detection_indices]) for row, track_idx in enumerate(track_indices): track = tracks[track_idx] gating_distance = kf.gating_distance(track.mean, track.covariance, measurements, only_position) cost_matrix[row, gating_distance > gating_threshold] = gated_cost return cost_matrix ================================================ FILE: ppdet/modeling/mot/matching/jde_matching.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
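Before the JDE matching utilities that follow, a minimal hedged sketch of how the DeepSORT appearance metric above is driven (import path taken from this repository's layout and requires its dependencies, e.g. paddle, to be importable; the 128-d features and thresholds are arbitrary illustration values):

```python
import numpy as np
from ppdet.modeling.mot.matching.deepsort_matching import (
    NearestNeighborDistanceMetric)

# Cosine distance, keeping at most 100 historical features per track id.
metric = NearestNeighborDistanceMetric(
    'cosine', matching_threshold=0.4, budget=100)

# Register three observed features for two targets (ids 1 and 2) ...
feats = np.random.rand(3, 128).astype(np.float32)
metric.partial_fit(feats, targets=np.array([1, 1, 2]), active_targets=[1, 2])

# ... then score two new detections against both targets.
cost = metric.distance(
    np.random.rand(2, 128).astype(np.float32), targets=[1, 2])
print(cost.shape)  # (2, 2): one row per target, one column per detection
```

In the full tracker, a cost matrix like this is passed through `matching_cascade` (which calls `min_cost_matching` per age level) and gated by `gate_cost_matrix` before matches are accepted.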
""" This code is based on https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/matching.py """ try: import lap except: print( 'Warning: Unable to use JDE/FairMOT/ByteTrack, please install lap, for example: `pip install lap`, see https://github.com/gatagat/lap' ) pass import scipy import numpy as np from scipy.spatial.distance import cdist from ..motion import kalman_filter import warnings warnings.filterwarnings("ignore") __all__ = [ 'merge_matches', 'linear_assignment', 'bbox_ious', 'iou_distance', 'embedding_distance', 'fuse_motion', ] def merge_matches(m1, m2, shape): O, P, Q = shape m1 = np.asarray(m1) m2 = np.asarray(m2) M1 = scipy.sparse.coo_matrix( (np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P)) M2 = scipy.sparse.coo_matrix( (np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q)) mask = M1 * M2 match = mask.nonzero() match = list(zip(match[0], match[1])) unmatched_O = tuple(set(range(O)) - set([i for i, j in match])) unmatched_Q = tuple(set(range(Q)) - set([j for i, j in match])) return match, unmatched_O, unmatched_Q def linear_assignment(cost_matrix, thresh): try: import lap except Exception as e: raise RuntimeError( 'Unable to use JDE/FairMOT/ByteTrack, please install lap, for example: `pip install lap`, see https://github.com/gatagat/lap' ) if cost_matrix.size == 0: return np.empty( (0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple( range(cost_matrix.shape[1])) matches, unmatched_a, unmatched_b = [], [], [] cost, x, y = lap.lapjv(cost_matrix, extend_cost=True, cost_limit=thresh) for ix, mx in enumerate(x): if mx >= 0: matches.append([ix, mx]) unmatched_a = np.where(x < 0)[0] unmatched_b = np.where(y < 0)[0] matches = np.asarray(matches) return matches, unmatched_a, unmatched_b def bbox_ious(atlbrs, btlbrs): boxes = np.ascontiguousarray(atlbrs, dtype=np.float32) query_boxes = np.ascontiguousarray(btlbrs, dtype=np.float32) N = boxes.shape[0] K = query_boxes.shape[0] ious = np.zeros((N, K), dtype=boxes.dtype) if N * K == 0: return ious for k in range(K): box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + 1) * (query_boxes[k, 3] - query_boxes[k, 1] + 1)) for n in range(N): iw = (min(boxes[n, 2], query_boxes[k, 2]) - max( boxes[n, 0], query_boxes[k, 0]) + 1) if iw > 0: ih = (min(boxes[n, 3], query_boxes[k, 3]) - max( boxes[n, 1], query_boxes[k, 1]) + 1) if ih > 0: ua = float((boxes[n, 2] - boxes[n, 0] + 1) * (boxes[ n, 3] - boxes[n, 1] + 1) + box_area - iw * ih) ious[n, k] = iw * ih / ua return ious def iou_distance(atracks, btracks): """ Compute cost based on IoU between two list[STrack]. """ if (len(atracks) > 0 and isinstance(atracks[0], np.ndarray)) or ( len(btracks) > 0 and isinstance(btracks[0], np.ndarray)): atlbrs = atracks btlbrs = btracks else: atlbrs = [track.tlbr for track in atracks] btlbrs = [track.tlbr for track in btracks] _ious = bbox_ious(atlbrs, btlbrs) cost_matrix = 1 - _ious return cost_matrix def embedding_distance(tracks, detections, metric='euclidean'): """ Compute cost based on features between two list[STrack]. 
""" cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float32) if cost_matrix.size == 0: return cost_matrix det_features = np.asarray( [track.curr_feat for track in detections], dtype=np.float32) track_features = np.asarray( [track.smooth_feat for track in tracks], dtype=np.float32) cost_matrix = np.maximum(0.0, cdist(track_features, det_features, metric)) # Nomalized features return cost_matrix def fuse_motion(kf, cost_matrix, tracks, detections, only_position=False, lambda_=0.98): if cost_matrix.size == 0: return cost_matrix gating_dim = 2 if only_position else 4 gating_threshold = kalman_filter.chi2inv95[gating_dim] measurements = np.asarray([det.to_xyah() for det in detections]) for row, track in enumerate(tracks): gating_distance = kf.gating_distance( track.mean, track.covariance, measurements, only_position, metric='maha') cost_matrix[row, gating_distance > gating_threshold] = np.inf cost_matrix[row] = lambda_ * cost_matrix[row] + (1 - lambda_ ) * gating_distance return cost_matrix ================================================ FILE: ppdet/modeling/mot/matching/ocsort_matching.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/noahcao/OC_SORT/blob/master/trackers/ocsort_tracker/association.py """ import os import numpy as np def iou_batch(bboxes1, bboxes2): bboxes2 = np.expand_dims(bboxes2, 0) bboxes1 = np.expand_dims(bboxes1, 1) xx1 = np.maximum(bboxes1[..., 0], bboxes2[..., 0]) yy1 = np.maximum(bboxes1[..., 1], bboxes2[..., 1]) xx2 = np.minimum(bboxes1[..., 2], bboxes2[..., 2]) yy2 = np.minimum(bboxes1[..., 3], bboxes2[..., 3]) w = np.maximum(0., xx2 - xx1) h = np.maximum(0., yy2 - yy1) area = w * h iou_matrix = area / ((bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1]) + (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1]) - area) return iou_matrix def speed_direction_batch(dets, tracks): tracks = tracks[..., np.newaxis] CX1, CY1 = (dets[:, 0] + dets[:, 2]) / 2.0, (dets[:, 1] + dets[:, 3]) / 2.0 CX2, CY2 = (tracks[:, 0] + tracks[:, 2]) / 2.0, ( tracks[:, 1] + tracks[:, 3]) / 2.0 dx = CX1 - CX2 dy = CY1 - CY2 norm = np.sqrt(dx**2 + dy**2) + 1e-6 dx = dx / norm dy = dy / norm return dy, dx def linear_assignment(cost_matrix): try: import lap _, x, y = lap.lapjv(cost_matrix, extend_cost=True) return np.array([[y[i], i] for i in x if i >= 0]) except ImportError: from scipy.optimize import linear_sum_assignment x, y = linear_sum_assignment(cost_matrix) return np.array(list(zip(x, y))) def associate(detections, trackers, iou_threshold, velocities, previous_obs, vdc_weight): if (len(trackers) == 0): return np.empty( (0, 2), dtype=int), np.arange(len(detections)), np.empty( (0, 5), dtype=int) Y, X = speed_direction_batch(detections, previous_obs) inertia_Y, inertia_X = velocities[:, 0], velocities[:, 1] inertia_Y = np.repeat(inertia_Y[:, np.newaxis], Y.shape[1], axis=1) inertia_X = 
np.repeat(inertia_X[:, np.newaxis], X.shape[1], axis=1) diff_angle_cos = inertia_X * X + inertia_Y * Y diff_angle_cos = np.clip(diff_angle_cos, a_min=-1, a_max=1) diff_angle = np.arccos(diff_angle_cos) diff_angle = (np.pi / 2.0 - np.abs(diff_angle)) / np.pi valid_mask = np.ones(previous_obs.shape[0]) valid_mask[np.where(previous_obs[:, 4] < 0)] = 0 iou_matrix = iou_batch(detections, trackers) scores = np.repeat( detections[:, -1][:, np.newaxis], trackers.shape[0], axis=1) # iou_matrix = iou_matrix * scores # a trick sometimes works, we don't encourage this valid_mask = np.repeat(valid_mask[:, np.newaxis], X.shape[1], axis=1) angle_diff_cost = (valid_mask * diff_angle) * vdc_weight angle_diff_cost = angle_diff_cost.T angle_diff_cost = angle_diff_cost * scores if min(iou_matrix.shape) > 0: a = (iou_matrix > iou_threshold).astype(np.int32) if a.sum(1).max() == 1 and a.sum(0).max() == 1: matched_indices = np.stack(np.where(a), axis=1) else: matched_indices = linear_assignment(-(iou_matrix + angle_diff_cost)) else: matched_indices = np.empty(shape=(0, 2)) unmatched_detections = [] for d, det in enumerate(detections): if (d not in matched_indices[:, 0]): unmatched_detections.append(d) unmatched_trackers = [] for t, trk in enumerate(trackers): if (t not in matched_indices[:, 1]): unmatched_trackers.append(t) # filter out matched with low IOU matches = [] for m in matched_indices: if (iou_matrix[m[0], m[1]] < iou_threshold): unmatched_detections.append(m[0]) unmatched_trackers.append(m[1]) else: matches.append(m.reshape(1, 2)) if (len(matches) == 0): matches = np.empty((0, 2), dtype=int) else: matches = np.concatenate(matches, axis=0) return matches, np.array(unmatched_detections), np.array(unmatched_trackers) def associate_only_iou(detections, trackers, iou_threshold): if (len(trackers) == 0): return np.empty( (0, 2), dtype=int), np.arange(len(detections)), np.empty( (0, 5), dtype=int) iou_matrix = iou_batch(detections, trackers) if min(iou_matrix.shape) > 0: a = (iou_matrix > iou_threshold).astype(np.int32) if a.sum(1).max() == 1 and a.sum(0).max() == 1: matched_indices = np.stack(np.where(a), axis=1) else: matched_indices = linear_assignment(-iou_matrix) else: matched_indices = np.empty(shape=(0, 2)) unmatched_detections = [] for d, det in enumerate(detections): if (d not in matched_indices[:, 0]): unmatched_detections.append(d) unmatched_trackers = [] for t, trk in enumerate(trackers): if (t not in matched_indices[:, 1]): unmatched_trackers.append(t) # filter out matched with low IOU matches = [] for m in matched_indices: if (iou_matrix[m[0], m[1]] < iou_threshold): unmatched_detections.append(m[0]) unmatched_trackers.append(m[1]) else: matches.append(m.reshape(1, 2)) if (len(matches) == 0): matches = np.empty((0, 2), dtype=int) else: matches = np.concatenate(matches, axis=0) return matches, np.array(unmatched_detections), np.array(unmatched_trackers) ================================================ FILE: ppdet/modeling/mot/motion/__init__.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
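As a quick numeric check of the IoU-only association path above, a small hedged example (hand-picked boxes; import path assumed from this repository's layout):

```python
import numpy as np
from ppdet.modeling.mot.matching.ocsort_matching import (
    iou_batch, associate_only_iou)

dets = np.array([[0., 0., 10., 10.], [20., 20., 30., 30.]])  # (x1, y1, x2, y2)
trks = np.array([[1., 1., 11., 11.], [50., 50., 60., 60.]])

print(iou_batch(dets, trks))  # only det 0 / trk 0 overlap (IoU ~ 0.68)

matches, unmatched_dets, unmatched_trks = associate_only_iou(
    dets, trks, iou_threshold=0.3)
print(matches)                # [[0 0]]; det 1 and trk 1 remain unmatched
```

The full `associate` additionally adds the velocity-direction-consistency term (`angle_diff_cost`, weighted by `vdc_weight`) to the IoU matrix before the assignment is solved.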
# See the License for the specific language governing permissions and # limitations under the License. from . import kalman_filter from .kalman_filter import * from .gmc import * ================================================ FILE: ppdet/modeling/mot/motion/gmc.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/WWangYuHsiang/SMILEtrack/blob/main/BoT-SORT/tracker/gmc.py """ import cv2 import matplotlib.pyplot as plt import numpy as np import copy import time from ppdet.core.workspace import register, serializable @register @serializable class GMC: def __init__(self, method='sparseOptFlow', downscale=2, verbose=None): super(GMC, self).__init__() self.method = method self.downscale = max(1, int(downscale)) if self.method == 'orb': self.detector = cv2.FastFeatureDetector_create(20) self.extractor = cv2.ORB_create() self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING) elif self.method == 'sift': self.detector = cv2.SIFT_create( nOctaveLayers=3, contrastThreshold=0.02, edgeThreshold=20) self.extractor = cv2.SIFT_create( nOctaveLayers=3, contrastThreshold=0.02, edgeThreshold=20) self.matcher = cv2.BFMatcher(cv2.NORM_L2) elif self.method == 'ecc': number_of_iterations = 5000 termination_eps = 1e-6 self.warp_mode = cv2.MOTION_EUCLIDEAN self.criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, number_of_iterations, termination_eps) elif self.method == 'sparseOptFlow': self.feature_params = dict( maxCorners=1000, qualityLevel=0.01, minDistance=1, blockSize=3, useHarrisDetector=False, k=0.04) # self.gmc_file = open('GMC_results.txt', 'w') elif self.method == 'file' or self.method == 'files': seqName = verbose[0] ablation = verbose[1] if ablation: filePath = r'tracker/GMC_files/MOT17_ablation' else: filePath = r'tracker/GMC_files/MOTChallenge' if '-FRCNN' in seqName: seqName = seqName[:-6] elif '-DPM' in seqName: seqName = seqName[:-4] elif '-SDP' in seqName: seqName = seqName[:-4] self.gmcFile = open(filePath + "/GMC-" + seqName + ".txt", 'r') if self.gmcFile is None: raise ValueError("Error: Unable to open GMC file in directory:" + filePath) elif self.method == 'none' or self.method == 'None': self.method = 'none' else: raise ValueError("Error: Unknown CMC method:" + method) self.prevFrame = None self.prevKeyPoints = None self.prevDescriptors = None self.initializedFirstFrame = False def apply(self, raw_frame, detections=None): if self.method == 'orb' or self.method == 'sift': return self.applyFeaures(raw_frame, detections) elif self.method == 'ecc': return self.applyEcc(raw_frame, detections) elif self.method == 'sparseOptFlow': return self.applySparseOptFlow(raw_frame, detections) elif self.method == 'file': return self.applyFile(raw_frame, detections) elif self.method == 'none': return np.eye(2, 3) else: return np.eye(2, 3) def applyEcc(self, raw_frame, detections=None): # Initialize height, width, _ = raw_frame.shape frame = cv2.cvtColor(raw_frame, 
cv2.COLOR_BGR2GRAY) H = np.eye(2, 3, dtype=np.float32) # Downscale image (TODO: consider using pyramids) if self.downscale > 1.0: frame = cv2.GaussianBlur(frame, (3, 3), 1.5) frame = cv2.resize(frame, (width // self.downscale, height // self.downscale)) width = width // self.downscale height = height // self.downscale # Handle first frame if not self.initializedFirstFrame: # Initialize data self.prevFrame = frame.copy() # Initialization done self.initializedFirstFrame = True return H # Run the ECC algorithm. The results are stored in warp_matrix. # (cc, H) = cv2.findTransformECC(self.prevFrame, frame, H, self.warp_mode, self.criteria) try: (cc, H) = cv2.findTransformECC(self.prevFrame, frame, H, self.warp_mode, self.criteria, None, 1) except: print('Warning: find transform failed. Set warp as identity') return H def applyFeaures(self, raw_frame, detections=None): # Initialize height, width, _ = raw_frame.shape frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY) H = np.eye(2, 3) # Downscale image (TODO: consider using pyramids) if self.downscale > 1.0: # frame = cv2.GaussianBlur(frame, (3, 3), 1.5) frame = cv2.resize(frame, (width // self.downscale, height // self.downscale)) width = width // self.downscale height = height // self.downscale # find the keypoints mask = np.zeros_like(frame) # mask[int(0.05 * height): int(0.95 * height), int(0.05 * width): int(0.95 * width)] = 255 mask[int(0.02 * height):int(0.98 * height), int(0.02 * width):int( 0.98 * width)] = 255 if detections is not None: for det in detections: tlbr = (det[:4] / self.downscale).astype(np.int_) mask[tlbr[1]:tlbr[3], tlbr[0]:tlbr[2]] = 0 keypoints = self.detector.detect(frame, mask) # compute the descriptors keypoints, descriptors = self.extractor.compute(frame, keypoints) # Handle first frame if not self.initializedFirstFrame: # Initialize data self.prevFrame = frame.copy() self.prevKeyPoints = copy.copy(keypoints) self.prevDescriptors = copy.copy(descriptors) # Initialization done self.initializedFirstFrame = True return H # Match descriptors. 
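# --- Added note (commentary, not in the original BoT-SORT/SMILEtrack code):
# knnMatch below returns the two nearest current-frame descriptors for each
# previous-frame descriptor, and Lowe's ratio test
# `m.distance < 0.9 * n.distance` keeps a match only when the best neighbour
# is clearly better than the second best, e.g.:
#     best, second = 21.0, 40.0
#     keep = best < 0.9 * second   # True: a distinctive match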
knnMatches = self.matcher.knnMatch(self.prevDescriptors, descriptors, 2) # Filter matches based on smallest spatial distance matches = [] spatialDistances = [] maxSpatialDistance = 0.25 * np.array([width, height]) # Handle empty matches case if len(knnMatches) == 0: # Store to next iteration self.prevFrame = frame.copy() self.prevKeyPoints = copy.copy(keypoints) self.prevDescriptors = copy.copy(descriptors) return H for m, n in knnMatches: if m.distance < 0.9 * n.distance: prevKeyPointLocation = self.prevKeyPoints[m.queryIdx].pt currKeyPointLocation = keypoints[m.trainIdx].pt spatialDistance = ( prevKeyPointLocation[0] - currKeyPointLocation[0], prevKeyPointLocation[1] - currKeyPointLocation[1]) if (np.abs(spatialDistance[0]) < maxSpatialDistance[0]) and \ (np.abs(spatialDistance[1]) < maxSpatialDistance[1]): spatialDistances.append(spatialDistance) matches.append(m) meanSpatialDistances = np.mean(spatialDistances, 0) stdSpatialDistances = np.std(spatialDistances, 0) inliers = (spatialDistances - meanSpatialDistances ) < 2.5 * stdSpatialDistances goodMatches = [] prevPoints = [] currPoints = [] for i in range(len(matches)): if inliers[i, 0] and inliers[i, 1]: goodMatches.append(matches[i]) prevPoints.append(self.prevKeyPoints[matches[i].queryIdx].pt) currPoints.append(keypoints[matches[i].trainIdx].pt) prevPoints = np.array(prevPoints) currPoints = np.array(currPoints) # Draw the keypoint matches on the output image if 0: matches_img = np.hstack((self.prevFrame, frame)) matches_img = cv2.cvtColor(matches_img, cv2.COLOR_GRAY2BGR) W = np.size(self.prevFrame, 1) for m in goodMatches: prev_pt = np.array( self.prevKeyPoints[m.queryIdx].pt, dtype=np.int_) curr_pt = np.array(keypoints[m.trainIdx].pt, dtype=np.int_) curr_pt[0] += W color = np.random.randint(0, 255, (3, )) color = (int(color[0]), int(color[1]), int(color[2])) matches_img = cv2.line(matches_img, prev_pt, curr_pt, tuple(color), 1, cv2.LINE_AA) matches_img = cv2.circle(matches_img, prev_pt, 2, tuple(color), -1) matches_img = cv2.circle(matches_img, curr_pt, 2, tuple(color), -1) plt.figure() plt.imshow(matches_img) plt.show() # Find rigid matrix if (np.size(prevPoints, 0) > 4) and ( np.size(prevPoints, 0) == np.size(currPoints, 0)): H, inliers = cv2.estimateAffinePartial2D(prevPoints, currPoints, cv2.RANSAC) # Handle downscale if self.downscale > 1.0: H[0, 2] *= self.downscale H[1, 2] *= self.downscale else: print('Warning: not enough matching points') # Store to next iteration self.prevFrame = frame.copy() self.prevKeyPoints = copy.copy(keypoints) self.prevDescriptors = copy.copy(descriptors) return H def applySparseOptFlow(self, raw_frame, detections=None): t0 = time.time() # Initialize height, width, _ = raw_frame.shape frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY) H = np.eye(2, 3) # Downscale image if self.downscale > 1.0: # frame = cv2.GaussianBlur(frame, (3, 3), 1.5) frame = cv2.resize(frame, (width // self.downscale, height // self.downscale)) # find the keypoints keypoints = cv2.goodFeaturesToTrack( frame, mask=None, **self.feature_params) # Handle first frame if not self.initializedFirstFrame: # Initialize data self.prevFrame = frame.copy() self.prevKeyPoints = copy.copy(keypoints) # Initialization done self.initializedFirstFrame = True return H if self.prevFrame.shape != frame.shape: self.prevFrame = frame.copy() self.prevKeyPoints = copy.copy(keypoints) return H # find correspondences matchedKeypoints, status, err = cv2.calcOpticalFlowPyrLK( self.prevFrame, frame, self.prevKeyPoints, None) # leave good
correspondences only prevPoints = [] currPoints = [] for i in range(len(status)): if status[i]: prevPoints.append(self.prevKeyPoints[i]) currPoints.append(matchedKeypoints[i]) prevPoints = np.array(prevPoints) currPoints = np.array(currPoints) # Find rigid matrix if (np.size(prevPoints, 0) > 4) and ( np.size(prevPoints, 0) == np.size(currPoints, 0)): H, inliers = cv2.estimateAffinePartial2D(prevPoints, currPoints, cv2.RANSAC) # Handle downscale if self.downscale > 1.0: H[0, 2] *= self.downscale H[1, 2] *= self.downscale else: print('Warning: not enough matching points') # Store to next iteration self.prevFrame = frame.copy() self.prevKeyPoints = copy.copy(keypoints) t1 = time.time() # gmc_line = str(1000 * (t1 - t0)) + "\t" + str(H[0, 0]) + "\t" + str(H[0, 1]) + "\t" + str( # H[0, 2]) + "\t" + str(H[1, 0]) + "\t" + str(H[1, 1]) + "\t" + str(H[1, 2]) + "\n" # self.gmc_file.write(gmc_line) return H def applyFile(self, raw_frame, detections=None): line = self.gmcFile.readline() tokens = line.split("\t") H = np.eye(2, 3, dtype=np.float_) H[0, 0] = float(tokens[1]) H[0, 1] = float(tokens[2]) H[0, 2] = float(tokens[3]) H[1, 0] = float(tokens[4]) H[1, 1] = float(tokens[5]) H[1, 2] = float(tokens[6]) return H ================================================ FILE: ppdet/modeling/mot/motion/kalman_filter.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/nwojke/deep_sort/blob/master/deep_sort/kalman_filter.py """ import numpy as np import scipy.linalg use_numba = True try: import numba as nb @nb.njit(fastmath=True, cache=True) def nb_project(mean, covariance, std, _update_mat): innovation_cov = np.diag(np.square(std)) mean = np.dot(_update_mat, mean) covariance = np.dot(np.dot(_update_mat, covariance), _update_mat.T) return mean, covariance + innovation_cov @nb.njit(fastmath=True, cache=True) def nb_multi_predict(mean, covariance, motion_cov, motion_mat): mean = np.dot(mean, motion_mat.T) left = np.dot(motion_mat, covariance) covariance = np.dot(left, motion_mat.T) + motion_cov return mean, covariance @nb.njit(fastmath=True, cache=True) def nb_update(mean, covariance, proj_mean, proj_cov, measurement, meas_mat): kalman_gain = np.linalg.solve(proj_cov, (covariance @meas_mat.T).T).T innovation = measurement - proj_mean mean = mean + innovation @kalman_gain.T covariance = covariance - kalman_gain @proj_cov @kalman_gain.T return mean, covariance except: use_numba = False print( 'Warning: Unable to use numba in PP-Tracking, please install numba, for example(python3.7): `pip install numba==0.56.4`' ) pass __all__ = ['KalmanFilter'] """ Table for the 0.95 quantile of the chi-square distribution with N degrees of freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv function and used as Mahalanobis gating threshold.
""" chi2inv95 = { 1: 3.8415, 2: 5.9915, 3: 7.8147, 4: 9.4877, 5: 11.070, 6: 12.592, 7: 14.067, 8: 15.507, 9: 16.919 } class KalmanFilter(object): """ A simple Kalman filter for tracking bounding boxes in image space. The 8-dimensional state space x, y, a, h, vx, vy, va, vh contains the bounding box center position (x, y), aspect ratio a, height h, and their respective velocities. Object motion follows a constant velocity model. The bounding box location (x, y, a, h) is taken as direct observation of the state space (linear observation model). """ def __init__(self): ndim, dt = 4, 1. # Create Kalman filter model matrices. self._motion_mat = np.eye(2 * ndim, 2 * ndim, dtype=np.float32) for i in range(ndim): self._motion_mat[i, ndim + i] = dt self._update_mat = np.eye(ndim, 2 * ndim, dtype=np.float32) # Motion and observation uncertainty are chosen relative to the current # state estimate. These weights control the amount of uncertainty in # the model. This is a bit hacky. self._std_weight_position = 1. / 20 self._std_weight_velocity = 1. / 160 def initiate(self, measurement): """ Create track from unassociated measurement. Args: measurement (ndarray): Bounding box coordinates (x, y, a, h) with center position (x, y), aspect ratio a, and height h. Returns: The mean vector (8 dimensional) and covariance matrix (8x8 dimensional) of the new track. Unobserved velocities are initialized to 0 mean. """ mean_pos = measurement mean_vel = np.zeros_like(mean_pos) mean = np.r_[mean_pos, mean_vel] std = [ 2 * self._std_weight_position * measurement[3], 2 * self._std_weight_position * measurement[3], 1e-2, 2 * self._std_weight_position * measurement[3], 10 * self._std_weight_velocity * measurement[3], 10 * self._std_weight_velocity * measurement[3], 1e-5, 10 * self._std_weight_velocity * measurement[3] ] covariance = np.diag(np.square(std)) return mean, np.float32(covariance) def predict(self, mean, covariance): """ Run Kalman filter prediction step. Args: mean (ndarray): The 8 dimensional mean vector of the object state at the previous time step. covariance (ndarray): The 8x8 dimensional covariance matrix of the object state at the previous time step. Returns: The mean vector and covariance matrix of the predicted state. Unobserved velocities are initialized to 0 mean. """ std_pos = [ self._std_weight_position * mean[3], self._std_weight_position * mean[3], 1e-2, self._std_weight_position * mean[3] ] std_vel = [ self._std_weight_velocity * mean[3], self._std_weight_velocity * mean[3], 1e-5, self._std_weight_velocity * mean[3] ] motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) #mean = np.dot(self._motion_mat, mean) mean = np.dot(mean, self._motion_mat.T) covariance = np.linalg.multi_dot( (self._motion_mat, covariance, self._motion_mat.T)) + motion_cov return mean, covariance def project(self, mean, covariance): """ Project state distribution to measurement space. Args mean (ndarray): The state's mean vector (8 dimensional array). covariance (ndarray): The state's covariance matrix (8x8 dimensional). Returns: The projected mean and covariance matrix of the given state estimate. 
""" std = np.array( [ self._std_weight_position * mean[3], self._std_weight_position * mean[3], 1e-1, self._std_weight_position * mean[3] ], dtype=np.float32) if use_numba: return nb_project(mean, covariance, std, self._update_mat) innovation_cov = np.diag(np.square(std)) mean = np.dot(self._update_mat, mean) covariance = np.linalg.multi_dot((self._update_mat, covariance, self._update_mat.T)) return mean, covariance + innovation_cov def multi_predict(self, mean, covariance): """ Run Kalman filter prediction step (Vectorized version). Args: mean (ndarray): The Nx8 dimensional mean matrix of the object states at the previous time step. covariance (ndarray): The Nx8x8 dimensional covariance matrics of the object states at the previous time step. Returns: The mean vector and covariance matrix of the predicted state. Unobserved velocities are initialized to 0 mean. """ std_pos = np.array([ self._std_weight_position * mean[:, 3], self._std_weight_position * mean[:, 3], 1e-2 * np.ones_like(mean[:, 3]), self._std_weight_position * mean[:, 3] ]) std_vel = np.array([ self._std_weight_velocity * mean[:, 3], self._std_weight_velocity * mean[:, 3], 1e-5 * np.ones_like(mean[:, 3]), self._std_weight_velocity * mean[:, 3] ]) sqr = np.square(np.r_[std_pos, std_vel]).T if use_numba: means = [] covariances = [] for i in range(len(mean)): a, b = nb_multi_predict(mean[i], covariance[i], np.diag(sqr[i]), self._motion_mat) means.append(a) covariances.append(b) return np.asarray(means), np.asarray(covariances) motion_cov = [] for i in range(len(mean)): motion_cov.append(np.diag(sqr[i])) motion_cov = np.asarray(motion_cov) mean = np.dot(mean, self._motion_mat.T) left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2)) covariance = np.dot(left, self._motion_mat.T) + motion_cov return mean, covariance def update(self, mean, covariance, measurement): """ Run Kalman filter correction step. Args: mean (ndarray): The predicted state's mean vector (8 dimensional). covariance (ndarray): The state's covariance matrix (8x8 dimensional). measurement (ndarray): The 4 dimensional measurement vector (x, y, a, h), where (x, y) is the center position, a the aspect ratio, and h the height of the bounding box. Returns: The measurement-corrected state distribution. """ projected_mean, projected_cov = self.project(mean, covariance) if use_numba: return nb_update(mean, covariance, projected_mean, projected_cov, measurement, self._update_mat) kalman_gain = np.linalg.solve(projected_cov, (covariance @self._update_mat.T).T).T innovation = measurement - projected_mean mean = mean + innovation @kalman_gain.T covariance = covariance - kalman_gain @projected_cov @kalman_gain.T return mean, covariance def gating_distance(self, mean, covariance, measurements, only_position=False, metric='maha'): """ Compute gating distance between state distribution and measurements. A suitable distance threshold can be obtained from `chi2inv95`. If `only_position` is False, the chi-square distribution has 4 degrees of freedom, otherwise 2. Args: mean (ndarray): Mean vector over the state distribution (8 dimensional). covariance (ndarray): Covariance of the state distribution (8x8 dimensional). measurements (ndarray): An Nx4 dimensional matrix of N measurements, each in format (x, y, a, h) where (x, y) is the bounding box center position, a the aspect ratio, and h the height. only_position (Optional[bool]): If True, distance computation is done with respect to the bounding box center position only. metric (str): Metric type, 'gaussian' or 'maha'. 
Returns: An array of length N, where the i-th element contains the squared Mahalanobis distance between (mean, covariance) and `measurements[i]`. """ mean, covariance = self.project(mean, covariance) if only_position: mean, covariance = mean[:2], covariance[:2, :2] measurements = measurements[:, :2] d = measurements - mean if metric == 'gaussian': return np.sum(d * d, axis=1) elif metric == 'maha': cholesky_factor = np.linalg.cholesky(covariance) z = scipy.linalg.solve_triangular( cholesky_factor, d.T, lower=True, check_finite=False, overwrite_b=True) squared_maha = np.sum(z * z, axis=0) return squared_maha else: raise ValueError('invalid distance metric') ================================================ FILE: ppdet/modeling/mot/motion/ocsort_kalman_filter.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/danbochman/SORT/blob/danny_opencv/kalman_filter.py """ import numpy as np from numpy import dot, zeros, eye from numpy.linalg import inv use_numba = True try: import numba as nb @nb.njit(fastmath=True, cache=True) def nb_predict(x, F, P, Q): x = dot(F, x) P = dot(dot(F, P), F.T) + Q return x, P @nb.njit(fastmath=True, cache=True) def nb_update(x, z, H, P, R, _I): y = z - np.dot(H, x) PHT = dot(P, H.T) S = dot(H, PHT) + R K = dot(PHT, inv(S)) x = x + dot(K, y) I_KH = _I - dot(K, H) P = dot(dot(I_KH, P), I_KH.T) + dot(dot(K, R), K.T) return x, P except: use_numba = False print( 'Warning: Unable to use numba in PP-Tracking, please install numba, for example(python3.7): `pip install numba==0.56.4`' ) pass class OCSORTKalmanFilter: def __init__(self, dim_x, dim_z): self.dim_x = dim_x self.dim_z = dim_z self.x = zeros((dim_x, 1)) self.P = eye(dim_x) self.Q = eye(dim_x) self.F = eye(dim_x) self.H = zeros((dim_z, dim_x)) self.R = eye(dim_z) self.M = zeros((dim_z, dim_z)) self._I = eye(dim_x) def predict(self): if use_numba: self.x, self.P = nb_predict(self.x, self.F, self.P, self.Q) else: self.x = dot(self.F, self.x) self.P = dot(dot(self.F, self.P), self.F.T) + self.Q def update(self, z): if z is None: return if use_numba: self.x, self.P = nb_update(self.x, z, self.H, self.P, self.R, self._I) else: y = z - np.dot(self.H, self.x) PHT = dot(self.P, self.H.T) S = dot(self.H, PHT) + self.R K = dot(PHT, inv(S)) self.x = self.x + dot(K, y) I_KH = self._I - dot(K, self.H) self.P = dot(dot(I_KH, self.P), I_KH.T) + dot(dot(K, self.R), K.T) ================================================ FILE: ppdet/modeling/mot/tracker/__init__.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
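To make the filter API above concrete, a minimal hedged usage sketch of one predict/update cycle (import path assumed from this repository's layout; the box numbers are invented):

```python
import numpy as np
from ppdet.modeling.mot.motion.kalman_filter import KalmanFilter, chi2inv95

kf = KalmanFilter()
# A detection in (center x, center y, aspect ratio, height) form.
mean, cov = kf.initiate(np.array([50., 40., 0.5, 80.], dtype=np.float32))
mean, cov = kf.predict(mean, cov)  # constant-velocity prior for the next frame
mean, cov = kf.update(mean, cov,
                      np.array([52., 41., 0.5, 81.], dtype=np.float32))

# Gate a candidate measurement with the 95% chi-square threshold (4 DoF).
d2 = kf.gating_distance(mean, cov, np.array([[53., 42., 0.5, 80.]]))
print(d2 < chi2inv95[4])  # True when the candidate lies inside the gate
```

The numba-accelerated `nb_*` helpers are used transparently when numba is installed; the pure-NumPy fallback gives the same results.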
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import base_jde_tracker from . import base_sde_tracker from .base_jde_tracker import * from .base_sde_tracker import * from . import jde_tracker from . import deepsort_tracker from . import ocsort_tracker from . import center_tracker from .jde_tracker import * from .deepsort_tracker import * from .ocsort_tracker import * from .botsort_tracker import * from .center_tracker import * ================================================ FILE: ppdet/modeling/mot/tracker/base_jde_tracker.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/multitracker.py """ import numpy as np from collections import defaultdict from collections import deque, OrderedDict from ..matching import jde_matching as matching from ppdet.core.workspace import register, serializable import warnings warnings.filterwarnings("ignore") __all__ = [ 'TrackState', 'BaseTrack', 'STrack', 'joint_stracks', 'sub_stracks', 'remove_duplicate_stracks', ] class TrackState(object): New = 0 Tracked = 1 Lost = 2 Removed = 3 @register @serializable class BaseTrack(object): _count_dict = defaultdict(int) # support single class and multi classes track_id = 0 is_activated = False state = TrackState.New history = OrderedDict() features = [] curr_feat = None score = 0 start_frame = 0 frame_id = 0 time_since_update = 0 # multi-camera location = (np.inf, np.inf) @property def end_frame(self): return self.frame_id @staticmethod def next_id(cls_id): BaseTrack._count_dict[cls_id] += 1 return BaseTrack._count_dict[cls_id] # @even: reset track id @staticmethod def init_count(num_classes): """ Initiate _count for all object classes :param num_classes: """ for cls_id in range(num_classes): BaseTrack._count_dict[cls_id] = 0 @staticmethod def reset_track_count(cls_id): BaseTrack._count_dict[cls_id] = 0 def activate(self, *args): raise NotImplementedError def predict(self): raise NotImplementedError def update(self, *args, **kwargs): raise NotImplementedError def mark_lost(self): self.state = TrackState.Lost def mark_removed(self): self.state = TrackState.Removed @register @serializable class STrack(BaseTrack): def __init__(self, tlwh, score, cls_id, buff_size=30, temp_feat=None): # wait activate self._tlwh = np.asarray(tlwh, dtype=np.float32) self.score = score self.cls_id = cls_id self.track_len = 0 self.kalman_filter = None self.mean, self.covariance = None, None self.is_activated = False self.use_reid = True if temp_feat is not None else False if self.use_reid: 
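# --- Added note (commentary, not in the original Towards-Realtime-MOT code):
# when ReID features are used, `update_features` below maintains an
# exponential moving average of the embedding,
#     smooth_feat = 0.9 * smooth_feat + 0.1 * feat,
# re-normalized to unit L2 norm, so a track's appearance model drifts slowly
# instead of jumping to each new detection's feature.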
self.smooth_feat = None self.update_features(temp_feat) self.features = deque([], maxlen=buff_size) self.alpha = 0.9 def update_features(self, feat): # L2 normalizing, this function has no use for BYTETracker feat /= np.linalg.norm(feat) self.curr_feat = feat if self.smooth_feat is None: self.smooth_feat = feat else: self.smooth_feat = self.alpha * self.smooth_feat + (1.0 - self.alpha ) * feat self.features.append(feat) self.smooth_feat /= np.linalg.norm(self.smooth_feat) def predict(self): mean_state = self.mean.copy() if self.state != TrackState.Tracked: mean_state[7] = 0 self.mean, self.covariance = self.kalman_filter.predict(mean_state, self.covariance) @staticmethod def multi_predict(tracks, kalman_filter): if len(tracks) > 0: multi_mean = np.asarray([track.mean.copy() for track in tracks]) multi_covariance = np.asarray( [track.covariance for track in tracks]) for i, st in enumerate(tracks): if st.state != TrackState.Tracked: multi_mean[i][7] = 0 multi_mean, multi_covariance = kalman_filter.multi_predict( multi_mean, multi_covariance) for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)): tracks[i].mean = mean tracks[i].covariance = cov @staticmethod def multi_gmc(stracks, H=np.eye(2, 3)): if len(stracks) > 0: multi_mean = np.asarray([st.mean.copy() for st in stracks]) multi_covariance = np.asarray([st.covariance for st in stracks]) R = H[:2, :2] R8x8 = np.kron(np.eye(4, dtype=float), R) t = H[:2, 2] for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)): mean = R8x8.dot(mean) mean[:2] += t cov = R8x8.dot(cov).dot(R8x8.transpose()) stracks[i].mean = mean stracks[i].covariance = cov def reset_track_id(self): self.reset_track_count(self.cls_id) def activate(self, kalman_filter, frame_id): """Start a new track""" self.kalman_filter = kalman_filter # update track id for the object class self.track_id = self.next_id(self.cls_id) self.mean, self.covariance = self.kalman_filter.initiate( self.tlwh_to_xyah(self._tlwh)) self.track_len = 0 self.state = TrackState.Tracked # set flag 'tracked' if frame_id == 1: # to record the first frame's detection result self.is_activated = True self.frame_id = frame_id self.start_frame = frame_id def re_activate(self, new_track, frame_id, new_id=False): self.mean, self.covariance = self.kalman_filter.update( self.mean, self.covariance, self.tlwh_to_xyah(new_track.tlwh)) if self.use_reid: self.update_features(new_track.curr_feat) self.track_len = 0 self.state = TrackState.Tracked self.is_activated = True self.frame_id = frame_id if new_id: # update track id for the object class self.track_id = self.next_id(self.cls_id) def update(self, new_track, frame_id, update_feature=True): self.frame_id = frame_id self.track_len += 1 new_tlwh = new_track.tlwh self.mean, self.covariance = self.kalman_filter.update( self.mean, self.covariance, self.tlwh_to_xyah(new_tlwh)) self.state = TrackState.Tracked # set flag 'tracked' self.is_activated = True # set flag 'activated' self.score = new_track.score if update_feature and self.use_reid: self.update_features(new_track.curr_feat) @property def tlwh(self): """Get current position in bounding box format `(top left x, top left y, width, height)`. """ if self.mean is None: return self._tlwh.copy() ret = self.mean[:4].copy() ret[2] *= ret[3] ret[:2] -= ret[2:] / 2 return ret @property def tlbr(self): """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., `(top left, bottom right)`. 
""" ret = self.tlwh.copy() ret[2:] += ret[:2] return ret @staticmethod def tlwh_to_xyah(tlwh): """Convert bounding box to format `(center x, center y, aspect ratio, height)`, where the aspect ratio is `width / height`. """ ret = np.asarray(tlwh).copy() ret[:2] += ret[2:] / 2 ret[2] /= ret[3] return ret def to_xyah(self): return self.tlwh_to_xyah(self.tlwh) @staticmethod def tlbr_to_tlwh(tlbr): ret = np.asarray(tlbr).copy() ret[2:] -= ret[:2] return ret @staticmethod def tlwh_to_tlbr(tlwh): ret = np.asarray(tlwh).copy() ret[2:] += ret[:2] return ret def __repr__(self): return 'OT_({}-{})_({}-{})'.format(self.cls_id, self.track_id, self.start_frame, self.end_frame) def joint_stracks(tlista, tlistb): exists = {} res = [] for t in tlista: exists[t.track_id] = 1 res.append(t) for t in tlistb: tid = t.track_id if not exists.get(tid, 0): exists[tid] = 1 res.append(t) return res def sub_stracks(tlista, tlistb): stracks = {} for t in tlista: stracks[t.track_id] = t for t in tlistb: tid = t.track_id if stracks.get(tid, 0): del stracks[tid] return list(stracks.values()) def remove_duplicate_stracks(stracksa, stracksb): pdist = matching.iou_distance(stracksa, stracksb) pairs = np.where(pdist < 0.15) dupa, dupb = list(), list() for p, q in zip(*pairs): timep = stracksa[p].frame_id - stracksa[p].start_frame timeq = stracksb[q].frame_id - stracksb[q].start_frame if timep > timeq: dupb.append(q) else: dupa.append(p) resa = [t for i, t in enumerate(stracksa) if not i in dupa] resb = [t for i, t in enumerate(stracksb) if not i in dupb] return resa, resb ================================================ FILE: ppdet/modeling/mot/tracker/base_sde_tracker.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/nwojke/deep_sort/blob/master/deep_sort/track.py """ import datetime from ppdet.core.workspace import register, serializable __all__ = ['TrackState', 'Track'] class TrackState(object): """ Enumeration type for the single target track state. Newly created tracks are classified as `tentative` until enough evidence has been collected. Then, the track state is changed to `confirmed`. Tracks that are no longer alive are classified as `deleted` to mark them for removal from the set of active tracks. """ Tentative = 1 Confirmed = 2 Deleted = 3 @register @serializable class Track(object): """ A single target track with state space `(x, y, a, h)` and associated velocities, where `(x, y)` is the center of the bounding box, `a` is the aspect ratio and `h` is the height. Args: mean (ndarray): Mean vector of the initial state distribution. covariance (ndarray): Covariance matrix of the initial state distribution. track_id (int): A unique track identifier. n_init (int): Number of consecutive detections before the track is confirmed. The track state is set to `Deleted` if a miss occurs within the first `n_init` frames. 
max_age (int): The maximum number of consecutive misses before the track state is set to `Deleted`. cls_id (int): The category id of the tracked box. score (float): The confidence score of the tracked box. feature (Optional[ndarray]): Feature vector of the detection this track originates from. If not None, this feature is added to the `features` cache. Attributes: hits (int): Total number of measurement updates. age (int): Total number of frames since first occurrence. time_since_update (int): Total number of frames since last measurement update. state (TrackState): The current track state. features (List[ndarray]): A cache of features. On each measurement update, the associated feature vector is added to this list. """ def __init__(self, mean, covariance, track_id, n_init, max_age, cls_id, score, feature=None): self.mean = mean self.covariance = covariance self.track_id = track_id self.hits = 1 self.age = 1 self.time_since_update = 0 self.cls_id = cls_id self.score = score self.start_time = datetime.datetime.now() self.state = TrackState.Tentative self.features = [] self.feat = feature if feature is not None: self.features.append(feature) self._n_init = n_init self._max_age = max_age def to_tlwh(self): """Get position in format `(top left x, top left y, width, height)`.""" ret = self.mean[:4].copy() ret[2] *= ret[3] ret[:2] -= ret[2:] / 2 return ret def to_tlbr(self): """Get position in bounding box format `(min x, min y, max x, max y)`.""" ret = self.to_tlwh() ret[2:] = ret[:2] + ret[2:] return ret def predict(self, kalman_filter): """ Propagate the state distribution to the current time step using a Kalman filter prediction step. """ self.mean, self.covariance = kalman_filter.predict(self.mean, self.covariance) self.age += 1 self.time_since_update += 1 def update(self, kalman_filter, detection): """ Perform Kalman filter measurement update step and update the associated detection feature cache. """ self.mean, self.covariance = kalman_filter.update(self.mean, self.covariance, detection.to_xyah()) self.features.append(detection.feature) self.feat = detection.feature self.cls_id = detection.cls_id self.score = detection.score self.hits += 1 self.time_since_update = 0 if self.state == TrackState.Tentative and self.hits >= self._n_init: self.state = TrackState.Confirmed def mark_missed(self): """Mark this track as missed (no association at the current time step). """ if self.state == TrackState.Tentative: self.state = TrackState.Deleted elif self.time_since_update > self._max_age: self.state = TrackState.Deleted def is_tentative(self): """Returns True if this track is tentative (unconfirmed).""" return self.state == TrackState.Tentative def is_confirmed(self): """Returns True if this track is confirmed.""" return self.state == TrackState.Confirmed def is_deleted(self): """Returns True if this track is dead and should be deleted.""" return self.state == TrackState.Deleted ================================================ FILE: ppdet/modeling/mot/tracker/botsort_tracker.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/WWangYuHsiang/SMILEtrack/blob/main/BoT-SORT/tracker/bot_sort.py """ import cv2 import numpy as np from collections import deque from ..matching import jde_matching as matching from ..motion import GMC from .base_jde_tracker import TrackState, STrack from .base_jde_tracker import joint_stracks, sub_stracks, remove_duplicate_stracks from ..motion import KalmanFilter from ppdet.core.workspace import register, serializable @register @serializable class BOTSORTTracker(object): """ BOTSORT tracker, support single class Args: track_high_thresh (float): score threshold for the first (high score) association round track_low_thresh (float): detections below this score are discarded new_track_thresh (float): minimum score required to start a new track match_thresh (float): IoU threshold for association track_buffer (int): number of frames a lost track is kept alive, default 30 min_box_area (float): minimum box area to keep camera_motion (bool): whether to apply camera motion compensation, default False cmc_method (str): camera motion compensation method, default 'sparseOptFlow' frame_rate (int): fps, used as buffer_size = int(frame_rate / 30.0 * track_buffer) """ def __init__(self, track_high_thresh=0.3, track_low_thresh=0.2, new_track_thresh=0.4, match_thresh=0.7, track_buffer=30, min_box_area=0, camera_motion=False, cmc_method='sparseOptFlow', frame_rate=30): self.tracked_stracks = [] # type: list[STrack] self.lost_stracks = [] # type: list[STrack] self.removed_stracks = [] # type: list[STrack] self.frame_id = 0 self.track_high_thresh = track_high_thresh self.track_low_thresh = track_low_thresh self.new_track_thresh = new_track_thresh self.match_thresh = match_thresh self.buffer_size = int(frame_rate / 30.0 * track_buffer) self.max_time_lost = self.buffer_size self.kalman_filter = KalmanFilter() self.min_box_area = min_box_area self.camera_motion = camera_motion self.gmc = GMC(method=cmc_method) def update(self, output_results, img=None): self.frame_id += 1 activated_starcks = [] refind_stracks = [] lost_stracks = [] removed_stracks = [] if len(output_results): bboxes = output_results[:, 2:6] scores = output_results[:, 1] classes = output_results[:, 0] # Remove bad detections lowest_inds = scores > self.track_low_thresh bboxes = bboxes[lowest_inds] scores = scores[lowest_inds] classes = classes[lowest_inds] # Find high threshold detections remain_inds = scores > self.track_high_thresh dets = bboxes[remain_inds] scores_keep = scores[remain_inds] classes_keep = classes[remain_inds] else: bboxes = [] scores = [] classes = [] dets = [] scores_keep = [] classes_keep = [] if len(dets) > 0: '''Detections''' detections = [ STrack(STrack.tlbr_to_tlwh(tlbr), s, c) for (tlbr, s, c) in zip(dets, scores_keep, classes_keep) ] else: detections = [] ''' Add newly detected tracklets to tracked_stracks''' unconfirmed = [] tracked_stracks = [] # type: list[STrack] for track in self.tracked_stracks: if not track.is_activated: unconfirmed.append(track) else: tracked_stracks.append(track) ''' Step 2: First association, with high score detection boxes''' strack_pool = joint_stracks(tracked_stracks, self.lost_stracks) # Predict the current location with KF
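# The Kalman state used by STrack is the 8-dim vector
# [cx, cy, a, h, vcx, vcy, va, vh]; multi_predict zeroes the height velocity
# (index 7) of tracks that are not currently Tracked before propagating them
# one step. When camera_motion is enabled, multi_gmc (base_jde_tracker.py)
# then warps mean and covariance with the 2x2 block R of the estimated 2x3
# affine H via R8x8 = np.kron(np.eye(4), R); e.g. a pure translation
# H = [[1, 0, 5], [0, 1, -3]] leaves R8x8 = I and just shifts (cx, cy) by (5, -3).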
STrack.multi_predict(strack_pool, self.kalman_filter) # Fix camera motion if self.camera_motion: warp = self.gmc.apply(img[0], dets) STrack.multi_gmc(strack_pool, warp) STrack.multi_gmc(unconfirmed, warp) # Associate with high score detection boxes ious_dists = matching.iou_distance(strack_pool, detections) matches, u_track, u_detection = matching.linear_assignment( ious_dists, thresh=self.match_thresh) for itracked, idet in matches: track = strack_pool[itracked] det = detections[idet] if track.state == TrackState.Tracked: track.update(detections[idet], self.frame_id) activated_starcks.append(track) else: track.re_activate(det, self.frame_id, new_id=False) refind_stracks.append(track) ''' Step 3: Second association, with low score detection boxes''' if len(scores): inds_high = scores < self.track_high_thresh inds_low = scores > self.track_low_thresh inds_second = np.logical_and(inds_low, inds_high) dets_second = bboxes[inds_second] scores_second = scores[inds_second] classes_second = classes[inds_second] else: dets_second = [] scores_second = [] classes_second = [] # association the untrack to the low score detections if len(dets_second) > 0: '''Detections''' detections_second = [ STrack(STrack.tlbr_to_tlwh(tlbr), s, c) for (tlbr, s, c) in zip(dets_second, scores_second, classes_second) ] else: detections_second = [] r_tracked_stracks = [ strack_pool[i] for i in u_track if strack_pool[i].state == TrackState.Tracked ] dists = matching.iou_distance(r_tracked_stracks, detections_second) matches, u_track, u_detection_second = matching.linear_assignment( dists, thresh=0.5) for itracked, idet in matches: track = r_tracked_stracks[itracked] det = detections_second[idet] if track.state == TrackState.Tracked: track.update(det, self.frame_id) activated_starcks.append(track) else: track.re_activate(det, self.frame_id, new_id=False) refind_stracks.append(track) for it in u_track: track = r_tracked_stracks[it] if not track.state == TrackState.Lost: track.mark_lost() lost_stracks.append(track) '''Deal with unconfirmed tracks, usually tracks with only one beginning frame''' detections = [detections[i] for i in u_detection] dists = matching.iou_distance(unconfirmed, detections) matches, u_unconfirmed, u_detection = matching.linear_assignment( dists, thresh=0.7) for itracked, idet in matches: unconfirmed[itracked].update(detections[idet], self.frame_id) activated_starcks.append(unconfirmed[itracked]) for it in u_unconfirmed: track = unconfirmed[it] track.mark_removed() removed_stracks.append(track) """ Step 4: Init new stracks""" for inew in u_detection: track = detections[inew] if track.score < self.new_track_thresh: continue track.activate(self.kalman_filter, self.frame_id) activated_starcks.append(track) """ Step 5: Update state""" for track in self.lost_stracks: if self.frame_id - track.end_frame > self.max_time_lost: track.mark_removed() removed_stracks.append(track) """ Merge """ self.tracked_stracks = [ t for t in self.tracked_stracks if t.state == TrackState.Tracked ] self.tracked_stracks = joint_stracks(self.tracked_stracks, activated_starcks) self.tracked_stracks = joint_stracks(self.tracked_stracks, refind_stracks) self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks) self.lost_stracks.extend(lost_stracks) self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks) self.removed_stracks.extend(removed_stracks) self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks( self.tracked_stracks, self.lost_stracks) # output_stracks = [track for track in 
self.tracked_stracks if track.is_activated] output_stracks = [track for track in self.tracked_stracks] return output_stracks ================================================ FILE: ppdet/modeling/mot/tracker/center_tracker.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/xingyizhou/CenterTrack/blob/master/src/lib/utils/tracker.py """ import copy import numpy as np from scipy.optimize import linear_sum_assignment from ppdet.core.workspace import register, serializable from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = ['CenterTracker'] @register @serializable class CenterTracker(object): __shared__ = ['num_classes'] def __init__(self, num_classes=1, min_box_area=0, vertical_ratio=-1, track_thresh=0.4, pre_thresh=0.5, new_thresh=0.4, out_thresh=0.4, hungarian=False): self.num_classes = num_classes self.min_box_area = min_box_area self.vertical_ratio = vertical_ratio self.track_thresh = track_thresh self.pre_thresh = max(track_thresh, pre_thresh) self.new_thresh = max(track_thresh, new_thresh) self.out_thresh = max(track_thresh, out_thresh) self.hungarian = hungarian self.reset() def init_track(self, results): logger.info('Initialize tracking!') for item in results: if item['score'] > self.new_thresh: self.id_count += 1 item['tracking_id'] = self.id_count if 'ct' not in item: bbox = item['bbox'] item['ct'] = [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2] self.tracks.append(item) def reset(self): self.id_count = 0 self.tracks = [] def update(self, results, public_det=None): N = len(results) M = len(self.tracks) dets = np.array([det['ct'] + det['tracking'] for det in results], np.float32) # N x 2 track_size = np.array([((track['bbox'][2] - track['bbox'][0]) * \ (track['bbox'][3] - track['bbox'][1])) \ for track in self.tracks], np.float32) # M track_cat = np.array([track['class'] for track in self.tracks], np.int32) # M item_size = np.array([((item['bbox'][2] - item['bbox'][0]) * \ (item['bbox'][3] - item['bbox'][1])) \ for item in results], np.float32) # N item_cat = np.array([item['class'] for item in results], np.int32) # N tracks = np.array([pre_det['ct'] for pre_det in self.tracks], np.float32) # M x 2 dist = (((tracks.reshape(1, -1, 2) - \ dets.reshape(-1, 1, 2)) ** 2).sum(axis=2)) # N x M invalid = ((dist > track_size.reshape(1, M)) + \ (dist > item_size.reshape(N, 1)) + \ (item_cat.reshape(N, 1) != track_cat.reshape(1, M))) > 0 dist = dist + invalid * 1e18 if self.hungarian: item_score = np.array([item['score'] for item in results], np.float32) dist[dist > 1e18] = 1e18 # sklearn.utils.linear_assignment_ was removed in scikit-learn 0.23, use scipy's maintained equivalent instead row_ind, col_ind = linear_sum_assignment(dist) matched_indices = np.stack([row_ind, col_ind], axis=1) else: matched_indices = greedy_assignment(copy.deepcopy(dist)) unmatched_dets = [d for d in range(dets.shape[0]) \ if not (d in matched_indices[:, 0])] unmatched_tracks = [d for d in range(tracks.shape[0]) \ if not (d in matched_indices[:, 1])] if self.hungarian: matches = []
for m in matched_indices: if dist[m[0], m[1]] > 1e16: unmatched_dets.append(m[0]) unmatched_tracks.append(m[1]) else: matches.append(m) matches = np.array(matches).reshape(-1, 2) else: matches = matched_indices ret = [] for m in matches: track = results[m[0]] track['tracking_id'] = self.tracks[m[1]]['tracking_id'] ret.append(track) # Private detection: create tracks for all un-matched detections for i in unmatched_dets: track = results[i] if track['score'] > self.new_thresh: self.id_count += 1 track['tracking_id'] = self.id_count ret.append(track) self.tracks = ret return ret def greedy_assignment(dist): matched_indices = [] if dist.shape[1] == 0: return np.array(matched_indices, np.int32).reshape(-1, 2) for i in range(dist.shape[0]): j = dist[i].argmin() if dist[i][j] < 1e16: dist[:, j] = 1e18 matched_indices.append([i, j]) return np.array(matched_indices, np.int32).reshape(-1, 2) ================================================ FILE: ppdet/modeling/mot/tracker/deepsort_tracker.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/nwojke/deep_sort/blob/master/deep_sort/tracker.py """ import numpy as np from ..motion import KalmanFilter from ..matching.deepsort_matching import NearestNeighborDistanceMetric from ..matching.deepsort_matching import iou_cost, min_cost_matching, matching_cascade, gate_cost_matrix from .base_sde_tracker import Track from ..utils import Detection from ppdet.core.workspace import register, serializable from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = ['DeepSORTTracker'] @register @serializable class DeepSORTTracker(object): """ DeepSORT tracker Args: input_size (list): input feature map size to reid model, [h, w] format, [64, 192] as default. min_box_area (int): min box area to filter out low quality boxes vertical_ratio (float): w/h, the vertical ratio of the bbox to filter bad results, set 1.6 default for pedestrian tracking. If set <=0 means no need to filter bboxes. budget (int): If not None, fix samples per class to at most this number. Removes the oldest samples when the budget is reached. max_age (int): maximum number of consecutive misses before a track is deleted n_init (int): Number of frames that a track remains in initialization phase. Number of consecutive detections before the track is confirmed. The track state is set to `Deleted` if a miss occurs within the first `n_init` frames. metric_type (str): either "euclidean" or "cosine", the distance metric used for measurement to track association. matching_threshold (float): samples with larger distance are considered an invalid match.
max_iou_distance (float): max iou distance threshold motion (object): KalmanFilter instance """ def __init__(self, input_size=[64, 192], min_box_area=0, vertical_ratio=-1, budget=100, max_age=70, n_init=3, metric_type='cosine', matching_threshold=0.2, max_iou_distance=0.9, motion='KalmanFilter'): self.input_size = input_size self.min_box_area = min_box_area self.vertical_ratio = vertical_ratio self.max_age = max_age self.n_init = n_init self.metric = NearestNeighborDistanceMetric(metric_type, matching_threshold, budget) self.max_iou_distance = max_iou_distance if motion == 'KalmanFilter': self.motion = KalmanFilter() self.tracks = [] self._next_id = 1 def predict(self): """ Propagate track state distributions one time step forward. This function should be called once every time step, before `update`. """ for track in self.tracks: track.predict(self.motion) def update(self, pred_dets, pred_embs): """ Perform measurement update and track management. Args: pred_dets (np.array): Detection results of the image, the shape is [N, 6], means 'cls_id, score, x0, y0, x1, y1'. pred_embs (np.array): Embedding results of the image, the shape is [N, 128], usually pred_embs.shape[1] is a multiple of 128. """ pred_cls_ids = pred_dets[:, 0:1] pred_scores = pred_dets[:, 1:2] pred_xyxys = pred_dets[:, 2:6] pred_tlwhs = np.concatenate((pred_xyxys[:, 0:2], pred_xyxys[:, 2:4] - pred_xyxys[:, 0:2] + 1), axis=1) detections = [ Detection(tlwh, score, feat, cls_id) for tlwh, score, feat, cls_id in zip(pred_tlwhs, pred_scores, pred_embs, pred_cls_ids) ] # Run matching cascade. matches, unmatched_tracks, unmatched_detections = \ self._match(detections) # Update track set. for track_idx, detection_idx in matches: self.tracks[track_idx].update(self.motion, detections[detection_idx]) for track_idx in unmatched_tracks: self.tracks[track_idx].mark_missed() for detection_idx in unmatched_detections: self._initiate_track(detections[detection_idx]) self.tracks = [t for t in self.tracks if not t.is_deleted()] # Update distance metric. active_targets = [t.track_id for t in self.tracks if t.is_confirmed()] features, targets = [], [] for track in self.tracks: if not track.is_confirmed(): continue features += track.features targets += [track.track_id for _ in track.features] track.features = [] self.metric.partial_fit( np.asarray(features), np.asarray(targets), active_targets) output_stracks = self.tracks return output_stracks def _match(self, detections): def gated_metric(tracks, dets, track_indices, detection_indices): features = np.array([dets[i].feature for i in detection_indices]) targets = np.array([tracks[i].track_id for i in track_indices]) cost_matrix = self.metric.distance(features, targets) cost_matrix = gate_cost_matrix(self.motion, cost_matrix, tracks, dets, track_indices, detection_indices) return cost_matrix # Split track set into confirmed and unconfirmed tracks. confirmed_tracks = [ i for i, t in enumerate(self.tracks) if t.is_confirmed() ] unconfirmed_tracks = [ i for i, t in enumerate(self.tracks) if not t.is_confirmed() ] # Associate confirmed tracks using appearance features. matches_a, unmatched_tracks_a, unmatched_detections = \ matching_cascade( gated_metric, self.metric.matching_threshold, self.max_age, self.tracks, detections, confirmed_tracks) # Associate remaining tracks together with unconfirmed tracks using IOU. 
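# This is the standard two-stage DeepSORT association: confirmed tracks are
# matched first by appearance in matching_cascade (gated by the Mahalanobis
# distance through gate_cost_matrix), and only tracks missed for exactly one
# frame, together with the still-unconfirmed tracks, fall through to the
# cheaper IoU matching below; tracks lost for longer than one frame simply
# wait for the next appearance pass.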
iou_track_candidates = unconfirmed_tracks + [ k for k in unmatched_tracks_a if self.tracks[k].time_since_update == 1 ] unmatched_tracks_a = [ k for k in unmatched_tracks_a if self.tracks[k].time_since_update != 1 ] matches_b, unmatched_tracks_b, unmatched_detections = \ min_cost_matching( iou_cost, self.max_iou_distance, self.tracks, detections, iou_track_candidates, unmatched_detections) matches = matches_a + matches_b unmatched_tracks = list(set(unmatched_tracks_a + unmatched_tracks_b)) return matches, unmatched_tracks, unmatched_detections def _initiate_track(self, detection): mean, covariance = self.motion.initiate(detection.to_xyah()) self.tracks.append( Track(mean, covariance, self._next_id, self.n_init, self.max_age, detection.cls_id, detection.score, detection.feature)) self._next_id += 1 ================================================ FILE: ppdet/modeling/mot/tracker/jde_tracker.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ This code is based on https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/multitracker.py """ import numpy as np from collections import defaultdict from ..matching import jde_matching as matching from ..motion import KalmanFilter from .base_jde_tracker import TrackState, STrack from .base_jde_tracker import joint_stracks, sub_stracks, remove_duplicate_stracks from ppdet.core.workspace import register, serializable from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = ['JDETracker'] @register @serializable class JDETracker(object): __shared__ = ['num_classes'] """ JDE tracker, support single class and multi classes Args: use_byte (bool): Whether use ByteTracker, default False num_classes (int): the number of classes det_thresh (float): threshold of detection score track_buffer (int): buffer for tracker min_box_area (int): min box area to filter out low quality boxes vertical_ratio (float): w/h, the vertical ratio of the bbox to filter bad results. If set <= 0 means no need to filter bboxes,usually set 1.6 for pedestrian tracking. tracked_thresh (float): linear assignment threshold of tracked stracks and detections r_tracked_thresh (float): linear assignment threshold of tracked stracks and unmatched detections unconfirmed_thresh (float): linear assignment threshold of unconfirmed stracks and unmatched detections conf_thres (float): confidence threshold for tracking, also used in ByteTracker as higher confidence threshold match_thres (float): linear assignment threshold of tracked stracks and detections in ByteTracker low_conf_thres (float): lower confidence threshold for tracking in ByteTracker input_size (list): input feature map size to reid model, [h, w] format, [64, 192] as default. motion (str): motion model, KalmanFilter as default metric_type (str): either "euclidean" or "cosine", the distance metric used for measurement to track association. 
""" def __init__(self, use_byte=False, num_classes=1, det_thresh=0.3, track_buffer=30, min_box_area=0, vertical_ratio=0, tracked_thresh=0.7, r_tracked_thresh=0.5, unconfirmed_thresh=0.7, conf_thres=0, match_thres=0.8, low_conf_thres=0.2, input_size=[64, 192], motion='KalmanFilter', metric_type='euclidean'): self.use_byte = use_byte self.num_classes = num_classes self.det_thresh = det_thresh if not use_byte else conf_thres + 0.1 self.track_buffer = track_buffer self.min_box_area = min_box_area self.vertical_ratio = vertical_ratio self.tracked_thresh = tracked_thresh self.r_tracked_thresh = r_tracked_thresh self.unconfirmed_thresh = unconfirmed_thresh self.conf_thres = conf_thres self.match_thres = match_thres self.low_conf_thres = low_conf_thres self.input_size = input_size if motion == 'KalmanFilter': self.motion = KalmanFilter() self.metric_type = metric_type self.frame_id = 0 self.tracked_tracks_dict = defaultdict(list) # dict(list[STrack]) self.lost_tracks_dict = defaultdict(list) # dict(list[STrack]) self.removed_tracks_dict = defaultdict(list) # dict(list[STrack]) self.max_time_lost = 0 # max_time_lost will be calculated: int(frame_rate / 30.0 * track_buffer) def update(self, pred_dets, pred_embs=None): """ Processes the image frame and finds bounding box(detections). Associates the detection with corresponding tracklets and also handles lost, removed, refound and active tracklets. Args: pred_dets (np.array): Detection results of the image, the shape is [N, 6], means 'cls_id, score, x0, y0, x1, y1'. pred_embs (np.array): Embedding results of the image, the shape is [N, 128] or [N, 512]. Return: output_stracks_dict (dict(list)): The list contains information regarding the online_tracklets for the received image tensor. """ self.frame_id += 1 if self.frame_id == 1: STrack.init_count(self.num_classes) activated_tracks_dict = defaultdict(list) refined_tracks_dict = defaultdict(list) lost_tracks_dict = defaultdict(list) removed_tracks_dict = defaultdict(list) output_tracks_dict = defaultdict(list) pred_dets_dict = defaultdict(list) pred_embs_dict = defaultdict(list) # unify single and multi classes detection and embedding results for cls_id in range(self.num_classes): cls_idx = (pred_dets[:, 0:1] == cls_id).squeeze(-1) pred_dets_dict[cls_id] = pred_dets[cls_idx] if pred_embs is not None: pred_embs_dict[cls_id] = pred_embs[cls_idx] else: pred_embs_dict[cls_id] = None for cls_id in range(self.num_classes): """ Step 1: Get detections by class""" pred_dets_cls = pred_dets_dict[cls_id] pred_embs_cls = pred_embs_dict[cls_id] remain_inds = (pred_dets_cls[:, 1:2] > self.conf_thres).squeeze(-1) if remain_inds.sum() > 0: pred_dets_cls = pred_dets_cls[remain_inds] if pred_embs_cls is None: # in original ByteTrack detections = [ STrack( STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1], cls_id, 30, temp_feat=None) for tlbrs in pred_dets_cls ] else: pred_embs_cls = pred_embs_cls[remain_inds] detections = [ STrack( STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1], cls_id, 30, temp_feat) for (tlbrs, temp_feat) in zip(pred_dets_cls, pred_embs_cls) ] else: detections = [] ''' Add newly detected tracklets to tracked_stracks''' unconfirmed_dict = defaultdict(list) tracked_tracks_dict = defaultdict(list) for track in self.tracked_tracks_dict[cls_id]: if not track.is_activated: # previous tracks which are not active in the current frame are added in unconfirmed list unconfirmed_dict[cls_id].append(track) else: # Active tracks are added to the local list 'tracked_stracks' tracked_tracks_dict[cls_id].append(track) """ Step 2: 
First association, with embedding""" # building tracking pool for the current frame track_pool_dict = defaultdict(list) track_pool_dict[cls_id] = joint_stracks( tracked_tracks_dict[cls_id], self.lost_tracks_dict[cls_id]) # Predict the current location with KalmanFilter STrack.multi_predict(track_pool_dict[cls_id], self.motion) if pred_embs_cls is None: # in original ByteTrack dists = matching.iou_distance(track_pool_dict[cls_id], detections) matches, u_track, u_detection = matching.linear_assignment( dists, thresh=self.match_thres) # not self.tracked_thresh else: dists = matching.embedding_distance( track_pool_dict[cls_id], detections, metric=self.metric_type) dists = matching.fuse_motion( self.motion, dists, track_pool_dict[cls_id], detections) matches, u_track, u_detection = matching.linear_assignment( dists, thresh=self.tracked_thresh) for i_tracked, idet in matches: # i_tracked is the id of the track and idet is the detection track = track_pool_dict[cls_id][i_tracked] det = detections[idet] if track.state == TrackState.Tracked: # If the track is active, add the detection to the track track.update(detections[idet], self.frame_id) activated_tracks_dict[cls_id].append(track) else: # We have obtained a detection from a track which is not active, # hence put the track in refind_stracks list track.re_activate(det, self.frame_id, new_id=False) refined_tracks_dict[cls_id].append(track) # None of the steps below happen if there are no undetected tracks. """ Step 3: Second association, with IOU""" if self.use_byte: inds_low = pred_dets_dict[cls_id][:, 1:2] > self.low_conf_thres inds_high = pred_dets_dict[cls_id][:, 1:2] < self.conf_thres inds_second = np.logical_and(inds_low, inds_high).squeeze(-1) pred_dets_cls_second = pred_dets_dict[cls_id][inds_second] # association the untrack to the low score detections if len(pred_dets_cls_second) > 0: if pred_embs_dict[cls_id] is None: # in original ByteTrack detections_second = [ STrack( STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1], cls_id, 30, temp_feat=None) for tlbrs in pred_dets_cls_second ] else: pred_embs_cls_second = pred_embs_dict[cls_id][ inds_second] detections_second = [ STrack( STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1], cls_id, 30, temp_feat) for (tlbrs, temp_feat) in zip(pred_dets_cls_second, pred_embs_cls_second) ] else: detections_second = [] r_tracked_stracks = [ track_pool_dict[cls_id][i] for i in u_track if track_pool_dict[cls_id][i].state == TrackState.Tracked ] dists = matching.iou_distance(r_tracked_stracks, detections_second) matches, u_track, u_detection_second = matching.linear_assignment( dists, thresh=0.4) # not r_tracked_thresh else: detections = [detections[i] for i in u_detection] r_tracked_stracks = [] for i in u_track: if track_pool_dict[cls_id][i].state == TrackState.Tracked: r_tracked_stracks.append(track_pool_dict[cls_id][i]) dists = matching.iou_distance(r_tracked_stracks, detections) matches, u_track, u_detection = matching.linear_assignment( dists, thresh=self.r_tracked_thresh) for i_tracked, idet in matches: track = r_tracked_stracks[i_tracked] det = detections[ idet] if not self.use_byte else detections_second[idet] if track.state == TrackState.Tracked: track.update(det, self.frame_id) activated_tracks_dict[cls_id].append(track) else: track.re_activate(det, self.frame_id, new_id=False) refined_tracks_dict[cls_id].append(track) for it in u_track: track = r_tracked_stracks[it] if not track.state == TrackState.Lost: track.mark_lost() lost_tracks_dict[cls_id].append(track) '''Deal with unconfirmed tracks, usually tracks 
with only one beginning frame''' detections = [detections[i] for i in u_detection] dists = matching.iou_distance(unconfirmed_dict[cls_id], detections) matches, u_unconfirmed, u_detection = matching.linear_assignment( dists, thresh=self.unconfirmed_thresh) for i_tracked, idet in matches: unconfirmed_dict[cls_id][i_tracked].update(detections[idet], self.frame_id) activated_tracks_dict[cls_id].append(unconfirmed_dict[cls_id][ i_tracked]) for it in u_unconfirmed: track = unconfirmed_dict[cls_id][it] track.mark_removed() removed_tracks_dict[cls_id].append(track) """ Step 4: Init new stracks""" for inew in u_detection: track = detections[inew] if track.score < self.det_thresh: continue track.activate(self.motion, self.frame_id) activated_tracks_dict[cls_id].append(track) """ Step 5: Update state""" for track in self.lost_tracks_dict[cls_id]: if self.frame_id - track.end_frame > self.max_time_lost: track.mark_removed() removed_tracks_dict[cls_id].append(track) self.tracked_tracks_dict[cls_id] = [ t for t in self.tracked_tracks_dict[cls_id] if t.state == TrackState.Tracked ] self.tracked_tracks_dict[cls_id] = joint_stracks( self.tracked_tracks_dict[cls_id], activated_tracks_dict[cls_id]) self.tracked_tracks_dict[cls_id] = joint_stracks( self.tracked_tracks_dict[cls_id], refined_tracks_dict[cls_id]) self.lost_tracks_dict[cls_id] = sub_stracks( self.lost_tracks_dict[cls_id], self.tracked_tracks_dict[cls_id]) self.lost_tracks_dict[cls_id].extend(lost_tracks_dict[cls_id]) self.lost_tracks_dict[cls_id] = sub_stracks( self.lost_tracks_dict[cls_id], self.removed_tracks_dict[cls_id]) self.removed_tracks_dict[cls_id].extend(removed_tracks_dict[cls_id]) self.tracked_tracks_dict[cls_id], self.lost_tracks_dict[ cls_id] = remove_duplicate_stracks( self.tracked_tracks_dict[cls_id], self.lost_tracks_dict[cls_id]) # get scores of lost tracks output_tracks_dict[cls_id] = [ track for track in self.tracked_tracks_dict[cls_id] if track.is_activated ] logger.debug('===========Frame {}=========='.format(self.frame_id)) logger.debug('Activated: {}'.format( [track.track_id for track in activated_tracks_dict[cls_id]])) logger.debug('Refind: {}'.format( [track.track_id for track in refined_tracks_dict[cls_id]])) logger.debug('Lost: {}'.format( [track.track_id for track in lost_tracks_dict[cls_id]])) logger.debug('Removed: {}'.format( [track.track_id for track in removed_tracks_dict[cls_id]])) return output_tracks_dict ================================================ FILE: ppdet/modeling/mot/tracker/ocsort_tracker.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
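A minimal driver sketch for the JDETracker defined above, before the OC-SORT implementation that follows. The two detections are fabricated for illustration; the layout follows the documented [N, 6] = (cls_id, score, x0, y0, x1, y1) convention, and the import path assumes the package exports set up in tracker/__init__.py:

import numpy as np
from ppdet.modeling.mot.tracker import JDETracker

# ByteTrack-style usage: detections only, no ReID embeddings,
# two confidence bands for the two association rounds.
tracker = JDETracker(use_byte=True, conf_thres=0.6, low_conf_thres=0.2)
pred_dets = np.array(
    [[0, 0.9, 100., 100., 150., 200.],   # high-score box, first association
     [0, 0.3, 300., 120., 360., 260.]],  # low-score box, BYTE second pass
    dtype=np.float32)
online = tracker.update(pred_dets, pred_embs=None)  # dict: cls_id -> list[STrack]
for t in online[0]:
    print(t.track_id, t.tlwh, t.score)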
""" This code is based on https://github.com/noahcao/OC_SORT/blob/master/trackers/ocsort_tracker/ocsort.py """ import numpy as np from ..matching.ocsort_matching import associate, linear_assignment, iou_batch, associate_only_iou from ..motion.ocsort_kalman_filter import OCSORTKalmanFilter from ppdet.core.workspace import register, serializable def k_previous_obs(observations, cur_age, k): if len(observations) == 0: return [-1, -1, -1, -1, -1] for i in range(k): dt = k - i if cur_age - dt in observations: return observations[cur_age - dt] max_age = max(observations.keys()) return observations[max_age] def convert_bbox_to_z(bbox): """ Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form [x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is the aspect ratio """ w = bbox[2] - bbox[0] h = bbox[3] - bbox[1] x = bbox[0] + w / 2. y = bbox[1] + h / 2. s = w * h # scale is just area r = w / float(h + 1e-6) return np.array([x, y, s, r]).reshape((4, 1)) def convert_x_to_bbox(x, score=None): """ Takes a bounding box in the centre form [x,y,s,r] and returns it in the form [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right """ w = np.sqrt(x[2] * x[3]) h = x[2] / w if (score == None): return np.array( [x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2.]).reshape((1, 4)) else: score = np.array([score]) return np.array([ x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2., score ]).reshape((1, 5)) def speed_direction(bbox1, bbox2): cx1, cy1 = (bbox1[0] + bbox1[2]) / 2.0, (bbox1[1] + bbox1[3]) / 2.0 cx2, cy2 = (bbox2[0] + bbox2[2]) / 2.0, (bbox2[1] + bbox2[3]) / 2.0 speed = np.array([cy2 - cy1, cx2 - cx1]) norm = np.sqrt((cy2 - cy1)**2 + (cx2 - cx1)**2) + 1e-6 return speed / norm class KalmanBoxTracker(object): """ This class represents the internal state of individual tracked objects observed as bbox. Args: bbox (np.array): bbox in [x1,y1,x2,y2,score] format. delta_t (int): delta_t of previous observation """ count = 0 def __init__(self, bbox, delta_t=3): self.kf = OCSORTKalmanFilter(dim_x=7, dim_z=4) self.kf.F = np.array([[1., 0, 0, 0, 1., 0, 0], [0, 1., 0, 0, 0, 1., 0], [0, 0, 1., 0, 0, 0, 1], [0, 0, 0, 1., 0, 0, 0], [0, 0, 0, 0, 1., 0, 0], [0, 0, 0, 0, 0, 1., 0], [0, 0, 0, 0, 0, 0, 1.]]) self.kf.H = np.array([[1., 0, 0, 0, 0, 0, 0], [0, 1., 0, 0, 0, 0, 0], [0, 0, 1., 0, 0, 0, 0], [0, 0, 0, 1., 0, 0, 0]]) self.kf.R[2:, 2:] *= 10. self.kf.P[4:, 4:] *= 1000. # give high uncertainty to the unobservable initial velocities self.kf.P *= 10. self.kf.Q[-1, -1] *= 0.01 self.kf.Q[4:, 4:] *= 0.01 self.score = bbox[4] self.kf.x[:4] = convert_bbox_to_z(bbox) self.time_since_update = 0 self.id = KalmanBoxTracker.count KalmanBoxTracker.count += 1 self.history = [] self.hits = 0 self.hit_streak = 0 self.age = 0 """ NOTE: [-1,-1,-1,-1,-1] is a compromising placeholder for non-observation status, the same for the return of function k_previous_obs. It is ugly and I do not like it. But to support generate observation array in a fast and unified way, which you would see below k_observations = np.array([k_previous_obs(...]]), let's bear it for now. """ self.last_observation = np.array([-1, -1, -1, -1, -1]) # placeholder self.observations = dict() self.history_observations = [] self.velocity = None self.delta_t = delta_t def update(self, bbox, angle_cost=False): """ Updates the state vector with observed bbox. 
""" if bbox is not None: if angle_cost and self.last_observation.sum( ) >= 0: # no previous observation previous_box = None for i in range(self.delta_t): dt = self.delta_t - i if self.age - dt in self.observations: previous_box = self.observations[self.age - dt] break if previous_box is None: previous_box = self.last_observation """ Estimate the track speed direction with observations \Delta t steps away """ self.velocity = speed_direction(previous_box, bbox) """ Insert new observations. This is a ugly way to maintain both self.observations and self.history_observations. Bear it for the moment. """ self.last_observation = bbox self.observations[self.age] = bbox self.history_observations.append(bbox) self.time_since_update = 0 self.history = [] self.hits += 1 self.hit_streak += 1 self.kf.update(convert_bbox_to_z(bbox)) else: self.kf.update(bbox) def predict(self): """ Advances the state vector and returns the predicted bounding box estimate. """ if ((self.kf.x[6] + self.kf.x[2]) <= 0): self.kf.x[6] *= 0.0 self.kf.predict() self.age += 1 if (self.time_since_update > 0): self.hit_streak = 0 self.time_since_update += 1 self.history.append(convert_x_to_bbox(self.kf.x, score=self.score)) return self.history[-1] def get_state(self): return convert_x_to_bbox(self.kf.x, score=self.score) @register @serializable class OCSORTTracker(object): """ OCSORT tracker, support single class Args: det_thresh (float): threshold of detection score max_age (int): maximum number of missed misses before a track is deleted min_hits (int): minimum hits for associate iou_threshold (float): iou threshold for associate delta_t (int): delta_t of previous observation inertia (float): vdc_weight of angle_diff_cost for associate vertical_ratio (float): w/h, the vertical ratio of the bbox to filter bad results. If set <= 0 means no need to filter bboxes,usually set 1.6 for pedestrian tracking. min_box_area (int): min box area to filter out low quality boxes use_byte (bool): Whether use ByteTracker, default False """ def __init__(self, det_thresh=0.6, max_age=30, min_hits=3, iou_threshold=0.3, delta_t=3, inertia=0.2, vertical_ratio=-1, min_box_area=0, use_byte=False, use_angle_cost=False): self.det_thresh = det_thresh self.max_age = max_age self.min_hits = min_hits self.iou_threshold = iou_threshold self.delta_t = delta_t self.inertia = inertia self.vertical_ratio = vertical_ratio self.min_box_area = min_box_area self.use_byte = use_byte self.use_angle_cost = use_angle_cost self.trackers = [] self.frame_count = 0 KalmanBoxTracker.count = 0 def update(self, pred_dets, pred_embs=None): """ Args: pred_dets (np.array): Detection results of the image, the shape is [N, 6], means 'cls_id, score, x0, y0, x1, y1'. pred_embs (np.array): Embedding results of the image, the shape is [N, 128] or [N, 512], default as None. Return: tracking boxes (np.array): [M, 6], means 'x0, y0, x1, y1, score, id'. """ if pred_dets is None: return np.empty((0, 6)) self.frame_count += 1 bboxes = pred_dets[:, 2:] scores = pred_dets[:, 1:2] dets = np.concatenate((bboxes, scores), axis=1) scores = scores.squeeze(-1) inds_low = scores > 0.1 inds_high = scores < self.det_thresh inds_second = np.logical_and(inds_low, inds_high) # self.det_thresh > score > 0.1, for second matching dets_second = dets[inds_second] # detections for second matching remain_inds = scores > self.det_thresh dets = dets[remain_inds] # get predicted locations from existing trackers. 
trks = np.zeros((len(self.trackers), 5)) to_del = [] ret = [] for t, trk in enumerate(trks): pos = self.trackers[t].predict()[0] trk[:] = [pos[0], pos[1], pos[2], pos[3], 0] if np.any(np.isnan(pos)): to_del.append(t) trks = np.ma.compress_rows(np.ma.masked_invalid(trks)) for t in reversed(to_del): self.trackers.pop(t) if self.use_angle_cost: velocities = np.array([ trk.velocity if trk.velocity is not None else np.array((0, 0)) for trk in self.trackers ]) k_observations = np.array([ k_previous_obs(trk.observations, trk.age, self.delta_t) for trk in self.trackers ]) last_boxes = np.array([trk.last_observation for trk in self.trackers]) """ First round of association """ if self.use_angle_cost: matched, unmatched_dets, unmatched_trks = associate( dets, trks, self.iou_threshold, velocities, k_observations, self.inertia) else: matched, unmatched_dets, unmatched_trks = associate_only_iou( dets, trks, self.iou_threshold) for m in matched: self.trackers[m[1]].update( dets[m[0], :], angle_cost=self.use_angle_cost) """ Second round of associaton by OCR """ # BYTE association if self.use_byte and len(dets_second) > 0 and unmatched_trks.shape[ 0] > 0: u_trks = trks[unmatched_trks] iou_left = iou_batch( dets_second, u_trks) # iou between low score detections and unmatched tracks iou_left = np.array(iou_left) if iou_left.max() > self.iou_threshold: """ NOTE: by using a lower threshold, e.g., self.iou_threshold - 0.1, you may get a higher performance especially on MOT17/MOT20 datasets. But we keep it uniform here for simplicity """ matched_indices = linear_assignment(-iou_left) to_remove_trk_indices = [] for m in matched_indices: det_ind, trk_ind = m[0], unmatched_trks[m[1]] if iou_left[m[0], m[1]] < self.iou_threshold: continue self.trackers[trk_ind].update( dets_second[det_ind, :], angle_cost=self.use_angle_cost) to_remove_trk_indices.append(trk_ind) unmatched_trks = np.setdiff1d(unmatched_trks, np.array(to_remove_trk_indices)) if unmatched_dets.shape[0] > 0 and unmatched_trks.shape[0] > 0: left_dets = dets[unmatched_dets] left_trks = last_boxes[unmatched_trks] iou_left = iou_batch(left_dets, left_trks) iou_left = np.array(iou_left) if iou_left.max() > self.iou_threshold: """ NOTE: by using a lower threshold, e.g., self.iou_threshold - 0.1, you may get a higher performance especially on MOT17/MOT20 datasets. 
But we keep it uniform here for simplicity """ rematched_indices = linear_assignment(-iou_left) to_remove_det_indices = [] to_remove_trk_indices = [] for m in rematched_indices: det_ind, trk_ind = unmatched_dets[m[0]], unmatched_trks[m[ 1]] if iou_left[m[0], m[1]] < self.iou_threshold: continue self.trackers[trk_ind].update( dets[det_ind, :], angle_cost=self.use_angle_cost) to_remove_det_indices.append(det_ind) to_remove_trk_indices.append(trk_ind) unmatched_dets = np.setdiff1d(unmatched_dets, np.array(to_remove_det_indices)) unmatched_trks = np.setdiff1d(unmatched_trks, np.array(to_remove_trk_indices)) for m in unmatched_trks: self.trackers[m].update(None) # create and initialise new trackers for unmatched detections for i in unmatched_dets: trk = KalmanBoxTracker(dets[i, :], delta_t=self.delta_t) self.trackers.append(trk) i = len(self.trackers) for trk in reversed(self.trackers): if trk.last_observation.sum() < 0: d = trk.get_state()[0] else: d = trk.last_observation # tlbr + score if (trk.time_since_update < 1) and ( trk.hit_streak >= self.min_hits or self.frame_count <= self.min_hits): # +1 as MOT benchmark requires positive ret.append(np.concatenate((d, [trk.id + 1])).reshape(1, -1)) i -= 1 # remove dead tracklet if (trk.time_since_update > self.max_age): self.trackers.pop(i) if (len(ret) > 0): return np.concatenate(ret) return np.empty((0, 6)) ================================================ FILE: ppdet/modeling/mot/utils.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import cv2 import time import numpy as np from .visualization import plot_tracking_dict, plot_tracking __all__ = [ 'MOTTimer', 'Detection', 'write_mot_results', 'save_vis_results', 'load_det_results', 'preprocess_reid', 'get_crops', 'clip_box', 'scale_coords', ] class MOTTimer(object): """ This class used to compute and print the current FPS while evaling. """ def __init__(self): self.total_time = 0. self.calls = 0 self.start_time = 0. self.diff = 0. self.average_time = 0. self.duration = 0. def tic(self): # using time.time instead of time.clock because time time.clock # does not normalize for multithreading self.start_time = time.time() def toc(self, average=True): self.diff = time.time() - self.start_time self.total_time += self.diff self.calls += 1 self.average_time = self.total_time / self.calls if average: self.duration = self.average_time else: self.duration = self.diff return self.duration def clear(self): self.total_time = 0. self.calls = 0 self.start_time = 0. self.diff = 0. self.average_time = 0. self.duration = 0. class Detection(object): """ This class represents a bounding box detection in a single image. Args: tlwh (Tensor): Bounding box in format `(top left x, top left y, width, height)`. score (Tensor): Bounding box confidence score. feature (Tensor): A feature vector that describes the object contained in this image. cls_id (Tensor): Bounding box category id. 
""" def __init__(self, tlwh, score, feature, cls_id): self.tlwh = np.asarray(tlwh, dtype=np.float32) self.score = float(score) self.feature = np.asarray(feature, dtype=np.float32) self.cls_id = int(cls_id) def to_tlbr(self): """ Convert bounding box to format `(min x, min y, max x, max y)`, i.e., `(top left, bottom right)`. """ ret = self.tlwh.copy() ret[2:] += ret[:2] return ret def to_xyah(self): """ Convert bounding box to format `(center x, center y, aspect ratio, height)`, where the aspect ratio is `width / height`. """ ret = self.tlwh.copy() ret[:2] += ret[2:] / 2 ret[2] /= ret[3] return ret def write_mot_results(filename, results, data_type='mot', num_classes=1): # support single and multi classes if data_type in ['mot', 'mcmot']: save_format = '{frame},{id},{x1},{y1},{w},{h},{score},{cls_id},-1,-1\n' elif data_type == 'kitti': save_format = '{frame} {id} car 0 0 -10 {x1} {y1} {x2} {y2} -10 -10 -10 -1000 -1000 -1000 -10\n' else: raise ValueError(data_type) f = open(filename, 'w') for cls_id in range(num_classes): for frame_id, tlwhs, tscores, track_ids in results[cls_id]: if data_type == 'kitti': frame_id -= 1 for tlwh, score, track_id in zip(tlwhs, tscores, track_ids): if track_id < 0: continue if data_type == 'mot': cls_id = -1 x1, y1, w, h = tlwh x2, y2 = x1 + w, y1 + h line = save_format.format( frame=frame_id, id=track_id, x1=x1, y1=y1, x2=x2, y2=y2, w=w, h=h, score=score, cls_id=cls_id) f.write(line) print('MOT results save in {}'.format(filename)) def save_vis_results(data, frame_id, online_ids, online_tlwhs, online_scores, average_time, show_image, save_dir, num_classes=1, ids2names=[]): if show_image or save_dir is not None: assert 'ori_image' in data img0 = data['ori_image'].numpy()[0] if online_ids is None: online_im = img0 else: if isinstance(online_tlwhs, dict): online_im = plot_tracking_dict( img0, num_classes, online_tlwhs, online_ids, online_scores, frame_id=frame_id, fps=1. / average_time, ids2names=ids2names) else: online_im = plot_tracking( img0, online_tlwhs, online_ids, online_scores, frame_id=frame_id, fps=1. / average_time, ids2names=ids2names) if show_image: cv2.imshow('online_im', online_im) if save_dir is not None: cv2.imwrite( os.path.join(save_dir, '{:05d}.jpg'.format(frame_id)), online_im) def load_det_results(det_file, num_frames): assert os.path.exists(det_file) and os.path.isfile(det_file), \ '{} is not exist or not a file.'.format(det_file) labels = np.loadtxt(det_file, dtype='float32', delimiter=',') assert labels.shape[1] == 7, \ "Each line of {} should have 7 items: '[frame_id],[x0],[y0],[w],[h],[score],[class_id]'.".format(det_file) results_list = [] for frame_i in range(num_frames): results = {'bbox': [], 'score': [], 'cls_id': []} lables_with_frame = labels[labels[:, 0] == frame_i + 1] # each line of lables_with_frame: # [frame_id],[x0],[y0],[w],[h],[score],[class_id] for l in lables_with_frame: results['bbox'].append(l[1:5]) results['score'].append(l[5:6]) results['cls_id'].append(l[6:7]) results_list.append(results) return results_list def scale_coords(coords, input_shape, im_shape, scale_factor): # Note: ratio has only one value, scale_factor[0] == scale_factor[1] # # This function only used for JDE YOLOv3 or other detectors with # LetterBoxResize and JDEBBoxPostProcess, coords output from detector had # not scaled back to the origin image. 
ratio = scale_factor[0] pad_w = (input_shape[1] - int(im_shape[1])) / 2 pad_h = (input_shape[0] - int(im_shape[0])) / 2 coords[:, 0::2] -= pad_w coords[:, 1::2] -= pad_h coords[:, 0:4] /= ratio coords[:, :4] = np.clip(coords[:, :4], a_min=0, a_max=coords[:, :4].max()) return coords.round() def clip_box(xyxy, ori_image_shape): H, W = ori_image_shape xyxy[:, 0::2] = np.clip(xyxy[:, 0::2], a_min=0, a_max=W) xyxy[:, 1::2] = np.clip(xyxy[:, 1::2], a_min=0, a_max=H) w = xyxy[:, 2:3] - xyxy[:, 0:1] h = xyxy[:, 3:4] - xyxy[:, 1:2] mask = np.logical_and(h > 0, w > 0) keep_idx = np.nonzero(mask) return xyxy[keep_idx[0]], keep_idx def get_crops(xyxy, ori_img, w, h): crops = [] xyxy = xyxy.astype(np.int64) ori_img = ori_img.numpy() ori_img = np.squeeze(ori_img, axis=0).transpose(1, 0, 2) # [h,w,3]->[w,h,3] for i, bbox in enumerate(xyxy): crop = ori_img[bbox[0]:bbox[2], bbox[1]:bbox[3], :] crops.append(crop) crops = preprocess_reid(crops, w, h) return crops def preprocess_reid(imgs, w=64, h=192, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]): im_batch = [] for img in imgs: img = cv2.resize(img, (w, h)) img = img[:, :, ::-1].astype('float32').transpose((2, 0, 1)) / 255 img_mean = np.array(mean).reshape((3, 1, 1)) img_std = np.array(std).reshape((3, 1, 1)) img -= img_mean img /= img_std img = np.expand_dims(img, axis=0) im_batch.append(img) im_batch = np.concatenate(im_batch, 0) return im_batch ================================================ FILE: ppdet/modeling/mot/visualization.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import cv2 import numpy as np def get_color(idx): idx = idx * 3 color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255) return color def plot_tracking(image, tlwhs, obj_ids, scores=None, frame_id=0, fps=0., ids2names=[]): im = np.ascontiguousarray(np.copy(image)) im_h, im_w = im.shape[:2] top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255 text_scale = max(1, image.shape[1] / 1600.) text_thickness = 2 line_thickness = max(1, int(image.shape[1] / 500.)) radius = max(5, int(im_w / 140.)) cv2.putText( im, 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)), (0, int(15 * text_scale)), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), thickness=2) for i, tlwh in enumerate(tlwhs): x1, y1, w, h = tlwh intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h))) obj_id = int(obj_ids[i]) id_text = '{}'.format(int(obj_id)) if ids2names != []: assert len( ids2names) == 1, "plot_tracking only supports single classes." 
id_text = '{}_'.format(ids2names[0]) + id_text _line_thickness = 1 if obj_id <= 0 else line_thickness color = get_color(abs(obj_id)) cv2.rectangle( im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness) cv2.putText( im, id_text, (intbox[0], intbox[1] - 10), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), thickness=text_thickness) if scores is not None: text = '{:.2f}'.format(float(scores[i])) cv2.putText( im, text, (intbox[0], intbox[1] + 10), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 255, 255), thickness=text_thickness) return im def plot_tracking_dict(image, num_classes, tlwhs_dict, obj_ids_dict, scores_dict, frame_id=0, fps=0., ids2names=[]): im = np.ascontiguousarray(np.copy(image)) im_h, im_w = im.shape[:2] top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255 text_scale = max(1, image.shape[1] / 1600.) text_thickness = 2 line_thickness = max(1, int(image.shape[1] / 500.)) radius = max(5, int(im_w / 140.)) for cls_id in range(num_classes): tlwhs = tlwhs_dict[cls_id] obj_ids = obj_ids_dict[cls_id] scores = scores_dict[cls_id] cv2.putText( im, 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)), (0, int(15 * text_scale)), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), thickness=2) for i, tlwh in enumerate(tlwhs): x1, y1, w, h = tlwh intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h))) obj_id = int(obj_ids[i]) id_text = '{}'.format(int(obj_id)) if ids2names != []: id_text = '{}_{}'.format(ids2names[cls_id], id_text) else: id_text = 'class{}_{}'.format(cls_id, id_text) _line_thickness = 1 if obj_id <= 0 else line_thickness color = get_color(abs(obj_id)) cv2.rectangle( im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness) cv2.putText( im, id_text, (intbox[0], intbox[1] - 10), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), thickness=text_thickness) if scores is not None: text = '{:.2f}'.format(float(scores[i])) cv2.putText( im, text, (intbox[0], intbox[1] + 10), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 255, 255), thickness=text_thickness) return im ================================================ FILE: ppdet/modeling/necks/__init__.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import fpn from . import yolo_fpn from . import hrfpn from . import ttf_fpn from . import centernet_fpn from . import bifpn from . import csp_pan from . import es_pan from . import lc_pan from . import custom_pan from . import dilated_encoder from . 
import clrnet_fpn from .fpn import * from .yolo_fpn import * from .hrfpn import * from .ttf_fpn import * from .centernet_fpn import * from .blazeface_fpn import * from .bifpn import * from .csp_pan import * from .es_pan import * from .lc_pan import * from .custom_pan import * from .dilated_encoder import * from .channel_mapper import * from .clrnet_fpn import * ================================================ FILE: ppdet/modeling/necks/bifpn.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Constant from ppdet.core.workspace import register, serializable from ppdet.modeling.layers import ConvNormLayer from ..shape_spec import ShapeSpec __all__ = ['BiFPN'] class SeparableConvLayer(nn.Layer): def __init__(self, in_channels, out_channels=None, kernel_size=3, norm_type='bn', norm_groups=32, act='swish'): super(SeparableConvLayer, self).__init__() assert norm_type in ['bn', 'sync_bn', 'gn', None] assert act in ['swish', 'relu', None] self.in_channels = in_channels if out_channels is None: self.out_channels = self.in_channels self.norm_type = norm_type self.norm_groups = norm_groups self.depthwise_conv = nn.Conv2D( in_channels, in_channels, kernel_size, padding=kernel_size // 2, groups=in_channels, bias_attr=False) self.pointwise_conv = nn.Conv2D(in_channels, self.out_channels, 1) # norm type if self.norm_type in ['bn', 'sync_bn']: self.norm = nn.BatchNorm2D(self.out_channels) elif self.norm_type == 'gn': self.norm = nn.GroupNorm( num_groups=self.norm_groups, num_channels=self.out_channels) # activation if act == 'swish': self.act = nn.Swish() elif act == 'relu': self.act = nn.ReLU() def forward(self, x): if self.act is not None: x = self.act(x) out = self.depthwise_conv(x) out = self.pointwise_conv(out) if self.norm_type is not None: out = self.norm(out) return out class BiFPNCell(nn.Layer): def __init__(self, channels=256, num_levels=5, eps=1e-5, use_weighted_fusion=True, kernel_size=3, norm_type='bn', norm_groups=32, act='swish'): super(BiFPNCell, self).__init__() self.channels = channels self.num_levels = num_levels self.eps = eps self.use_weighted_fusion = use_weighted_fusion # up self.conv_up = nn.LayerList([ SeparableConvLayer( self.channels, kernel_size=kernel_size, norm_type=norm_type, norm_groups=norm_groups, act=act) for _ in range(self.num_levels - 1) ]) # down self.conv_down = nn.LayerList([ SeparableConvLayer( self.channels, kernel_size=kernel_size, norm_type=norm_type, norm_groups=norm_groups, act=act) for _ in range(self.num_levels - 1) ]) if self.use_weighted_fusion: self.up_weights = self.create_parameter( shape=[self.num_levels - 1, 2], attr=ParamAttr(initializer=Constant(1.))) self.down_weights = self.create_parameter( shape=[self.num_levels - 1, 3], attr=ParamAttr(initializer=Constant(1.))) def _feature_fusion_cell(self, conv_layer, lateral_feat, 
sampling_feat, route_feat=None, weights=None): if self.use_weighted_fusion: weights = F.relu(weights) weights = weights / (weights.sum() + self.eps) if route_feat is not None: out_feat = weights[0] * lateral_feat + \ weights[1] * sampling_feat + \ weights[2] * route_feat else: out_feat = weights[0] * lateral_feat + \ weights[1] * sampling_feat else: if route_feat is not None: out_feat = lateral_feat + sampling_feat + route_feat else: out_feat = lateral_feat + sampling_feat out_feat = conv_layer(out_feat) return out_feat def forward(self, feats): # feats: [P3 - P7] lateral_feats = [] # up up_feature = feats[-1] for i, feature in enumerate(feats[::-1]): if i == 0: lateral_feats.append(feature) else: shape = feature.shape up_feature = F.interpolate( up_feature, size=[shape[2], shape[3]]) lateral_feature = self._feature_fusion_cell( self.conv_up[i - 1], feature, up_feature, weights=self.up_weights[i - 1] if self.use_weighted_fusion else None) lateral_feats.append(lateral_feature) up_feature = lateral_feature out_feats = [] # down down_feature = lateral_feats[-1] for i, (lateral_feature, route_feature) in enumerate(zip(lateral_feats[::-1], feats)): if i == 0: out_feats.append(lateral_feature) else: down_feature = F.max_pool2d(down_feature, 3, 2, 1) if i == len(feats) - 1: route_feature = None weights = self.down_weights[ i - 1][:2] if self.use_weighted_fusion else None else: weights = self.down_weights[ i - 1] if self.use_weighted_fusion else None out_feature = self._feature_fusion_cell( self.conv_down[i - 1], lateral_feature, down_feature, route_feature, weights=weights) out_feats.append(out_feature) down_feature = out_feature return out_feats @register @serializable class BiFPN(nn.Layer): """ Bidirectional Feature Pyramid Network, see https://arxiv.org/abs/1911.09070 Args: in_channels (list[int]): input channels of each level which can be derived from the output shape of backbone by from_config. out_channel (int): output channel of each level. num_extra_levels (int): the number of extra stages added to the last level. default: 2 fpn_strides (List): The stride of each level. num_stacks (int): the number of stacks for BiFPN, default: 1. use_weighted_fusion (bool): use weighted feature fusion in BiFPN, default: True. norm_type (string|None): the normalization type in BiFPN module. If norm_type is None, norm will not be used after conv and if norm_type is string, bn, gn, sync_bn are available. default: bn. norm_groups (int): if you use gn, set this param. act (string|None): the activation function of BiFPN. """ def __init__(self, in_channels=(512, 1024, 2048), out_channel=256, num_extra_levels=2, fpn_strides=[8, 16, 32, 64, 128], num_stacks=1, use_weighted_fusion=True, norm_type='bn', norm_groups=32, act='swish'): super(BiFPN, self).__init__() assert num_stacks > 0, "The number of stacks of BiFPN is at least 1." assert norm_type in ['bn', 'sync_bn', 'gn', None] assert act in ['swish', 'relu', None] assert num_extra_levels >= 0, \ "The `num_extra_levels` must be non negative(>=0)." 
        self.in_channels = in_channels
        self.out_channel = out_channel
        self.num_extra_levels = num_extra_levels
        self.num_stacks = num_stacks
        self.use_weighted_fusion = use_weighted_fusion
        self.norm_type = norm_type
        self.norm_groups = norm_groups
        self.act = act
        self.num_levels = len(self.in_channels) + self.num_extra_levels
        if len(fpn_strides) != self.num_levels:
            for i in range(self.num_extra_levels):
                # rebind rather than `+=` so the default `fpn_strides` list
                # is not mutated across instantiations
                fpn_strides = fpn_strides + [fpn_strides[-1] * 2]
        self.fpn_strides = fpn_strides

        self.lateral_convs = nn.LayerList()
        for in_c in in_channels:
            self.lateral_convs.append(
                ConvNormLayer(in_c, self.out_channel, 1, 1))
        if self.num_extra_levels > 0:
            self.extra_convs = nn.LayerList()
            for i in range(self.num_extra_levels):
                if i == 0:
                    self.extra_convs.append(
                        ConvNormLayer(self.in_channels[-1], self.out_channel,
                                      3, 2))
                else:
                    self.extra_convs.append(nn.MaxPool2D(3, 2, 1))
        self.bifpn_cells = nn.LayerList()
        for i in range(self.num_stacks):
            self.bifpn_cells.append(
                BiFPNCell(
                    self.out_channel,
                    self.num_levels,
                    use_weighted_fusion=self.use_weighted_fusion,
                    norm_type=self.norm_type,
                    norm_groups=self.norm_groups,
                    act=self.act))

    @classmethod
    def from_config(cls, cfg, input_shape):
        return {
            'in_channels': [i.channels for i in input_shape],
            'fpn_strides': [i.stride for i in input_shape]
        }

    @property
    def out_shape(self):
        return [
            ShapeSpec(
                channels=self.out_channel, stride=s) for s in self.fpn_strides
        ]

    def forward(self, feats):
        assert len(feats) == len(self.in_channels)
        fpn_feats = []
        for conv_layer, feature in zip(self.lateral_convs, feats):
            fpn_feats.append(conv_layer(feature))
        if self.num_extra_levels > 0:
            feat = feats[-1]
            for conv_layer in self.extra_convs:
                feat = conv_layer(feat)
                fpn_feats.append(feat)
        for bifpn_cell in self.bifpn_cells:
            fpn_feats = bifpn_cell(fpn_feats)
        return fpn_feats


================================================
FILE: ppdet/modeling/necks/blazeface_fpn.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn.functional as F
from paddle import ParamAttr
import paddle.nn as nn
from paddle.nn.initializer import KaimingNormal
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec

__all__ = ['BlazeNeck']


def hard_swish(x):
    return x * F.relu6(x + 3) / 6.
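# Illustrative sanity check (a sketch, assuming Paddle's built-in
# F.hardswish computes the same x * relu6(x + 3) / 6 curve as the helper
# above; uses only the imports already at the top of this file):
#
#     x = paddle.linspace(-6., 6., 25)
#     assert float(paddle.abs(hard_swish(x) - F.hardswish(x)).max()) < 1e-6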
class ConvBNLayer(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size, stride, padding, num_groups=1, act='relu', conv_lr=0.1, conv_decay=0., norm_decay=0., norm_type='bn', name=None): super(ConvBNLayer, self).__init__() self.act = act self._conv = nn.Conv2D( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=num_groups, weight_attr=ParamAttr( learning_rate=conv_lr, initializer=KaimingNormal()), bias_attr=False) if norm_type in ['sync_bn', 'bn']: self._batch_norm = nn.BatchNorm2D(out_channels) def forward(self, x): x = self._conv(x) x = self._batch_norm(x) if self.act == "relu": x = F.relu(x) elif self.act == "relu6": x = F.relu6(x) elif self.act == 'leaky': x = F.leaky_relu(x) elif self.act == 'hard_swish': x = hard_swish(x) return x class FPN(nn.Layer): def __init__(self, in_channels, out_channels, name=None): super(FPN, self).__init__() self.conv1_fpn = ConvBNLayer( in_channels, out_channels // 2, kernel_size=1, padding=0, stride=1, act='leaky', name=name + '_output1') self.conv2_fpn = ConvBNLayer( in_channels, out_channels // 2, kernel_size=1, padding=0, stride=1, act='leaky', name=name + '_output2') self.conv3_fpn = ConvBNLayer( out_channels // 2, out_channels // 2, kernel_size=3, padding=1, stride=1, act='leaky', name=name + '_merge') def forward(self, input): output1 = self.conv1_fpn(input[0]) output2 = self.conv2_fpn(input[1]) up2 = F.upsample( output2, size=output1.shape[-2:], mode='nearest') output1 = paddle.add(output1, up2) output1 = self.conv3_fpn(output1) return output1, output2 class SSH(nn.Layer): def __init__(self, in_channels, out_channels, name=None): super(SSH, self).__init__() assert out_channels % 4 == 0 self.conv0_ssh = ConvBNLayer( in_channels, out_channels // 2, kernel_size=3, padding=1, stride=1, act=None, name=name + 'ssh_conv3') self.conv1_ssh = ConvBNLayer( out_channels // 2, out_channels // 4, kernel_size=3, padding=1, stride=1, act='leaky', name=name + 'ssh_conv5_1') self.conv2_ssh = ConvBNLayer( out_channels // 4, out_channels // 4, kernel_size=3, padding=1, stride=1, act=None, name=name + 'ssh_conv5_2') self.conv3_ssh = ConvBNLayer( out_channels // 4, out_channels // 4, kernel_size=3, padding=1, stride=1, act='leaky', name=name + 'ssh_conv7_1') self.conv4_ssh = ConvBNLayer( out_channels // 4, out_channels // 4, kernel_size=3, padding=1, stride=1, act=None, name=name + 'ssh_conv7_2') def forward(self, x): conv0 = self.conv0_ssh(x) conv1 = self.conv1_ssh(conv0) conv2 = self.conv2_ssh(conv1) conv3 = self.conv3_ssh(conv2) conv4 = self.conv4_ssh(conv3) concat = paddle.concat([conv0, conv2, conv4], axis=1) return F.relu(concat) @register @serializable class BlazeNeck(nn.Layer): def __init__(self, in_channel, neck_type="None", data_format='NCHW'): super(BlazeNeck, self).__init__() self.neck_type = neck_type self.reture_input = False self._out_channels = in_channel if self.neck_type == 'None': self.reture_input = True if "fpn" in self.neck_type: self.fpn = FPN(self._out_channels[0], self._out_channels[1], name='fpn') self._out_channels = [ self._out_channels[0] // 2, self._out_channels[1] // 2 ] if "ssh" in self.neck_type: self.ssh1 = SSH(self._out_channels[0], self._out_channels[0], name='ssh1') self.ssh2 = SSH(self._out_channels[1], self._out_channels[1], name='ssh2') self._out_channels = [self._out_channels[0], self._out_channels[1]] def forward(self, inputs): if self.reture_input: return inputs output1, output2 = None, None if "fpn" in self.neck_type: backout_4, backout_1 = inputs output1, output2 = 
self.fpn([backout_4, backout_1]) if self.neck_type == "only_fpn": return [output1, output2] if self.neck_type == "only_ssh": output1, output2 = inputs feature1 = self.ssh1(output1) feature2 = self.ssh2(output2) return [feature1, feature2] @property def out_shape(self): return [ ShapeSpec(channels=c) for c in [self._out_channels[0], self._out_channels[1]] ] ================================================ FILE: ppdet/modeling/necks/centernet_fpn.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import math import paddle import paddle.nn as nn from paddle import ParamAttr from paddle.nn.initializer import Uniform import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.modeling.layers import ConvNormLayer from ppdet.modeling.backbones.hardnet import ConvLayer, HarDBlock from ..shape_spec import ShapeSpec __all__ = ['CenterNetDLAFPN', 'CenterNetHarDNetFPN'] # SGE attention class BasicConv(nn.Layer): def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=True, bias_attr=False): super(BasicConv, self).__init__() self.out_channels = out_planes self.conv = nn.Conv2D( in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias_attr=bias_attr) self.bn = nn.BatchNorm2D( out_planes, epsilon=1e-5, momentum=0.01, weight_attr=False, bias_attr=False) if bn else None self.relu = nn.ReLU() if relu else None def forward(self, x): x = self.conv(x) if self.bn is not None: x = self.bn(x) if self.relu is not None: x = self.relu(x) return x class ChannelPool(nn.Layer): def forward(self, x): return paddle.concat( (paddle.max(x, 1).unsqueeze(1), paddle.mean(x, 1).unsqueeze(1)), axis=1) class SpatialGate(nn.Layer): def __init__(self): super(SpatialGate, self).__init__() kernel_size = 7 self.compress = ChannelPool() self.spatial = BasicConv( 2, 1, kernel_size, stride=1, padding=(kernel_size - 1) // 2, relu=False) def forward(self, x): x_compress = self.compress(x) x_out = self.spatial(x_compress) scale = F.sigmoid(x_out) # broadcasting return x * scale def fill_up_weights(up): weight = up.weight.numpy() f = math.ceil(weight.shape[2] / 2) c = (2 * f - 1 - f % 2) / (2. * f) for i in range(weight.shape[2]): for j in range(weight.shape[3]): weight[0, 0, i, j] = \ (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) for c in range(1, weight.shape[0]): weight[c, 0, :, :] = weight[0, 0, :, :] up.weight.set_value(weight) class IDAUp(nn.Layer): def __init__(self, ch_ins, ch_out, up_strides, dcn_v2=True): super(IDAUp, self).__init__() for i in range(1, len(ch_ins)): ch_in = ch_ins[i] up_s = int(up_strides[i]) fan_in = ch_in * 3 * 3 stdv = 1. 
/ math.sqrt(fan_in) proj = nn.Sequential( ConvNormLayer( ch_in, ch_out, filter_size=3, stride=1, use_dcn=dcn_v2, bias_on=dcn_v2, norm_decay=None, dcn_lr_scale=1., dcn_regularizer=None, initializer=Uniform(-stdv, stdv)), nn.ReLU()) node = nn.Sequential( ConvNormLayer( ch_out, ch_out, filter_size=3, stride=1, use_dcn=dcn_v2, bias_on=dcn_v2, norm_decay=None, dcn_lr_scale=1., dcn_regularizer=None, initializer=Uniform(-stdv, stdv)), nn.ReLU()) kernel_size = up_s * 2 fan_in = ch_out * kernel_size * kernel_size stdv = 1. / math.sqrt(fan_in) up = nn.Conv2DTranspose( ch_out, ch_out, kernel_size=up_s * 2, stride=up_s, padding=up_s // 2, groups=ch_out, weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), bias_attr=False) fill_up_weights(up) setattr(self, 'proj_' + str(i), proj) setattr(self, 'up_' + str(i), up) setattr(self, 'node_' + str(i), node) def forward(self, inputs, start_level, end_level): for i in range(start_level + 1, end_level): upsample = getattr(self, 'up_' + str(i - start_level)) project = getattr(self, 'proj_' + str(i - start_level)) inputs[i] = project(inputs[i]) inputs[i] = upsample(inputs[i]) node = getattr(self, 'node_' + str(i - start_level)) inputs[i] = node(paddle.add(inputs[i], inputs[i - 1])) return inputs class DLAUp(nn.Layer): def __init__(self, start_level, channels, scales, ch_in=None, dcn_v2=True): super(DLAUp, self).__init__() self.start_level = start_level if ch_in is None: ch_in = channels self.channels = channels channels = list(channels) scales = np.array(scales, dtype=int) for i in range(len(channels) - 1): j = -i - 2 setattr( self, 'ida_{}'.format(i), IDAUp( ch_in[j:], channels[j], scales[j:] // scales[j], dcn_v2=dcn_v2)) scales[j + 1:] = scales[j] ch_in[j + 1:] = [channels[j] for _ in channels[j + 1:]] def forward(self, inputs): out = [inputs[-1]] # start with 32 for i in range(len(inputs) - self.start_level - 1): ida = getattr(self, 'ida_{}'.format(i)) outputs = ida(inputs, len(inputs) - i - 2, len(inputs)) out.insert(0, outputs[-1]) return out @register @serializable class CenterNetDLAFPN(nn.Layer): """ Args: in_channels (list): number of input feature channels from backbone. [16, 32, 64, 128, 256, 512] by default, means the channels of DLA-34 down_ratio (int): the down ratio from images to heatmap, 4 by default last_level (int): the last level of input feature fed into the upsamplng block out_channel (int): the channel of the output feature, 0 by default means the channel of the input feature whose down ratio is `down_ratio` first_level (None): the first level of input feature fed into the upsamplng block. 
if None, the first level stands for logs(down_ratio) dcn_v2 (bool): whether use the DCNv2, True by default with_sge (bool): whether use SGE attention, False by default """ def __init__(self, in_channels, down_ratio=4, last_level=5, out_channel=0, first_level=None, dcn_v2=True, with_sge=False): super(CenterNetDLAFPN, self).__init__() self.first_level = int(np.log2( down_ratio)) if first_level is None else first_level assert self.first_level >= 0, "first level in CenterNetDLAFPN should be greater or equal to 0, but received {}".format( self.first_level) self.down_ratio = down_ratio self.last_level = last_level scales = [2**i for i in range(len(in_channels[self.first_level:]))] self.dla_up = DLAUp( self.first_level, in_channels[self.first_level:], scales, dcn_v2=dcn_v2) self.out_channel = out_channel if out_channel == 0: self.out_channel = in_channels[self.first_level] self.ida_up = IDAUp( in_channels[self.first_level:self.last_level], self.out_channel, [2**i for i in range(self.last_level - self.first_level)], dcn_v2=dcn_v2) self.with_sge = with_sge if self.with_sge: self.sge_attention = SpatialGate() @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape]} def forward(self, body_feats): inputs = [body_feats[i] for i in range(len(body_feats))] dla_up_feats = self.dla_up(inputs) ida_up_feats = [] for i in range(self.last_level - self.first_level): ida_up_feats.append(dla_up_feats[i].clone()) self.ida_up(ida_up_feats, 0, len(ida_up_feats)) feat = ida_up_feats[-1] if self.with_sge: feat = self.sge_attention(feat) if self.down_ratio != 4: feat = F.interpolate( feat, scale_factor=self.down_ratio // 4, mode="bilinear", align_corners=True) return feat @property def out_shape(self): return [ShapeSpec(channels=self.out_channel, stride=self.down_ratio)] class TransitionUp(nn.Layer): def __init__(self, in_channels, out_channels): super().__init__() def forward(self, x, skip): w, h = skip.shape[2], skip.shape[3] out = F.interpolate(x, size=(w, h), mode="bilinear", align_corners=True) out = paddle.concat([out, skip], 1) return out @register @serializable class CenterNetHarDNetFPN(nn.Layer): """ Args: in_channels (list): number of input feature channels from backbone. [96, 214, 458, 784] by default, means the channels of HarDNet85 num_layers (int): HarDNet laters, 85 by default down_ratio (int): the down ratio from images to heatmap, 4 by default first_level (int|None): the first level of input feature fed into the upsamplng block. 
if None, the first level stands for logs(down_ratio) - 1 last_level (int): the last level of input feature fed into the upsamplng block out_channel (int): the channel of the output feature, 0 by default means the channel of the input feature whose down ratio is `down_ratio` """ def __init__(self, in_channels, num_layers=85, down_ratio=4, first_level=None, last_level=4, out_channel=0): super(CenterNetHarDNetFPN, self).__init__() self.first_level = int(np.log2( down_ratio)) - 1 if first_level is None else first_level assert self.first_level >= 0, "first level in CenterNetDLAFPN should be greater or equal to 0, but received {}".format( self.first_level) self.down_ratio = down_ratio self.last_level = last_level self.last_pool = nn.AvgPool2D(kernel_size=2, stride=2) assert num_layers in [68, 85], "HarDNet-{} not support.".format( num_layers) if num_layers == 85: self.last_proj = ConvLayer(784, 256, kernel_size=1) self.last_blk = HarDBlock(768, 80, 1.7, 8) self.skip_nodes = [1, 3, 8, 13] self.SC = [32, 32, 0] gr = [64, 48, 28] layers = [8, 8, 4] ch_list2 = [224 + self.SC[0], 160 + self.SC[1], 96 + self.SC[2]] channels = [96, 214, 458, 784] self.skip_lv = 3 elif num_layers == 68: self.last_proj = ConvLayer(654, 192, kernel_size=1) self.last_blk = HarDBlock(576, 72, 1.7, 8) self.skip_nodes = [1, 3, 8, 11] self.SC = [32, 32, 0] gr = [48, 32, 20] layers = [8, 8, 4] ch_list2 = [224 + self.SC[0], 96 + self.SC[1], 64 + self.SC[2]] channels = [64, 124, 328, 654] self.skip_lv = 2 self.transUpBlocks = nn.LayerList([]) self.denseBlocksUp = nn.LayerList([]) self.conv1x1_up = nn.LayerList([]) self.avg9x9 = nn.AvgPool2D(kernel_size=(9, 9), stride=1, padding=(4, 4)) prev_ch = self.last_blk.get_out_ch() for i in range(3): skip_ch = channels[3 - i] self.transUpBlocks.append(TransitionUp(prev_ch, prev_ch)) if i < self.skip_lv: cur_ch = prev_ch + skip_ch else: cur_ch = prev_ch self.conv1x1_up.append( ConvLayer( cur_ch, ch_list2[i], kernel_size=1)) cur_ch = ch_list2[i] cur_ch -= self.SC[i] cur_ch *= 3 blk = HarDBlock(cur_ch, gr[i], 1.7, layers[i]) self.denseBlocksUp.append(blk) prev_ch = blk.get_out_ch() prev_ch += self.SC[0] + self.SC[1] + self.SC[2] self.out_channel = prev_ch @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape]} def forward(self, body_feats): x = body_feats[-1] x_sc = [] x = self.last_proj(x) x = self.last_pool(x) x2 = self.avg9x9(x) x3 = x / (x.sum((2, 3), keepdim=True) + 0.1) x = paddle.concat([x, x2, x3], 1) x = self.last_blk(x) for i in range(3): skip_x = body_feats[3 - i] x_up = self.transUpBlocks[i](x, skip_x) x_ch = self.conv1x1_up[i](x_up) if self.SC[i] > 0: end = x_ch.shape[1] new_st = end - self.SC[i] x_sc.append(x_ch[:, new_st:, :, :]) x_ch = x_ch[:, :new_st, :, :] x2 = self.avg9x9(x_ch) x3 = x_ch / (x_ch.sum((2, 3), keepdim=True) + 0.1) x_new = paddle.concat([x_ch, x2, x3], 1) x = self.denseBlocksUp[i](x_new) scs = [x] for i in range(3): if self.SC[i] > 0: scs.insert( 0, F.interpolate( x_sc[i], size=(x.shape[2], x.shape[3]), mode="bilinear", align_corners=True)) neck_feat = paddle.concat(scs, 1) return neck_feat @property def out_shape(self): return [ShapeSpec(channels=self.out_channel, stride=self.down_ratio)] ================================================ FILE: ppdet/modeling/necks/channel_mapper.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
this code is based on mmdet:
git@github.com:open-mmlab/mmdetection.git
"""

import paddle.nn as nn
from ppdet.core.workspace import register, serializable
from ..backbones.hrnet import ConvNormLayer
from ..shape_spec import ShapeSpec
from ..initializer import xavier_uniform_, constant_

__all__ = ['ChannelMapper']


@register
@serializable
class ChannelMapper(nn.Layer):
    """Channel Mapper to reduce/increase channels of backbone features.

    Args:
        in_channels (List[int]): Number of input channels per scale.
        out_channels (int): Number of output channels (used at each scale).
        kernel_size (int, optional): kernel_size for reducing channels (used
            at each scale). Default: 3.
        norm_type (str, optional): normalization type of ConvNormLayer.
            Default: "gn".
        norm_groups (int, optional): number of groups for group norm.
            Default: 32.
        act (str, optional): activation applied after each conv.
            Default: 'relu'.
        num_outs (int, optional): Number of output feature maps. There would
            be extra_convs when num_outs is larger than the length of
            in_channels.
        init_cfg (dict, optional): Initialization config dict, kept for mmdet
            compatibility; weights are actually set in init_weights.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 norm_type="gn",
                 norm_groups=32,
                 act='relu',
                 num_outs=None,
                 init_cfg=dict(
                     type='Xavier', layer='Conv2d', distribution='uniform')):
        super(ChannelMapper, self).__init__()
        assert isinstance(in_channels, list)
        self.extra_convs = None
        if num_outs is None:
            num_outs = len(in_channels)
        self.out_channel = out_channels
        self.num_outs = num_outs
        self.convs = nn.LayerList()
        for in_channel in in_channels:
            self.convs.append(
                ConvNormLayer(
                    ch_in=in_channel,
                    ch_out=out_channels,
                    filter_size=kernel_size,
                    norm_type='gn',
                    norm_groups=32,
                    act=act))

        if num_outs > len(in_channels):
            self.extra_convs = nn.LayerList()
            for i in range(len(in_channels), num_outs):
                if i == len(in_channels):
                    in_channel = in_channels[-1]
                else:
                    in_channel = out_channels
                self.extra_convs.append(
                    ConvNormLayer(
                        ch_in=in_channel,
                        ch_out=out_channels,
                        filter_size=3,
                        stride=2,
                        norm_type='gn',
                        norm_groups=32,
                        act=act))
        self.init_weights()

    def forward(self, inputs):
        """Forward function."""
        assert len(inputs) == len(self.convs)
        outs = [self.convs[i](inputs[i]) for i in range(len(inputs))]
        if self.extra_convs:
            for i in range(len(self.extra_convs)):
                if i == 0:
                    outs.append(self.extra_convs[0](inputs[-1]))
                else:
                    outs.append(self.extra_convs[i](outs[-1]))
        return tuple(outs)

    @property
    def out_shape(self):
        # strides depend on the backbone's output shapes, so only the
        # channel count is reported here
        return [
            ShapeSpec(channels=self.out_channel) for _ in range(self.num_outs)
        ]

    def init_weights(self):
        """Initialize the transformer weights."""
        for p in self.parameters():
            if p.rank() > 1:
                xavier_uniform_(p)
                if hasattr(p, 'bias') and p.bias is not None:
                    constant_(p.bias)


================================================
FILE: ppdet/modeling/necks/clrnet_fpn.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import XavierUniform from ppdet.modeling.initializer import kaiming_normal_, constant_ from ppdet.core.workspace import register, serializable from ppdet.modeling.layers import ConvNormLayer from ppdet.modeling.shape_spec import ShapeSpec __all__ = ['CLRFPN'] @register @serializable class CLRFPN(nn.Layer): """ Feature Pyramid Network, see https://arxiv.org/abs/1612.03144 Args: in_channels (list[int]): input channels of each level which can be derived from the output shape of backbone by from_config out_channel (int): output channel of each level spatial_scales (list[float]): the spatial scales between input feature maps and original input image which can be derived from the output shape of backbone by from_config has_extra_convs (bool): whether to add extra conv to the last level. default False extra_stage (int): the number of extra stages added to the last level. default 1 use_c5 (bool): Whether to use c5 as the input of extra stage, otherwise p5 is used. default True norm_type (string|None): The normalization type in FPN module. If norm_type is None, norm will not be used after conv and if norm_type is string, bn, gn, sync_bn are available. default None norm_decay (float): weight decay for normalization layer weights. default 0. freeze_norm (bool): whether to freeze normalization layer. default False relu_before_extra_convs (bool): whether to add relu before extra convs. default False """ def __init__(self, in_channels, out_channel, spatial_scales=[0.25, 0.125, 0.0625, 0.03125], has_extra_convs=False, extra_stage=1, use_c5=True, norm_type=None, norm_decay=0., freeze_norm=False, relu_before_extra_convs=True): super(CLRFPN, self).__init__() self.out_channel = out_channel for s in range(extra_stage): spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
self.spatial_scales = spatial_scales self.has_extra_convs = has_extra_convs self.extra_stage = extra_stage self.use_c5 = use_c5 self.relu_before_extra_convs = relu_before_extra_convs self.norm_type = norm_type self.norm_decay = norm_decay self.freeze_norm = freeze_norm self.in_channels = in_channels self.lateral_convs = [] self.fpn_convs = [] fan = out_channel * 3 * 3 # stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone # 0 <= st_stage < ed_stage <= 3 st_stage = 4 - len(in_channels) ed_stage = st_stage + len(in_channels) - 1 for i in range(st_stage, ed_stage + 1): # if i == 3: # lateral_name = 'fpn_inner_res5_sum' # else: # lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2) lateral_name = "lateral_convs.{}.conv".format(i - 1) in_c = in_channels[i - st_stage] if self.norm_type is not None: lateral = self.add_sublayer( lateral_name, ConvNormLayer( ch_in=in_c, ch_out=out_channel, filter_size=1, stride=1, norm_type=self.norm_type, norm_decay=self.norm_decay, freeze_norm=self.freeze_norm, initializer=XavierUniform(fan_out=in_c))) else: lateral = self.add_sublayer( lateral_name, nn.Conv2D( in_channels=in_c, out_channels=out_channel, kernel_size=1, weight_attr=ParamAttr( initializer=XavierUniform(fan_out=in_c)))) self.lateral_convs.append(lateral) fpn_name = "fpn_convs.{}.conv".format(i - 1) if self.norm_type is not None: fpn_conv = self.add_sublayer( fpn_name, ConvNormLayer( ch_in=out_channel, ch_out=out_channel, filter_size=3, stride=1, norm_type=self.norm_type, norm_decay=self.norm_decay, freeze_norm=self.freeze_norm, initializer=XavierUniform(fan_out=fan))) else: fpn_conv = self.add_sublayer( fpn_name, nn.Conv2D( in_channels=out_channel, out_channels=out_channel, kernel_size=3, padding=1, weight_attr=ParamAttr( initializer=XavierUniform(fan_out=fan)))) self.fpn_convs.append(fpn_conv) # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) if self.has_extra_convs: for i in range(self.extra_stage): lvl = ed_stage + 1 + i if i == 0 and self.use_c5: in_c = in_channels[-1] else: in_c = out_channel extra_fpn_name = 'fpn_{}'.format(lvl + 2) if self.norm_type is not None: extra_fpn_conv = self.add_sublayer( extra_fpn_name, ConvNormLayer( ch_in=in_c, ch_out=out_channel, filter_size=3, stride=2, norm_type=self.norm_type, norm_decay=self.norm_decay, freeze_norm=self.freeze_norm, initializer=XavierUniform(fan_out=fan))) else: extra_fpn_conv = self.add_sublayer( extra_fpn_name, nn.Conv2D( in_channels=in_c, out_channels=out_channel, kernel_size=3, stride=2, padding=1, weight_attr=ParamAttr( initializer=XavierUniform(fan_out=fan)))) self.fpn_convs.append(extra_fpn_conv) self.init_weights() def init_weights(self): for m in self.lateral_convs: if isinstance(m, (nn.Conv1D, nn.Conv2D)): kaiming_normal_( m.weight, a=0, mode='fan_out', nonlinearity='relu') if m.bias is not None: constant_(m.bias, value=0.) elif isinstance(m, (nn.BatchNorm1D, nn.BatchNorm2D)): constant_(m.weight, value=1) constant_(m.bias, value=0) for m in self.fpn_convs: if isinstance(m, (nn.Conv1D, nn.Conv2D)): kaiming_normal_( m.weight, a=0, mode='fan_out', nonlinearity='relu') if m.bias is not None: constant_(m.bias, value=0.) 
elif isinstance(m, (nn.BatchNorm1D, nn.BatchNorm2D)): constant_(m.weight, value=1) constant_(m.bias, value=0) @classmethod def from_config(cls, cfg, input_shape): return {} def forward(self, body_feats): laterals = [] if len(body_feats) > len(self.in_channels): for _ in range(len(body_feats) - len(self.in_channels)): del body_feats[0] num_levels = len(body_feats) # print("body_feats",num_levels) for i in range(num_levels): laterals.append(self.lateral_convs[i](body_feats[i])) for i in range(1, num_levels): lvl = num_levels - i upsample = F.interpolate( laterals[lvl], scale_factor=2., mode='nearest', ) laterals[lvl - 1] += upsample fpn_output = [] for lvl in range(num_levels): fpn_output.append(self.fpn_convs[lvl](laterals[lvl])) if self.extra_stage > 0: # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN) if not self.has_extra_convs: assert self.extra_stage == 1, 'extra_stage should be 1 if FPN has not extra convs' fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2)) # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) else: if self.use_c5: extra_source = body_feats[-1] else: extra_source = fpn_output[-1] fpn_output.append(self.fpn_convs[num_levels](extra_source)) for i in range(1, self.extra_stage): if self.relu_before_extra_convs: fpn_output.append(self.fpn_convs[num_levels + i](F.relu( fpn_output[-1]))) else: fpn_output.append(self.fpn_convs[num_levels + i]( fpn_output[-1])) return fpn_output @property def out_shape(self): return [ ShapeSpec( channels=self.out_channel, stride=1. / s) for s in self.spatial_scales ] ================================================ FILE: ppdet/modeling/necks/csp_pan.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The code is based on: # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/necks/yolox_pafpn.py import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec __all__ = ['CSPPAN'] class ConvBNLayer(nn.Layer): def __init__(self, in_channel=96, out_channel=96, kernel_size=3, stride=1, groups=1, act='leaky_relu'): super(ConvBNLayer, self).__init__() initializer = nn.initializer.KaimingUniform() self.conv = nn.Conv2D( in_channels=in_channel, out_channels=out_channel, kernel_size=kernel_size, groups=groups, padding=(kernel_size - 1) // 2, stride=stride, weight_attr=ParamAttr(initializer=initializer), bias_attr=False) self.bn = nn.BatchNorm2D(out_channel) if act == "hard_swish": act = 'hardswish' self.act = act def forward(self, x): x = self.bn(self.conv(x)) if self.act: x = getattr(F, self.act)(x) return x class DPModule(nn.Layer): """ Depth-wise and point-wise module. Args: in_channel (int): The input channels of this Module. out_channel (int): The output channels of this Module. kernel_size (int): The conv2d kernel size of this Module. 
stride (int): The conv2d's stride of this Module. act (str): The activation function of this Module, Now support `leaky_relu` and `hard_swish`. """ def __init__(self, in_channel=96, out_channel=96, kernel_size=3, stride=1, act='leaky_relu', use_act_in_out=True): super(DPModule, self).__init__() initializer = nn.initializer.KaimingUniform() self.use_act_in_out = use_act_in_out self.dwconv = nn.Conv2D( in_channels=in_channel, out_channels=out_channel, kernel_size=kernel_size, groups=out_channel, padding=(kernel_size - 1) // 2, stride=stride, weight_attr=ParamAttr(initializer=initializer), bias_attr=False) self.bn1 = nn.BatchNorm2D(out_channel) self.pwconv = nn.Conv2D( in_channels=out_channel, out_channels=out_channel, kernel_size=1, groups=1, padding=0, weight_attr=ParamAttr(initializer=initializer), bias_attr=False) self.bn2 = nn.BatchNorm2D(out_channel) if act == "hard_swish": act = 'hardswish' self.act = act def forward(self, x): x = self.bn1(self.dwconv(x)) if self.act: x = getattr(F, self.act)(x) x = self.bn2(self.pwconv(x)) if self.use_act_in_out and self.act: x = getattr(F, self.act)(x) return x class DarknetBottleneck(nn.Layer): """The basic bottleneck block used in Darknet. Each Block consists of two ConvModules and the input is added to the final output. Each ConvModule is composed of Conv, BN, and act. The first convLayer has filter size of 1x1 and the second one has the filter size of 3x3. Args: in_channels (int): The input channels of this Module. out_channels (int): The output channels of this Module. expansion (int): The kernel size of the convolution. Default: 0.5 add_identity (bool): Whether to add identity to the out. Default: True use_depthwise (bool): Whether to use depthwise separable convolution. Default: False """ def __init__(self, in_channels, out_channels, kernel_size=3, expansion=0.5, add_identity=True, use_depthwise=False, act="leaky_relu"): super(DarknetBottleneck, self).__init__() hidden_channels = int(out_channels * expansion) conv_func = DPModule if use_depthwise else ConvBNLayer self.conv1 = ConvBNLayer( in_channel=in_channels, out_channel=hidden_channels, kernel_size=1, act=act) self.conv2 = conv_func( in_channel=hidden_channels, out_channel=out_channels, kernel_size=kernel_size, stride=1, act=act) self.add_identity = \ add_identity and in_channels == out_channels def forward(self, x): identity = x out = self.conv1(x) out = self.conv2(out) if self.add_identity: return out + identity else: return out class CSPLayer(nn.Layer): """Cross Stage Partial Layer. Args: in_channels (int): The input channels of the CSP layer. out_channels (int): The output channels of the CSP layer. expand_ratio (float): Ratio to adjust the number of channels of the hidden layer. Default: 0.5 num_blocks (int): Number of blocks. Default: 1 add_identity (bool): Whether to add identity in blocks. Default: True use_depthwise (bool): Whether to depthwise separable convolution in blocks. 
Default: False """ def __init__(self, in_channels, out_channels, kernel_size=3, expand_ratio=0.5, num_blocks=1, add_identity=True, use_depthwise=False, act="leaky_relu"): super().__init__() mid_channels = int(out_channels * expand_ratio) self.main_conv = ConvBNLayer(in_channels, mid_channels, 1, act=act) self.short_conv = ConvBNLayer(in_channels, mid_channels, 1, act=act) self.final_conv = ConvBNLayer( 2 * mid_channels, out_channels, 1, act=act) self.blocks = nn.Sequential(* [ DarknetBottleneck( mid_channels, mid_channels, kernel_size, 1.0, add_identity, use_depthwise, act=act) for _ in range(num_blocks) ]) def forward(self, x): x_short = self.short_conv(x) x_main = self.main_conv(x) x_main = self.blocks(x_main) x_final = paddle.concat((x_main, x_short), axis=1) return self.final_conv(x_final) class Channel_T(nn.Layer): def __init__(self, in_channels=[116, 232, 464], out_channels=96, act="leaky_relu"): super(Channel_T, self).__init__() self.convs = nn.LayerList() for i in range(len(in_channels)): self.convs.append( ConvBNLayer( in_channels[i], out_channels, 1, act=act)) def forward(self, x): outs = [self.convs[i](x[i]) for i in range(len(x))] return outs @register @serializable class CSPPAN(nn.Layer): """Path Aggregation Network with CSP module. Args: in_channels (List[int]): Number of input channels per scale. out_channels (int): Number of output channels (used at each scale) kernel_size (int): The conv2d kernel size of this Module. num_features (int): Number of output features of CSPPAN module. num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1 use_depthwise (bool): Whether to depthwise separable convolution in blocks. Default: True """ def __init__(self, in_channels, out_channels, kernel_size=5, num_features=3, num_csp_blocks=1, use_depthwise=True, act='hard_swish', spatial_scales=[0.125, 0.0625, 0.03125]): super(CSPPAN, self).__init__() self.conv_t = Channel_T(in_channels, out_channels, act=act) in_channels = [out_channels] * len(spatial_scales) self.in_channels = in_channels self.out_channels = out_channels self.spatial_scales = spatial_scales self.num_features = num_features conv_func = DPModule if use_depthwise else ConvBNLayer if self.num_features == 4: self.first_top_conv = conv_func( in_channels[0], in_channels[0], kernel_size, stride=2, act=act) self.second_top_conv = conv_func( in_channels[0], in_channels[0], kernel_size, stride=2, act=act) self.spatial_scales.append(self.spatial_scales[-1] / 2) # build top-down blocks self.upsample = nn.Upsample(scale_factor=2, mode='nearest') self.top_down_blocks = nn.LayerList() for idx in range(len(in_channels) - 1, 0, -1): self.top_down_blocks.append( CSPLayer( in_channels[idx - 1] * 2, in_channels[idx - 1], kernel_size=kernel_size, num_blocks=num_csp_blocks, add_identity=False, use_depthwise=use_depthwise, act=act)) # build bottom-up blocks self.downsamples = nn.LayerList() self.bottom_up_blocks = nn.LayerList() for idx in range(len(in_channels) - 1): self.downsamples.append( conv_func( in_channels[idx], in_channels[idx], kernel_size=kernel_size, stride=2, act=act)) self.bottom_up_blocks.append( CSPLayer( in_channels[idx] * 2, in_channels[idx + 1], kernel_size=kernel_size, num_blocks=num_csp_blocks, add_identity=False, use_depthwise=use_depthwise, act=act)) def forward(self, inputs): """ Args: inputs (tuple[Tensor]): input features. Returns: tuple[Tensor]: CSPPAN features. 
""" assert len(inputs) == len(self.in_channels) inputs = self.conv_t(inputs) # top-down path inner_outs = [inputs[-1]] for idx in range(len(self.in_channels) - 1, 0, -1): feat_heigh = inner_outs[0] feat_low = inputs[idx - 1] upsample_feat = self.upsample(feat_heigh) inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( paddle.concat([upsample_feat, feat_low], 1)) inner_outs.insert(0, inner_out) # bottom-up path outs = [inner_outs[0]] for idx in range(len(self.in_channels) - 1): feat_low = outs[-1] feat_height = inner_outs[idx + 1] downsample_feat = self.downsamples[idx](feat_low) out = self.bottom_up_blocks[idx](paddle.concat( [downsample_feat, feat_height], 1)) outs.append(out) top_features = None if self.num_features == 4: top_features = self.first_top_conv(inputs[-1]) top_features = top_features + self.second_top_conv(outs[-1]) outs.append(top_features) return tuple(outs) @property def out_shape(self): return [ ShapeSpec( channels=self.out_channels, stride=1. / s) for s in self.spatial_scales ] @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } ================================================ FILE: ppdet/modeling/necks/custom_pan.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import math import copy import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.modeling.layers import DropBlock, MultiHeadAttention from ppdet.modeling.ops import get_act_fn from ..backbones.cspresnet import ConvBNLayer, BasicBlock from ..shape_spec import ShapeSpec from ..initializer import linear_init_ __all__ = ['CustomCSPPAN'] def _get_clones(module, N): return nn.LayerList([copy.deepcopy(module) for _ in range(N)]) class SPP(nn.Layer): def __init__(self, ch_in, ch_out, k, pool_size, act='swish', data_format='NCHW'): super(SPP, self).__init__() self.pool = [] self.data_format = data_format for i, size in enumerate(pool_size): pool = self.add_sublayer( 'pool{}'.format(i), nn.MaxPool2D( kernel_size=size, stride=1, padding=size // 2, data_format=data_format, ceil_mode=False)) self.pool.append(pool) self.conv = ConvBNLayer(ch_in, ch_out, k, padding=k // 2, act=act) def forward(self, x): outs = [x] for pool in self.pool: outs.append(pool(x)) if self.data_format == 'NCHW': y = paddle.concat(outs, axis=1) else: y = paddle.concat(outs, axis=-1) y = self.conv(y) return y class CSPStage(nn.Layer): def __init__(self, block_fn, ch_in, ch_out, n, act='swish', spp=False, use_alpha=False): super(CSPStage, self).__init__() ch_mid = int(ch_out // 2) self.conv1 = ConvBNLayer(ch_in, ch_mid, 1, act=act) self.conv2 = ConvBNLayer(ch_in, ch_mid, 1, act=act) self.convs = nn.Sequential() next_ch_in = ch_mid for i in range(n): self.convs.add_sublayer( str(i), eval(block_fn)(next_ch_in, ch_mid, act=act, shortcut=False, use_alpha=use_alpha)) if i == (n - 1) // 2 and spp: self.convs.add_sublayer( 'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act)) next_ch_in = ch_mid self.conv3 = ConvBNLayer(ch_mid * 2, ch_out, 1, act=act) def forward(self, x): y1 = self.conv1(x) y2 = self.conv2(x) y2 = self.convs(y2) y = paddle.concat([y1, y2], axis=1) y = self.conv3(y) return y class TransformerEncoderLayer(nn.Layer): def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False): super(TransformerEncoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) @staticmethod def with_pos_embed(tensor, pos_embed): return tensor if pos_embed is None else tensor + pos_embed def forward(self, src, src_mask=None, pos_embed=None): residual = src if self.normalize_before: src = self.norm1(src) q = k = self.with_pos_embed(src, pos_embed) src = self.self_attn(q, k, value=src, attn_mask=src_mask) src = residual + self.dropout1(src) if not self.normalize_before: src = self.norm1(src) residual = src if self.normalize_before: src = self.norm2(src) src = self.linear2(self.dropout(self.activation(self.linear1(src)))) src = 
residual + self.dropout2(src) if not self.normalize_before: src = self.norm2(src) return src class TransformerEncoder(nn.Layer): def __init__(self, encoder_layer, num_layers, norm=None): super(TransformerEncoder, self).__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers self.norm = norm def forward(self, src, src_mask=None, pos_embed=None): output = src for layer in self.layers: output = layer(output, src_mask=src_mask, pos_embed=pos_embed) if self.norm is not None: output = self.norm(output) return output @register @serializable class CustomCSPPAN(nn.Layer): __shared__ = [ 'norm_type', 'data_format', 'width_mult', 'depth_mult', 'trt', 'eval_size' ] def __init__(self, in_channels=[256, 512, 1024], out_channels=[1024, 512, 256], norm_type='bn', act='leaky', stage_fn='CSPStage', block_fn='BasicBlock', stage_num=1, block_num=3, drop_block=False, block_size=3, keep_prob=0.9, spp=False, data_format='NCHW', width_mult=1.0, depth_mult=1.0, use_alpha=False, trt=False, dim_feedforward=2048, dropout=0.1, activation='gelu', nhead=4, num_layers=4, attn_dropout=None, act_dropout=None, normalize_before=False, use_trans=False, eval_size=None): super(CustomCSPPAN, self).__init__() out_channels = [max(round(c * width_mult), 1) for c in out_channels] block_num = max(round(block_num * depth_mult), 1) act = get_act_fn( act, trt=trt) if act is None or isinstance(act, (str, dict)) else act self.num_blocks = len(in_channels) self.data_format = data_format self._out_channels = out_channels self.hidden_dim = in_channels[-1] in_channels = in_channels[::-1] self.use_trans = use_trans self.eval_size = eval_size if use_trans: if eval_size is not None: self.pos_embed = self.build_2d_sincos_position_embedding( eval_size[1] // 32, eval_size[0] // 32, embed_dim=self.hidden_dim) else: self.pos_embed = None encoder_layer = TransformerEncoderLayer( self.hidden_dim, nhead, dim_feedforward, dropout, activation, attn_dropout, act_dropout, normalize_before) encoder_norm = nn.LayerNorm( self.hidden_dim) if normalize_before else None self.encoder = TransformerEncoder(encoder_layer, num_layers, encoder_norm) fpn_stages = [] fpn_routes = [] for i, (ch_in, ch_out) in enumerate(zip(in_channels, out_channels)): if i > 0: ch_in += ch_pre // 2 stage = nn.Sequential() for j in range(stage_num): stage.add_sublayer( str(j), eval(stage_fn)(block_fn, ch_in if j == 0 else ch_out, ch_out, block_num, act=act, spp=(spp and i == 0), use_alpha=use_alpha)) if drop_block: stage.add_sublayer('drop', DropBlock(block_size, keep_prob)) fpn_stages.append(stage) if i < self.num_blocks - 1: fpn_routes.append( ConvBNLayer( ch_in=ch_out, ch_out=ch_out // 2, filter_size=1, stride=1, padding=0, act=act)) ch_pre = ch_out self.fpn_stages = nn.LayerList(fpn_stages) self.fpn_routes = nn.LayerList(fpn_routes) pan_stages = [] pan_routes = [] for i in reversed(range(self.num_blocks - 1)): pan_routes.append( ConvBNLayer( ch_in=out_channels[i + 1], ch_out=out_channels[i + 1], filter_size=3, stride=2, padding=1, act=act)) ch_in = out_channels[i] + out_channels[i + 1] ch_out = out_channels[i] stage = nn.Sequential() for j in range(stage_num): stage.add_sublayer( str(j), eval(stage_fn)(block_fn, ch_in if j == 0 else ch_out, ch_out, block_num, act=act, spp=False, use_alpha=use_alpha)) if drop_block: stage.add_sublayer('drop', DropBlock(block_size, keep_prob)) pan_stages.append(stage) self.pan_stages = nn.LayerList(pan_stages[::-1]) self.pan_routes = nn.LayerList(pan_routes[::-1]) def build_2d_sincos_position_embedding( self, w, h, 
embed_dim=1024, temperature=10000., ): grid_w = paddle.arange(int(w), dtype=paddle.float32) grid_h = paddle.arange(int(h), dtype=paddle.float32) grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' pos_dim = embed_dim // 4 omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim omega = 1. / (temperature**omega) out_w = grid_w.flatten()[..., None] @omega[None] out_h = grid_h.flatten()[..., None] @omega[None] pos_emb = paddle.concat( [ paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h), paddle.cos(out_h) ], axis=1)[None, :, :] return pos_emb def forward(self, blocks, for_mot=False): if self.use_trans: last_feat = blocks[-1] n, c, h, w = last_feat.shape # flatten [B, C, H, W] to [B, HxW, C] src_flatten = last_feat.flatten(2).transpose([0, 2, 1]) if self.eval_size is not None and not self.training: pos_embed = self.pos_embed else: pos_embed = self.build_2d_sincos_position_embedding( w=w, h=h, embed_dim=self.hidden_dim) memory = self.encoder(src_flatten, pos_embed=pos_embed) last_feat_encode = memory.transpose([0, 2, 1]).reshape([n, c, h, w]) blocks[-1] = last_feat_encode blocks = blocks[::-1] fpn_feats = [] for i, block in enumerate(blocks): if i > 0: block = paddle.concat([route, block], axis=1) route = self.fpn_stages[i](block) fpn_feats.append(route) if i < self.num_blocks - 1: route = self.fpn_routes[i](route) route = F.interpolate( route, scale_factor=2., data_format=self.data_format) pan_feats = [fpn_feats[-1], ] route = fpn_feats[-1] for i in reversed(range(self.num_blocks - 1)): block = fpn_feats[i] route = self.pan_routes[i](route) block = paddle.concat([route, block], axis=1) route = self.pan_stages[i](block) pan_feats.append(route) return pan_feats[::-1] @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/necks/dilated_encoder.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
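# Receptive-field arithmetic for the dilation schedule used below (a worked
# illustration, not text from the original file): a 3x3 conv with dilation d
# spans d * (3 - 1) + 1 = 2d + 1 pixels, so residual blocks with dilations
# [2, 4, 6, 8] stack spans of 5, 9, 13 and 17 on the stride-32 C5 map,
# enlarging the receptive field without adding pyramid levels. The encoder
# is size-preserving; a minimal shape check (illustrative sketch):
#
#     import paddle
#     enc = DilatedEncoder()  # defaults: 2048-channel C5 in, 512 out
#     out, = enc([paddle.rand([1, 2048, 20, 20])])
#     assert list(out.shape) == [1, 512, 20, 20]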
import paddle import paddle.nn as nn from paddle import ParamAttr from paddle.regularizer import L2Decay from paddle.nn.initializer import KaimingUniform, Constant, Normal from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec __all__ = ['DilatedEncoder'] class Bottleneck(nn.Layer): def __init__(self, in_channels, mid_channels, dilation): super(Bottleneck, self).__init__() self.conv1 = nn.Sequential(* [ nn.Conv2D( in_channels, mid_channels, 1, padding=0, weight_attr=ParamAttr(initializer=Normal( mean=0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(0.0))), nn.BatchNorm2D( mid_channels, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))), nn.ReLU(), ]) self.conv2 = nn.Sequential(* [ nn.Conv2D( mid_channels, mid_channels, 3, padding=dilation, dilation=dilation, weight_attr=ParamAttr(initializer=Normal( mean=0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(0.0))), nn.BatchNorm2D( mid_channels, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))), nn.ReLU(), ]) self.conv3 = nn.Sequential(* [ nn.Conv2D( mid_channels, in_channels, 1, padding=0, weight_attr=ParamAttr(initializer=Normal( mean=0, std=0.01)), bias_attr=ParamAttr(initializer=Constant(0.0))), nn.BatchNorm2D( in_channels, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))), nn.ReLU(), ]) def forward(self, x): identity = x y = self.conv3(self.conv2(self.conv1(x))) return y + identity @register class DilatedEncoder(nn.Layer): """ DilatedEncoder used in YOLOF """ def __init__(self, in_channels=[2048], out_channels=[512], block_mid_channels=128, num_residual_blocks=4, block_dilations=[2, 4, 6, 8]): super(DilatedEncoder, self).__init__() self.in_channels = in_channels self.out_channels = out_channels assert len(self.in_channels) == 1, "YOLOF only has one level feature." assert len(self.out_channels) == 1, "YOLOF only has one level feature." 
self.block_mid_channels = block_mid_channels self.num_residual_blocks = num_residual_blocks self.block_dilations = block_dilations out_ch = self.out_channels[0] self.lateral_conv = nn.Conv2D( self.in_channels[0], out_ch, 1, weight_attr=ParamAttr(initializer=KaimingUniform( negative_slope=1, nonlinearity='leaky_relu')), bias_attr=ParamAttr(initializer=Constant(value=0.0))) self.lateral_norm = nn.BatchNorm2D( out_ch, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self.fpn_conv = nn.Conv2D( out_ch, out_ch, 3, padding=1, weight_attr=ParamAttr(initializer=KaimingUniform( negative_slope=1, nonlinearity='leaky_relu'))) self.fpn_norm = nn.BatchNorm2D( out_ch, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) encoder_blocks = [] for i in range(self.num_residual_blocks): encoder_blocks.append( Bottleneck( out_ch, self.block_mid_channels, dilation=block_dilations[i])) self.dilated_encoder_blocks = nn.Sequential(*encoder_blocks) def forward(self, inputs, for_mot=False): out = self.lateral_norm(self.lateral_conv(inputs[0])) out = self.fpn_norm(self.fpn_conv(out)) out = self.dilated_encoder_blocks(out) return [out] @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @property def out_shape(self): return [ShapeSpec(channels=c) for c in self.out_channels] ================================================ FILE: ppdet/modeling/necks/es_pan.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
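# NOTE (editor's sketch, not part of the original file): a quick shape check
# for the DilatedEncoder above. The batch size and the 20x20 stride-32 map are
# illustrative assumptions; only the 2048 -> 512 channel contract comes from
# the class defaults.
import paddle
from ppdet.modeling.necks.dilated_encoder import DilatedEncoder

encoder = DilatedEncoder()             # defaults: in_channels [2048] -> out_channels [512]
c5 = paddle.randn([2, 2048, 20, 20])   # e.g. a ResNet C5 map for a 640x640 input
out, = encoder([c5])                   # YOLOF keeps a single feature level
assert out.shape == [2, 512, 20, 20]   # dilated residual blocks preserve resolution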
import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec from ..backbones.esnet import SEModule from .csp_pan import ConvBNLayer, Channel_T, DPModule __all__ = ['ESPAN'] class ES_Block(nn.Layer): def __init__(self, in_channels, mid_channels, out_channels, kernel_size=5, stride=1, act='leaky_relu'): super(ES_Block, self).__init__() self._residual = ConvBNLayer( in_channel=in_channels, out_channel=out_channels, kernel_size=1, stride=1, groups=1, act=act) self._conv_pw = ConvBNLayer( in_channel=in_channels, out_channel=mid_channels // 2, kernel_size=1, stride=1, groups=1, act=act) self._conv_dw = ConvBNLayer( in_channel=mid_channels // 2, out_channel=mid_channels // 2, kernel_size=kernel_size, stride=stride, groups=mid_channels // 2, act=None) self._se = SEModule(mid_channels) self._conv_linear = ConvBNLayer( in_channel=mid_channels, out_channel=out_channels, kernel_size=1, stride=1, groups=1, act=act) self._out_conv = ConvBNLayer( in_channel=out_channels * 2, out_channel=out_channels, kernel_size=1, stride=1, groups=1, act=act) def forward(self, inputs): x1 = self._residual(inputs) x2 = self._conv_pw(inputs) x3 = self._conv_dw(x2) x3 = paddle.concat([x2, x3], axis=1) x3 = self._se(x3) x3 = self._conv_linear(x3) out = paddle.concat([x1, x3], axis=1) out = self._out_conv(out) return out @register @serializable class ESPAN(nn.Layer): """Path Aggregation Network with ES module. Args: in_channels (List[int]): Number of input channels per scale. out_channels (int): Number of output channels (used at each scale) kernel_size (int): The conv2d kernel size of this Module. num_features (int): Number of output features of CSPPAN module. num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1 use_depthwise (bool): Whether to use depthwise separable convolution in blocks.
Default: True """ def __init__(self, in_channels, out_channels, kernel_size=5, num_features=3, use_depthwise=True, act='hard_swish', spatial_scales=[0.125, 0.0625, 0.03125]): super(ESPAN, self).__init__() self.conv_t = Channel_T(in_channels, out_channels, act=act) in_channels = [out_channels] * len(spatial_scales) self.in_channels = in_channels self.out_channels = out_channels self.spatial_scales = spatial_scales self.num_features = num_features conv_func = DPModule if use_depthwise else ConvBNLayer if self.num_features == 4: self.first_top_conv = conv_func( in_channels[0], in_channels[0], kernel_size, stride=2, act=act) self.second_top_conv = conv_func( in_channels[0], in_channels[0], kernel_size, stride=2, act=act) self.spatial_scales.append(self.spatial_scales[-1] / 2) # build top-down blocks self.upsample = nn.Upsample(scale_factor=2, mode='nearest') self.top_down_blocks = nn.LayerList() for idx in range(len(in_channels) - 1, 0, -1): self.top_down_blocks.append( ES_Block( in_channels[idx - 1] * 2, in_channels[idx - 1], in_channels[idx - 1], kernel_size=kernel_size, stride=1, act=act)) # build bottom-up blocks self.downsamples = nn.LayerList() self.bottom_up_blocks = nn.LayerList() for idx in range(len(in_channels) - 1): self.downsamples.append( conv_func( in_channels[idx], in_channels[idx], kernel_size=kernel_size, stride=2, act=act)) self.bottom_up_blocks.append( ES_Block( in_channels[idx] * 2, in_channels[idx + 1], in_channels[idx + 1], kernel_size=kernel_size, stride=1, act=act)) def forward(self, inputs): """ Args: inputs (tuple[Tensor]): input features. Returns: tuple[Tensor]: CSPPAN features. """ assert len(inputs) == len(self.in_channels) inputs = self.conv_t(inputs) # top-down path inner_outs = [inputs[-1]] for idx in range(len(self.in_channels) - 1, 0, -1): feat_heigh = inner_outs[0] feat_low = inputs[idx - 1] upsample_feat = self.upsample(feat_heigh) inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( paddle.concat([upsample_feat, feat_low], 1)) inner_outs.insert(0, inner_out) # bottom-up path outs = [inner_outs[0]] for idx in range(len(self.in_channels) - 1): feat_low = outs[-1] feat_height = inner_outs[idx + 1] downsample_feat = self.downsamples[idx](feat_low) out = self.bottom_up_blocks[idx](paddle.concat( [downsample_feat, feat_height], 1)) outs.append(out) top_features = None if self.num_features == 4: top_features = self.first_top_conv(inputs[-1]) top_features = top_features + self.second_top_conv(outs[-1]) outs.append(top_features) return tuple(outs) @property def out_shape(self): return [ ShapeSpec( channels=self.out_channels, stride=1. / s) for s in self.spatial_scales ] @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } ================================================ FILE: ppdet/modeling/necks/fpn.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
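# NOTE (editor's sketch, not part of the original file): how the ESPAN above is
# wired end to end. The channel and spatial sizes below are illustrative
# assumptions (the channels happen to match the Channel_T defaults); the only
# hard requirement is that consecutive levels differ by a factor-2 stride so
# the upsample/downsample paths line up.
import paddle
from ppdet.modeling.necks.es_pan import ESPAN

neck = ESPAN(in_channels=[116, 232, 464], out_channels=96)
feats = [
    paddle.randn([1, c, s, s])
    for c, s in zip([116, 232, 464], [80, 40, 20])
]
outs = neck(feats)  # tuple of 3 maps, all with 96 channels, at strides 8/16/32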
import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import XavierUniform from ppdet.core.workspace import register, serializable from ppdet.modeling.layers import ConvNormLayer from ..shape_spec import ShapeSpec __all__ = ['FPN'] @register @serializable class FPN(nn.Layer): """ Feature Pyramid Network, see https://arxiv.org/abs/1612.03144 Args: in_channels (list[int]): input channels of each level which can be derived from the output shape of backbone by from_config out_channel (int): output channel of each level spatial_scales (list[float]): the spatial scales between input feature maps and original input image which can be derived from the output shape of backbone by from_config has_extra_convs (bool): whether to add extra conv to the last level. default False extra_stage (int): the number of extra stages added to the last level. default 1 use_c5 (bool): Whether to use c5 as the input of extra stage, otherwise p5 is used. default True norm_type (string|None): The normalization type in FPN module. If norm_type is None, norm will not be used after conv and if norm_type is string, bn, gn, sync_bn are available. default None norm_decay (float): weight decay for normalization layer weights. default 0. freeze_norm (bool): whether to freeze normalization layer. default False relu_before_extra_convs (bool): whether to add relu before extra convs. default False """ def __init__(self, in_channels, out_channel, spatial_scales=[0.25, 0.125, 0.0625, 0.03125], has_extra_convs=False, extra_stage=1, use_c5=True, norm_type=None, norm_decay=0., freeze_norm=False, relu_before_extra_convs=True): super(FPN, self).__init__() self.out_channel = out_channel for s in range(extra_stage): spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
self.spatial_scales = spatial_scales self.has_extra_convs = has_extra_convs self.extra_stage = extra_stage self.use_c5 = use_c5 self.relu_before_extra_convs = relu_before_extra_convs self.norm_type = norm_type self.norm_decay = norm_decay self.freeze_norm = freeze_norm self.lateral_convs = [] self.fpn_convs = [] fan = out_channel * 3 * 3 # stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone # 0 <= st_stage < ed_stage <= 3 st_stage = 4 - len(in_channels) ed_stage = st_stage + len(in_channels) - 1 for i in range(st_stage, ed_stage + 1): if i == 3: lateral_name = 'fpn_inner_res5_sum' else: lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2) in_c = in_channels[i - st_stage] if self.norm_type is not None: lateral = self.add_sublayer( lateral_name, ConvNormLayer( ch_in=in_c, ch_out=out_channel, filter_size=1, stride=1, norm_type=self.norm_type, norm_decay=self.norm_decay, freeze_norm=self.freeze_norm, initializer=XavierUniform(fan_out=in_c))) else: lateral = self.add_sublayer( lateral_name, nn.Conv2D( in_channels=in_c, out_channels=out_channel, kernel_size=1, weight_attr=ParamAttr( initializer=XavierUniform(fan_out=in_c)))) self.lateral_convs.append(lateral) fpn_name = 'fpn_res{}_sum'.format(i + 2) if self.norm_type is not None: fpn_conv = self.add_sublayer( fpn_name, ConvNormLayer( ch_in=out_channel, ch_out=out_channel, filter_size=3, stride=1, norm_type=self.norm_type, norm_decay=self.norm_decay, freeze_norm=self.freeze_norm, initializer=XavierUniform(fan_out=fan))) else: fpn_conv = self.add_sublayer( fpn_name, nn.Conv2D( in_channels=out_channel, out_channels=out_channel, kernel_size=3, padding=1, weight_attr=ParamAttr( initializer=XavierUniform(fan_out=fan)))) self.fpn_convs.append(fpn_conv) # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) if self.has_extra_convs: for i in range(self.extra_stage): lvl = ed_stage + 1 + i if i == 0 and self.use_c5: in_c = in_channels[-1] else: in_c = out_channel extra_fpn_name = 'fpn_{}'.format(lvl + 2) if self.norm_type is not None: extra_fpn_conv = self.add_sublayer( extra_fpn_name, ConvNormLayer( ch_in=in_c, ch_out=out_channel, filter_size=3, stride=2, norm_type=self.norm_type, norm_decay=self.norm_decay, freeze_norm=self.freeze_norm, initializer=XavierUniform(fan_out=fan))) else: extra_fpn_conv = self.add_sublayer( extra_fpn_name, nn.Conv2D( in_channels=in_c, out_channels=out_channel, kernel_size=3, stride=2, padding=1, weight_attr=ParamAttr( initializer=XavierUniform(fan_out=fan)))) self.fpn_convs.append(extra_fpn_conv) @classmethod def from_config(cls, cfg, input_shape): return { 'in_channels': [i.channels for i in input_shape], 'spatial_scales': [1.0 / i.stride for i in input_shape], } def forward(self, body_feats): laterals = [] num_levels = len(body_feats) for i in range(num_levels): laterals.append(self.lateral_convs[i](body_feats[i])) for i in range(1, num_levels): lvl = num_levels - i upsample = F.interpolate( laterals[lvl], scale_factor=2., mode='nearest', ) laterals[lvl - 1] += upsample fpn_output = [] for lvl in range(num_levels): fpn_output.append(self.fpn_convs[lvl](laterals[lvl])) if self.extra_stage > 0: # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN) if not self.has_extra_convs: assert self.extra_stage == 1, 'extra_stage should be 1 if FPN has not extra convs' fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2)) # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) else: if self.use_c5: extra_source = body_feats[-1] else: extra_source = fpn_output[-1] 
fpn_output.append(self.fpn_convs[num_levels](extra_source)) for i in range(1, self.extra_stage): if self.relu_before_extra_convs: fpn_output.append(self.fpn_convs[num_levels + i](F.relu( fpn_output[-1]))) else: fpn_output.append(self.fpn_convs[num_levels + i]( fpn_output[-1])) return fpn_output @property def out_shape(self): return [ ShapeSpec( channels=self.out_channel, stride=1. / s) for s in self.spatial_scales ] ================================================ FILE: ppdet/modeling/necks/hrfpn.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn.functional as F import paddle.nn as nn from ppdet.core.workspace import register from ..shape_spec import ShapeSpec __all__ = ['HRFPN'] @register class HRFPN(nn.Layer): """ Args: in_channels (list): number of input feature channels from backbone out_channel (int): number of output feature channels share_conv (bool): whether to share conv for different layers' reduction extra_stage (int): add extra stage for returning HRFPN fpn_feats spatial_scales (list): feature map scaling factor """ def __init__(self, in_channels=[18, 36, 72, 144], out_channel=256, share_conv=False, extra_stage=1, spatial_scales=[1. / 4, 1. / 8, 1. / 16, 1. / 32], use_bias=False): super(HRFPN, self).__init__() in_channel = sum(in_channels) self.in_channel = in_channel self.out_channel = out_channel self.share_conv = share_conv for i in range(extra_stage): spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
self.spatial_scales = spatial_scales self.num_out = len(self.spatial_scales) self.use_bias = use_bias bias_attr = False if use_bias is False else None self.reduction = nn.Conv2D( in_channels=in_channel, out_channels=out_channel, kernel_size=1, bias_attr=bias_attr) if share_conv: self.fpn_conv = nn.Conv2D( in_channels=out_channel, out_channels=out_channel, kernel_size=3, padding=1, bias_attr=bias_attr) else: self.fpn_conv = [] for i in range(self.num_out): conv_name = "fpn_conv_" + str(i) conv = self.add_sublayer( conv_name, nn.Conv2D( in_channels=out_channel, out_channels=out_channel, kernel_size=3, padding=1, bias_attr=bias_attr)) self.fpn_conv.append(conv) def forward(self, body_feats): num_backbone_stages = len(body_feats) outs = [] outs.append(body_feats[0]) # resize for i in range(1, num_backbone_stages): resized = F.interpolate( body_feats[i], scale_factor=2**i, mode='bilinear') outs.append(resized) # concat out = paddle.concat(outs, axis=1) assert out.shape[ 1] == self.in_channel, 'in_channel should be {}, but received {}'.format( self.in_channel, out.shape[1]) # reduction out = self.reduction(out) # conv outs = [out] for i in range(1, self.num_out): outs.append(F.avg_pool2d(out, kernel_size=2**i, stride=2**i)) outputs = [] for i in range(self.num_out): conv_func = self.fpn_conv if self.share_conv else self.fpn_conv[i] conv = conv_func(outs[i]) outputs.append(conv) fpn_feats = [outputs[k] for k in range(self.num_out)] return fpn_feats @classmethod def from_config(cls, cfg, input_shape): return { 'in_channels': [i.channels for i in input_shape], 'spatial_scales': [1.0 / i.stride for i in input_shape], } @property def out_shape(self): return [ ShapeSpec( channels=self.out_channel, stride=1. / s) for s in self.spatial_scales ] ================================================ FILE: ppdet/modeling/necks/lc_pan.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register, serializable from ..shape_spec import ShapeSpec from ..backbones.lcnet import DepthwiseSeparable from .csp_pan import ConvBNLayer, Channel_T, DPModule __all__ = ['LCPAN'] @register @serializable class LCPAN(nn.Layer): """Path Aggregation Network with LCNet module. Args: in_channels (List[int]): Number of input channels per scale. out_channels (int): Number of output channels (used at each scale) kernel_size (int): The conv2d kernel size of this Module. num_features (int): Number of output features of CSPPAN module. num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1 use_depthwise (bool): Whether to use depthwise separable convolution in blocks.
Default: True """ def __init__(self, in_channels, out_channels, kernel_size=5, num_features=3, use_depthwise=True, act='hard_swish', spatial_scales=[0.125, 0.0625, 0.03125]): super(LCPAN, self).__init__() self.conv_t = Channel_T(in_channels, out_channels, act=act) in_channels = [out_channels] * len(spatial_scales) self.in_channels = in_channels self.out_channels = out_channels self.spatial_scales = spatial_scales self.num_features = num_features conv_func = DPModule if use_depthwise else ConvBNLayer NET_CONFIG = { #k, in_c, out_c, stride, use_se "block1": [ [kernel_size, out_channels * 2, out_channels * 2, 1, False], [kernel_size, out_channels * 2, out_channels, 1, False], ], "block2": [ [kernel_size, out_channels * 2, out_channels * 2, 1, False], [kernel_size, out_channels * 2, out_channels, 1, False], ] } if self.num_features == 4: self.first_top_conv = conv_func( in_channels[0], in_channels[0], kernel_size, stride=2, act=act) self.second_top_conv = conv_func( in_channels[0], in_channels[0], kernel_size, stride=2, act=act) self.spatial_scales.append(self.spatial_scales[-1] / 2) # build top-down blocks self.upsample = nn.Upsample(scale_factor=2, mode='nearest') self.top_down_blocks = nn.LayerList() for idx in range(len(in_channels) - 1, 0, -1): self.top_down_blocks.append( nn.Sequential(* [ DepthwiseSeparable( num_channels=in_c, num_filters=out_c, dw_size=k, stride=s, use_se=se) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[ "block1"]) ])) # build bottom-up blocks self.downsamples = nn.LayerList() self.bottom_up_blocks = nn.LayerList() for idx in range(len(in_channels) - 1): self.downsamples.append( conv_func( in_channels[idx], in_channels[idx], kernel_size=kernel_size, stride=2, act=act)) self.bottom_up_blocks.append( nn.Sequential(* [ DepthwiseSeparable( num_channels=in_c, num_filters=out_c, dw_size=k, stride=s, use_se=se) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[ "block2"]) ])) def forward(self, inputs): """ Args: inputs (tuple[Tensor]): input features. Returns: tuple[Tensor]: CSPPAN features. """ assert len(inputs) == len(self.in_channels) inputs = self.conv_t(inputs) # top-down path inner_outs = [inputs[-1]] for idx in range(len(self.in_channels) - 1, 0, -1): feat_heigh = inner_outs[0] feat_low = inputs[idx - 1] upsample_feat = self.upsample(feat_heigh) inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( paddle.concat([upsample_feat, feat_low], 1)) inner_outs.insert(0, inner_out) # bottom-up path outs = [inner_outs[0]] for idx in range(len(self.in_channels) - 1): feat_low = outs[-1] feat_height = inner_outs[idx + 1] downsample_feat = self.downsamples[idx](feat_low) out = self.bottom_up_blocks[idx](paddle.concat( [downsample_feat, feat_height], 1)) outs.append(out) top_features = None if self.num_features == 4: top_features = self.first_top_conv(inputs[-1]) top_features = top_features + self.second_top_conv(outs[-1]) outs.append(top_features) return tuple(outs) @property def out_shape(self): return [ ShapeSpec( channels=self.out_channels, stride=1. / s) for s in self.spatial_scales ] @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } ================================================ FILE: ppdet/modeling/necks/ttf_fpn.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.nn.initializer import Constant, Uniform, Normal, XavierUniform from ppdet.core.workspace import register, serializable from paddle.regularizer import L2Decay from ppdet.modeling.layers import DeformableConvV2, ConvNormLayer, LiteConv import math from ppdet.modeling.ops import batch_norm from ..shape_spec import ShapeSpec __all__ = ['TTFFPN'] class Upsample(nn.Layer): def __init__(self, ch_in, ch_out, norm_type='bn'): super(Upsample, self).__init__() fan_in = ch_in * 3 * 3 stdv = 1. / math.sqrt(fan_in) self.dcn = DeformableConvV2( ch_in, ch_out, kernel_size=3, weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), bias_attr=ParamAttr( initializer=Constant(0), regularizer=L2Decay(0.), learning_rate=2.), lr_scale=2., regularizer=L2Decay(0.)) self.bn = batch_norm( ch_out, norm_type=norm_type, initializer=Constant(1.)) def forward(self, feat): dcn = self.dcn(feat) bn = self.bn(dcn) relu = F.relu(bn) out = F.interpolate(relu, scale_factor=2., mode='bilinear') return out class DeConv(nn.Layer): def __init__(self, ch_in, ch_out, norm_type='bn'): super(DeConv, self).__init__() self.deconv = nn.Sequential() conv1 = ConvNormLayer( ch_in=ch_in, ch_out=ch_out, stride=1, filter_size=1, norm_type=norm_type, initializer=XavierUniform()) conv2 = nn.Conv2DTranspose( in_channels=ch_out, out_channels=ch_out, kernel_size=4, padding=1, stride=2, groups=ch_out, weight_attr=ParamAttr(initializer=XavierUniform()), bias_attr=False) bn = batch_norm(ch_out, norm_type=norm_type, norm_decay=0.) conv3 = ConvNormLayer( ch_in=ch_out, ch_out=ch_out, stride=1, filter_size=1, norm_type=norm_type, initializer=XavierUniform()) self.deconv.add_sublayer('conv1', conv1) self.deconv.add_sublayer('relu6_1', nn.ReLU6()) self.deconv.add_sublayer('conv2', conv2) self.deconv.add_sublayer('bn', bn) self.deconv.add_sublayer('relu6_2', nn.ReLU6()) self.deconv.add_sublayer('conv3', conv3) self.deconv.add_sublayer('relu6_3', nn.ReLU6()) def forward(self, inputs): return self.deconv(inputs) class LiteUpsample(nn.Layer): def __init__(self, ch_in, ch_out, norm_type='bn'): super(LiteUpsample, self).__init__() self.deconv = DeConv(ch_in, ch_out, norm_type=norm_type) self.conv = LiteConv(ch_in, ch_out, norm_type=norm_type) def forward(self, inputs): deconv_up = self.deconv(inputs) conv = self.conv(inputs) interp_up = F.interpolate(conv, scale_factor=2., mode='bilinear') return deconv_up + interp_up class ShortCut(nn.Layer): def __init__(self, layer_num, ch_in, ch_out, norm_type='bn', lite_neck=False, name=None): super(ShortCut, self).__init__() shortcut_conv = nn.Sequential() for i in range(layer_num): fan_out = 3 * 3 * ch_out std = math.sqrt(2. 
/ fan_out) in_channels = ch_in if i == 0 else ch_out shortcut_name = name + '.conv.{}'.format(i) if lite_neck: shortcut_conv.add_sublayer( shortcut_name, LiteConv( in_channels=in_channels, out_channels=ch_out, with_act=i < layer_num - 1, norm_type=norm_type)) else: shortcut_conv.add_sublayer( shortcut_name, nn.Conv2D( in_channels=in_channels, out_channels=ch_out, kernel_size=3, padding=1, weight_attr=ParamAttr(initializer=Normal(0, std)), bias_attr=ParamAttr( learning_rate=2., regularizer=L2Decay(0.)))) if i < layer_num - 1: shortcut_conv.add_sublayer(shortcut_name + '.act', nn.ReLU()) self.shortcut = self.add_sublayer('shortcut', shortcut_conv) def forward(self, feat): out = self.shortcut(feat) return out @register @serializable class TTFFPN(nn.Layer): """ Args: in_channels (list): number of input feature channels from backbone. [128,256,512,1024] by default, means the channels of DarkNet53 backbone return_idx [1,2,3,4]. planes (list): the number of output feature channels of FPN. [256, 128, 64] by default shortcut_num (list): the number of convolution layers in each shortcut. [3,2,1] by default, means DarkNet53 backbone return_idx_1 has 3 convs in its shortcut, return_idx_2 has 2 convs and return_idx_3 has 1 conv. norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. bn by default lite_neck (bool): whether to use lite conv in TTFNet FPN, False by default fusion_method (string): the method to fusion upsample and lateral layer. 'add' and 'concat' are optional, add by default """ __shared__ = ['norm_type'] def __init__(self, in_channels, planes=[256, 128, 64], shortcut_num=[3, 2, 1], norm_type='bn', lite_neck=False, fusion_method='add'): super(TTFFPN, self).__init__() self.planes = planes self.shortcut_num = shortcut_num[::-1] self.shortcut_len = len(shortcut_num) self.ch_in = in_channels[::-1] self.fusion_method = fusion_method self.upsample_list = [] self.shortcut_list = [] self.upper_list = [] for i, out_c in enumerate(self.planes): in_c = self.ch_in[i] if i == 0 else self.upper_list[-1] upsample_module = LiteUpsample if lite_neck else Upsample upsample = self.add_sublayer( 'upsample.' + str(i), upsample_module( in_c, out_c, norm_type=norm_type)) self.upsample_list.append(upsample) if i < self.shortcut_len: shortcut = self.add_sublayer( 'shortcut.' + str(i), ShortCut( self.shortcut_num[i], self.ch_in[i + 1], out_c, norm_type=norm_type, lite_neck=lite_neck, name='shortcut.' + str(i))) self.shortcut_list.append(shortcut) if self.fusion_method == 'add': upper_c = out_c elif self.fusion_method == 'concat': upper_c = out_c * 2 else: raise ValueError('Illegal fusion method. Expected add or\ concat, but received {}'.format(self.fusion_method)) self.upper_list.append(upper_c) def forward(self, inputs): feat = inputs[-1] for i, out_c in enumerate(self.planes): feat = self.upsample_list[i](feat) if i < self.shortcut_len: shortcut = self.shortcut_list[i](inputs[-i - 2]) if self.fusion_method == 'add': feat = feat + shortcut else: feat = paddle.concat([feat, shortcut], axis=1) return feat @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @property def out_shape(self): return [ShapeSpec(channels=self.upper_list[-1], )] ================================================ FILE: ppdet/modeling/necks/yolo_fpn.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.modeling.layers import DropBlock from ppdet.modeling.ops import get_act_fn from ..backbones.darknet import ConvBNLayer from ..shape_spec import ShapeSpec from ..backbones.csp_darknet import BaseConv, DWConv, CSPLayer __all__ = ['YOLOv3FPN', 'PPYOLOFPN', 'PPYOLOTinyFPN', 'PPYOLOPAN', 'YOLOCSPPAN'] def add_coord(x, data_format): b = x.shape[0] if data_format == 'NCHW': h, w = x.shape[2], x.shape[3] else: h, w = x.shape[1], x.shape[2] gx = paddle.cast(paddle.arange(w) / ((w - 1.) * 2.0) - 1., x.dtype) gy = paddle.cast(paddle.arange(h) / ((h - 1.) * 2.0) - 1., x.dtype) if data_format == 'NCHW': gx = gx.reshape([1, 1, 1, w]).expand([b, 1, h, w]) gy = gy.reshape([1, 1, h, 1]).expand([b, 1, h, w]) else: gx = gx.reshape([1, 1, w, 1]).expand([b, h, w, 1]) gy = gy.reshape([1, h, 1, 1]).expand([b, h, w, 1]) gx.stop_gradient = True gy.stop_gradient = True return gx, gy class YoloDetBlock(nn.Layer): def __init__(self, ch_in, channel, norm_type, freeze_norm=False, name='', data_format='NCHW'): """ YOLODetBlock layer for yolov3, see https://arxiv.org/abs/1804.02767 Args: ch_in (int): input channel channel (int): base channel norm_type (str): batch norm type freeze_norm (bool): whether to freeze norm, default False name (str): layer name data_format (str): data format, NCHW or NHWC """ super(YoloDetBlock, self).__init__() self.ch_in = ch_in self.channel = channel assert channel % 2 == 0, \ "channel {} cannot be divided by 2".format(channel) conv_def = [ ['conv0', ch_in, channel, 1, '.0.0'], ['conv1', channel, channel * 2, 3, '.0.1'], ['conv2', channel * 2, channel, 1, '.1.0'], ['conv3', channel, channel * 2, 3, '.1.1'], ['route', channel * 2, channel, 1, '.2'], ] self.conv_module = nn.Sequential() for idx, (conv_name, ch_in, ch_out, filter_size, post_name) in enumerate(conv_def): self.conv_module.add_sublayer( conv_name, ConvBNLayer( ch_in=ch_in, ch_out=ch_out, filter_size=filter_size, padding=(filter_size - 1) // 2, norm_type=norm_type, freeze_norm=freeze_norm, data_format=data_format, name=name + post_name)) self.tip = ConvBNLayer( ch_in=channel, ch_out=channel * 2, filter_size=3, padding=1, norm_type=norm_type, freeze_norm=freeze_norm, data_format=data_format, name=name + '.tip') def forward(self, inputs): route = self.conv_module(inputs) tip = self.tip(route) return route, tip class SPP(nn.Layer): def __init__(self, ch_in, ch_out, k, pool_size, norm_type='bn', freeze_norm=False, name='', act='leaky', data_format='NCHW'): """ SPP layer, which consist of four pooling layer follwed by conv layer Args: ch_in (int): input channel of conv layer ch_out (int): output channel of conv layer k (int): kernel size of conv layer norm_type (str): batch norm type freeze_norm (bool): whether to freeze norm, default False name (str): layer name act (str): activation function data_format (str): data format, NCHW or NHWC """ super(SPP, self).__init__() 
self.pool = [] self.data_format = data_format for size in pool_size: pool = self.add_sublayer( '{}.pool1'.format(name), nn.MaxPool2D( kernel_size=size, stride=1, padding=size // 2, data_format=data_format, ceil_mode=False)) self.pool.append(pool) self.conv = ConvBNLayer( ch_in, ch_out, k, padding=k // 2, norm_type=norm_type, freeze_norm=freeze_norm, name=name, act=act, data_format=data_format) def forward(self, x): outs = [x] for pool in self.pool: outs.append(pool(x)) if self.data_format == "NCHW": y = paddle.concat(outs, axis=1) else: y = paddle.concat(outs, axis=-1) y = self.conv(y) return y class CoordConv(nn.Layer): def __init__(self, ch_in, ch_out, filter_size, padding, norm_type, freeze_norm=False, name='', data_format='NCHW'): """ CoordConv layer, see https://arxiv.org/abs/1807.03247 Args: ch_in (int): input channel ch_out (int): output channel filter_size (int): filter size, default 3 padding (int): padding size, default 0 norm_type (str): batch norm type, default bn name (str): layer name data_format (str): data format, NCHW or NHWC """ super(CoordConv, self).__init__() self.conv = ConvBNLayer( ch_in + 2, ch_out, filter_size=filter_size, padding=padding, norm_type=norm_type, freeze_norm=freeze_norm, data_format=data_format, name=name) self.data_format = data_format def forward(self, x): gx, gy = add_coord(x, self.data_format) if self.data_format == 'NCHW': y = paddle.concat([x, gx, gy], axis=1) else: y = paddle.concat([x, gx, gy], axis=-1) y = self.conv(y) return y class PPYOLODetBlock(nn.Layer): def __init__(self, cfg, name, data_format='NCHW'): """ PPYOLODetBlock layer Args: cfg (list): layer configs for this block name (str): block name data_format (str): data format, NCHW or NHWC """ super(PPYOLODetBlock, self).__init__() self.conv_module = nn.Sequential() for idx, (conv_name, layer, args, kwargs) in enumerate(cfg[:-1]): kwargs.update( name='{}.{}'.format(name, conv_name), data_format=data_format) self.conv_module.add_sublayer(conv_name, layer(*args, **kwargs)) conv_name, layer, args, kwargs = cfg[-1] kwargs.update( name='{}.{}'.format(name, conv_name), data_format=data_format) self.tip = layer(*args, **kwargs) def forward(self, inputs): route = self.conv_module(inputs) tip = self.tip(route) return route, tip class PPYOLOTinyDetBlock(nn.Layer): def __init__(self, ch_in, ch_out, name, drop_block=False, block_size=3, keep_prob=0.9, data_format='NCHW'): """ PPYOLO Tiny DetBlock layer Args: ch_in (list): input channel number ch_out (list): output channel number name (str): block name drop_block: whether user DropBlock block_size: drop block size keep_prob: probability to keep block in DropBlock data_format (str): data format, NCHW or NHWC """ super(PPYOLOTinyDetBlock, self).__init__() self.drop_block_ = drop_block self.conv_module = nn.Sequential() cfgs = [ # name, in channels, out channels, filter_size, # stride, padding, groups ['.0', ch_in, ch_out, 1, 1, 0, 1], ['.1', ch_out, ch_out, 5, 1, 2, ch_out], ['.2', ch_out, ch_out, 1, 1, 0, 1], ['.route', ch_out, ch_out, 5, 1, 2, ch_out], ] for cfg in cfgs: conv_name, conv_ch_in, conv_ch_out, filter_size, stride, padding, \ groups = cfg self.conv_module.add_sublayer( name + conv_name, ConvBNLayer( ch_in=conv_ch_in, ch_out=conv_ch_out, filter_size=filter_size, stride=stride, padding=padding, groups=groups, name=name + conv_name)) self.tip = ConvBNLayer( ch_in=ch_out, ch_out=ch_out, filter_size=1, stride=1, padding=0, groups=1, name=name + conv_name) if self.drop_block_: self.drop_block = DropBlock( block_size=block_size, 
keep_prob=keep_prob, data_format=data_format, name=name + '.dropblock') def forward(self, inputs): if self.drop_block_: inputs = self.drop_block(inputs) route = self.conv_module(inputs) tip = self.tip(route) return route, tip class PPYOLODetBlockCSP(nn.Layer): def __init__(self, cfg, ch_in, ch_out, act, norm_type, name, data_format='NCHW'): """ PPYOLODetBlockCSP layer Args: cfg (list): layer configs for this block ch_in (int): input channel ch_out (int): output channel act (str): default mish name (str): block name data_format (str): data format, NCHW or NHWC """ super(PPYOLODetBlockCSP, self).__init__() self.data_format = data_format self.conv1 = ConvBNLayer( ch_in, ch_out, 1, padding=0, act=act, norm_type=norm_type, name=name + '.left', data_format=data_format) self.conv2 = ConvBNLayer( ch_in, ch_out, 1, padding=0, act=act, norm_type=norm_type, name=name + '.right', data_format=data_format) self.conv3 = ConvBNLayer( ch_out * 2, ch_out * 2, 1, padding=0, act=act, norm_type=norm_type, name=name, data_format=data_format) self.conv_module = nn.Sequential() for idx, (layer_name, layer, args, kwargs) in enumerate(cfg): kwargs.update(name=name + layer_name, data_format=data_format) self.conv_module.add_sublayer(layer_name, layer(*args, **kwargs)) def forward(self, inputs): conv_left = self.conv1(inputs) conv_right = self.conv2(inputs) conv_left = self.conv_module(conv_left) if self.data_format == 'NCHW': conv = paddle.concat([conv_left, conv_right], axis=1) else: conv = paddle.concat([conv_left, conv_right], axis=-1) conv = self.conv3(conv) return conv, conv @register @serializable class YOLOv3FPN(nn.Layer): __shared__ = ['norm_type', 'data_format'] def __init__(self, in_channels=[256, 512, 1024], norm_type='bn', freeze_norm=False, data_format='NCHW'): """ YOLOv3FPN layer Args: in_channels (list): input channels for fpn norm_type (str): batch norm type, default bn data_format (str): data format, NCHW or NHWC """ super(YOLOv3FPN, self).__init__() assert len(in_channels) > 0, "in_channels length should > 0" self.in_channels = in_channels self.num_blocks = len(in_channels) self._out_channels = [] self.yolo_blocks = [] self.routes = [] self.data_format = data_format for i in range(self.num_blocks): name = 'yolo_block.{}'.format(i) in_channel = in_channels[-i - 1] if i > 0: in_channel += 512 // (2**i) yolo_block = self.add_sublayer( name, YoloDetBlock( in_channel, channel=512 // (2**i), norm_type=norm_type, freeze_norm=freeze_norm, data_format=data_format, name=name)) self.yolo_blocks.append(yolo_block) # tip layer output channel doubled self._out_channels.append(1024 // (2**i)) if i < self.num_blocks - 1: name = 'yolo_transition.{}'.format(i) route = self.add_sublayer( name, ConvBNLayer( ch_in=512 // (2**i), ch_out=256 // (2**i), filter_size=1, stride=1, padding=0, norm_type=norm_type, freeze_norm=freeze_norm, data_format=data_format, name=name)) self.routes.append(route) def forward(self, blocks, for_mot=False): assert len(blocks) == self.num_blocks blocks = blocks[::-1] yolo_feats = [] # add embedding features output for multi-object tracking model if for_mot: emb_feats = [] for i, block in enumerate(blocks): if i > 0: if self.data_format == 'NCHW': block = paddle.concat([route, block], axis=1) else: block = paddle.concat([route, block], axis=-1) route, tip = self.yolo_blocks[i](block) yolo_feats.append(tip) if for_mot: # add embedding features output emb_feats.append(route) if i < self.num_blocks - 1: route = self.routes[i](route) route = F.interpolate( route, scale_factor=2., 
data_format=self.data_format) if for_mot: return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats} else: return yolo_feats @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] @register @serializable class PPYOLOFPN(nn.Layer): __shared__ = ['norm_type', 'data_format'] def __init__(self, in_channels=[512, 1024, 2048], norm_type='bn', freeze_norm=False, data_format='NCHW', coord_conv=False, conv_block_num=2, drop_block=False, block_size=3, keep_prob=0.9, spp=False): """ PPYOLOFPN layer Args: in_channels (list): input channels for fpn norm_type (str): batch norm type, default bn data_format (str): data format, NCHW or NHWC coord_conv (bool): whether use CoordConv or not conv_block_num (int): conv block num of each pan block drop_block (bool): whether use DropBlock or not block_size (int): block size of DropBlock keep_prob (float): keep probability of DropBlock spp (bool): whether use spp or not """ super(PPYOLOFPN, self).__init__() assert len(in_channels) > 0, "in_channels length should > 0" self.in_channels = in_channels self.num_blocks = len(in_channels) # parse kwargs self.coord_conv = coord_conv self.drop_block = drop_block self.block_size = block_size self.keep_prob = keep_prob self.spp = spp self.conv_block_num = conv_block_num self.data_format = data_format if self.coord_conv: ConvLayer = CoordConv else: ConvLayer = ConvBNLayer if self.drop_block: dropblock_cfg = [[ 'dropblock', DropBlock, [self.block_size, self.keep_prob], dict() ]] else: dropblock_cfg = [] self._out_channels = [] self.yolo_blocks = [] self.routes = [] for i, ch_in in enumerate(self.in_channels[::-1]): if i > 0: ch_in += 512 // (2**i) channel = 64 * (2**self.num_blocks) // (2**i) base_cfg = [] c_in, c_out = ch_in, channel for j in range(self.conv_block_num): base_cfg += [ [ 'conv{}'.format(2 * j), ConvLayer, [c_in, c_out, 1], dict( padding=0, norm_type=norm_type, freeze_norm=freeze_norm) ], [ 'conv{}'.format(2 * j + 1), ConvBNLayer, [c_out, c_out * 2, 3], dict( padding=1, norm_type=norm_type, freeze_norm=freeze_norm) ], ] c_in, c_out = c_out * 2, c_out base_cfg += [[ 'route', ConvLayer, [c_in, c_out, 1], dict( padding=0, norm_type=norm_type, freeze_norm=freeze_norm) ], [ 'tip', ConvLayer, [c_out, c_out * 2, 3], dict( padding=1, norm_type=norm_type, freeze_norm=freeze_norm) ]] if self.conv_block_num == 2: if i == 0: if self.spp: spp_cfg = [[ 'spp', SPP, [channel * 4, channel, 1], dict( pool_size=[5, 9, 13], norm_type=norm_type, freeze_norm=freeze_norm) ]] else: spp_cfg = [] cfg = base_cfg[0:3] + spp_cfg + base_cfg[ 3:4] + dropblock_cfg + base_cfg[4:6] else: cfg = base_cfg[0:2] + dropblock_cfg + base_cfg[2:6] elif self.conv_block_num == 0: if self.spp and i == 0: spp_cfg = [[ 'spp', SPP, [c_in * 4, c_in, 1], dict( pool_size=[5, 9, 13], norm_type=norm_type, freeze_norm=freeze_norm) ]] else: spp_cfg = [] cfg = spp_cfg + dropblock_cfg + base_cfg name = 'yolo_block.{}'.format(i) yolo_block = self.add_sublayer(name, PPYOLODetBlock(cfg, name)) self.yolo_blocks.append(yolo_block) self._out_channels.append(channel * 2) if i < self.num_blocks - 1: name = 'yolo_transition.{}'.format(i) route = self.add_sublayer( name, ConvBNLayer( ch_in=channel, ch_out=256 // (2**i), filter_size=1, stride=1, padding=0, norm_type=norm_type, freeze_norm=freeze_norm, data_format=data_format, name=name)) self.routes.append(route) def forward(self, blocks, for_mot=False): assert len(blocks) == 
self.num_blocks blocks = blocks[::-1] yolo_feats = [] # add embedding features output for multi-object tracking model if for_mot: emb_feats = [] for i, block in enumerate(blocks): if i > 0: if self.data_format == 'NCHW': block = paddle.concat([route, block], axis=1) else: block = paddle.concat([route, block], axis=-1) route, tip = self.yolo_blocks[i](block) yolo_feats.append(tip) if for_mot: # add embedding features output emb_feats.append(route) if i < self.num_blocks - 1: route = self.routes[i](route) route = F.interpolate( route, scale_factor=2., data_format=self.data_format) if for_mot: return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats} else: return yolo_feats @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] @register @serializable class PPYOLOTinyFPN(nn.Layer): __shared__ = ['norm_type', 'data_format'] def __init__(self, in_channels=[80, 56, 34], detection_block_channels=[160, 128, 96], norm_type='bn', data_format='NCHW', **kwargs): """ PPYOLO Tiny FPN layer Args: in_channels (list): input channels for fpn detection_block_channels (list): channels in fpn norm_type (str): batch norm type, default bn data_format (str): data format, NCHW or NHWC kwargs: extra key-value pairs, such as parameter of DropBlock and spp """ super(PPYOLOTinyFPN, self).__init__() assert len(in_channels) > 0, "in_channels length should > 0" self.in_channels = in_channels[::-1] assert len(detection_block_channels ) > 0, "detection_block_channelslength should > 0" self.detection_block_channels = detection_block_channels self.data_format = data_format self.num_blocks = len(in_channels) # parse kwargs self.drop_block = kwargs.get('drop_block', False) self.block_size = kwargs.get('block_size', 3) self.keep_prob = kwargs.get('keep_prob', 0.9) self.spp_ = kwargs.get('spp', False) if self.spp_: self.spp = SPP(self.in_channels[0] * 4, self.in_channels[0], k=1, pool_size=[5, 9, 13], norm_type=norm_type, name='spp') self._out_channels = [] self.yolo_blocks = [] self.routes = [] for i, ( ch_in, ch_out ) in enumerate(zip(self.in_channels, self.detection_block_channels)): name = 'yolo_block.{}'.format(i) if i > 0: ch_in += self.detection_block_channels[i - 1] yolo_block = self.add_sublayer( name, PPYOLOTinyDetBlock( ch_in, ch_out, name, drop_block=self.drop_block, block_size=self.block_size, keep_prob=self.keep_prob)) self.yolo_blocks.append(yolo_block) self._out_channels.append(ch_out) if i < self.num_blocks - 1: name = 'yolo_transition.{}'.format(i) route = self.add_sublayer( name, ConvBNLayer( ch_in=ch_out, ch_out=ch_out, filter_size=1, stride=1, padding=0, norm_type=norm_type, data_format=data_format, name=name)) self.routes.append(route) def forward(self, blocks, for_mot=False): assert len(blocks) == self.num_blocks blocks = blocks[::-1] yolo_feats = [] # add embedding features output for multi-object tracking model if for_mot: emb_feats = [] for i, block in enumerate(blocks): if i == 0 and self.spp_: block = self.spp(block) if i > 0: if self.data_format == 'NCHW': block = paddle.concat([route, block], axis=1) else: block = paddle.concat([route, block], axis=-1) route, tip = self.yolo_blocks[i](block) yolo_feats.append(tip) if for_mot: # add embedding features output emb_feats.append(route) if i < self.num_blocks - 1: route = self.routes[i](route) route = F.interpolate( route, scale_factor=2., data_format=self.data_format) if for_mot: return {'yolo_feats': 
yolo_feats, 'emb_feats': emb_feats} else: return yolo_feats @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] @register @serializable class PPYOLOPAN(nn.Layer): __shared__ = ['norm_type', 'data_format'] def __init__(self, in_channels=[512, 1024, 2048], norm_type='bn', data_format='NCHW', act='mish', conv_block_num=3, drop_block=False, block_size=3, keep_prob=0.9, spp=False): """ PPYOLOPAN layer with SPP, DropBlock and CSP connection. Args: in_channels (list): input channels for fpn norm_type (str): batch norm type, default bn data_format (str): data format, NCHW or NHWC act (str): activation function, default mish conv_block_num (int): conv block num of each pan block drop_block (bool): whether use DropBlock or not block_size (int): block size of DropBlock keep_prob (float): keep probability of DropBlock spp (bool): whether use spp or not """ super(PPYOLOPAN, self).__init__() assert len(in_channels) > 0, "in_channels length should > 0" self.in_channels = in_channels self.num_blocks = len(in_channels) # parse kwargs self.drop_block = drop_block self.block_size = block_size self.keep_prob = keep_prob self.spp = spp self.conv_block_num = conv_block_num self.data_format = data_format if self.drop_block: dropblock_cfg = [[ 'dropblock', DropBlock, [self.block_size, self.keep_prob], dict() ]] else: dropblock_cfg = [] # fpn self.fpn_blocks = [] self.fpn_routes = [] fpn_channels = [] for i, ch_in in enumerate(self.in_channels[::-1]): if i > 0: ch_in += 512 // (2**(i - 1)) channel = 512 // (2**i) base_cfg = [] for j in range(self.conv_block_num): base_cfg += [ # name, layer, args [ '{}.0'.format(j), ConvBNLayer, [channel, channel, 1], dict( padding=0, act=act, norm_type=norm_type) ], [ '{}.1'.format(j), ConvBNLayer, [channel, channel, 3], dict( padding=1, act=act, norm_type=norm_type) ] ] if i == 0 and self.spp: base_cfg[3] = [ 'spp', SPP, [channel * 4, channel, 1], dict( pool_size=[5, 9, 13], act=act, norm_type=norm_type) ] cfg = base_cfg[:4] + dropblock_cfg + base_cfg[4:] name = 'fpn.{}'.format(i) fpn_block = self.add_sublayer( name, PPYOLODetBlockCSP(cfg, ch_in, channel, act, norm_type, name, data_format)) self.fpn_blocks.append(fpn_block) fpn_channels.append(channel * 2) if i < self.num_blocks - 1: name = 'fpn_transition.{}'.format(i) route = self.add_sublayer( name, ConvBNLayer( ch_in=channel * 2, ch_out=channel, filter_size=1, stride=1, padding=0, act=act, norm_type=norm_type, data_format=data_format, name=name)) self.fpn_routes.append(route) # pan self.pan_blocks = [] self.pan_routes = [] self._out_channels = [512 // (2**(self.num_blocks - 2)), ] for i in reversed(range(self.num_blocks - 1)): name = 'pan_transition.{}'.format(i) route = self.add_sublayer( name, ConvBNLayer( ch_in=fpn_channels[i + 1], ch_out=fpn_channels[i + 1], filter_size=3, stride=2, padding=1, act=act, norm_type=norm_type, data_format=data_format, name=name)) self.pan_routes = [route, ] + self.pan_routes base_cfg = [] ch_in = fpn_channels[i] + fpn_channels[i + 1] channel = 512 // (2**i) for j in range(self.conv_block_num): base_cfg += [ # name, layer, args [ '{}.0'.format(j), ConvBNLayer, [channel, channel, 1], dict( padding=0, act=act, norm_type=norm_type) ], [ '{}.1'.format(j), ConvBNLayer, [channel, channel, 3], dict( padding=1, act=act, norm_type=norm_type) ] ] cfg = base_cfg[:4] + dropblock_cfg + base_cfg[4:] name = 'pan.{}'.format(i) pan_block = self.add_sublayer( 
name, PPYOLODetBlockCSP(cfg, ch_in, channel, act, norm_type, name, data_format)) self.pan_blocks = [pan_block, ] + self.pan_blocks self._out_channels.append(channel * 2) self._out_channels = self._out_channels[::-1] def forward(self, blocks, for_mot=False): assert len(blocks) == self.num_blocks blocks = blocks[::-1] fpn_feats = [] # add embedding features output for multi-object tracking model if for_mot: emb_feats = [] for i, block in enumerate(blocks): if i > 0: if self.data_format == 'NCHW': block = paddle.concat([route, block], axis=1) else: block = paddle.concat([route, block], axis=-1) route, tip = self.fpn_blocks[i](block) fpn_feats.append(tip) if for_mot: # add embedding features output emb_feats.append(route) if i < self.num_blocks - 1: route = self.fpn_routes[i](route) route = F.interpolate( route, scale_factor=2., data_format=self.data_format) pan_feats = [fpn_feats[-1], ] route = fpn_feats[self.num_blocks - 1] for i in reversed(range(self.num_blocks - 1)): block = fpn_feats[i] route = self.pan_routes[i](route) if self.data_format == 'NCHW': block = paddle.concat([route, block], axis=1) else: block = paddle.concat([route, block], axis=-1) route, tip = self.pan_blocks[i](block) pan_feats.append(tip) if for_mot: return {'yolo_feats': pan_feats[::-1], 'emb_feats': emb_feats} else: return pan_feats[::-1] @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] @register @serializable class YOLOCSPPAN(nn.Layer): """ YOLO CSP-PAN, used in YOLOv5 and YOLOX. """ __shared__ = ['depth_mult', 'data_format', 'act', 'trt'] def __init__(self, depth_mult=1.0, in_channels=[256, 512, 1024], depthwise=False, data_format='NCHW', act='silu', trt=False): super(YOLOCSPPAN, self).__init__() self.in_channels = in_channels self._out_channels = in_channels Conv = DWConv if depthwise else BaseConv self.data_format = data_format act = get_act_fn( act, trt=trt) if act is None or isinstance(act, (str, dict)) else act self.upsample = nn.Upsample(scale_factor=2, mode="nearest") # top-down fpn self.lateral_convs = nn.LayerList() self.fpn_blocks = nn.LayerList() for idx in range(len(in_channels) - 1, 0, -1): self.lateral_convs.append( BaseConv( int(in_channels[idx]), int(in_channels[idx - 1]), 1, 1, act=act)) self.fpn_blocks.append( CSPLayer( int(in_channels[idx - 1] * 2), int(in_channels[idx - 1]), round(3 * depth_mult), shortcut=False, depthwise=depthwise, act=act)) # bottom-up pan self.downsample_convs = nn.LayerList() self.pan_blocks = nn.LayerList() for idx in range(len(in_channels) - 1): self.downsample_convs.append( Conv( int(in_channels[idx]), int(in_channels[idx]), 3, stride=2, act=act)) self.pan_blocks.append( CSPLayer( int(in_channels[idx] * 2), int(in_channels[idx + 1]), round(3 * depth_mult), shortcut=False, depthwise=depthwise, act=act)) def forward(self, feats, for_mot=False): assert len(feats) == len(self.in_channels) # top-down fpn inner_outs = [feats[-1]] for idx in range(len(self.in_channels) - 1, 0, -1): feat_heigh = inner_outs[0] feat_low = feats[idx - 1] feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx]( feat_heigh) inner_outs[0] = feat_heigh upsample_feat = F.interpolate( feat_heigh, scale_factor=2., mode="nearest", data_format=self.data_format) inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx]( paddle.concat( [upsample_feat, feat_low], axis=1)) inner_outs.insert(0, inner_out) # bottom-up pan outs = [inner_outs[0]] for 
idx in range(len(self.in_channels) - 1): feat_low = outs[-1] feat_height = inner_outs[idx + 1] downsample_feat = self.downsample_convs[idx](feat_low) out = self.pan_blocks[idx](paddle.concat( [downsample_feat, feat_height], axis=1)) outs.append(out) return outs @classmethod def from_config(cls, cfg, input_shape): return {'in_channels': [i.channels for i in input_shape], } @property def out_shape(self): return [ShapeSpec(channels=c) for c in self._out_channels] ================================================ FILE: ppdet/modeling/ops.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn.functional as F import paddle.nn as nn from paddle import ParamAttr from paddle.regularizer import L2Decay try: import paddle._legacy_C_ops as C_ops except ImportError: import paddle._C_ops as C_ops try: from paddle.framework import in_dynamic_or_pir_mode HAVE_PIR = True except ImportError: HAVE_PIR = False from paddle import in_dynamic_mode from paddle.common_ops_import import Variable, LayerHelper, check_variable_and_dtype, check_type, check_dtype __all__ = [ 'prior_box', 'generate_proposals', 'box_coder', 'multiclass_nms', 'distribute_fpn_proposals', 'matrix_nms', 'batch_norm', 'mish', 'silu', 'swish', 'identity', 'anchor_generator' ] def identity(x): return x def mish(x): return F.mish(x) if hasattr(F, 'mish') else x * F.tanh(F.softplus(x)) def silu(x): return F.silu(x) def swish(x): return x * F.sigmoid(x) TRT_ACT_SPEC = {'swish': swish, 'silu': swish} ACT_SPEC = {'mish': mish, 'silu': silu} def get_act_fn(act=None, trt=False): assert act is None or isinstance(act, ( str, dict)), 'name of activation should be str, dict or None' if not act: return identity if isinstance(act, dict): name = act['name'] act.pop('name') kwargs = act else: name = act kwargs = dict() if trt and name in TRT_ACT_SPEC: fn = TRT_ACT_SPEC[name] elif name in ACT_SPEC: fn = ACT_SPEC[name] else: fn = getattr(F, name) return lambda x: fn(x, **kwargs) def batch_norm(ch, norm_type='bn', norm_decay=0., freeze_norm=False, initializer=None, data_format='NCHW'): norm_lr = 0. if freeze_norm else 1. weight_attr = ParamAttr( initializer=initializer, learning_rate=norm_lr, regularizer=L2Decay(norm_decay), trainable=False if freeze_norm else True) bias_attr = ParamAttr( learning_rate=norm_lr, regularizer=L2Decay(norm_decay), trainable=False if freeze_norm else True) if norm_type in ['sync_bn', 'bn']: norm_layer = nn.BatchNorm2D( ch, weight_attr=weight_attr, bias_attr=bias_attr, data_format=data_format) norm_params = norm_layer.parameters() if freeze_norm: for param in norm_params: param.stop_gradient = True return norm_layer @paddle.jit.not_to_static def anchor_generator(input, anchor_sizes=None, aspect_ratios=None, variance=[0.1, 0.1, 0.2, 0.2], stride=None, offset=0.5): """ **Anchor generator operator** Generate anchors for Faster RCNN algorithm. Each position of the input produces N anchors, N = size(anchor_sizes) * size(aspect_ratios).
The order of generated anchors is firstly aspect_ratios loop then anchor_sizes loop. Args: input(Variable): 4-D Tensor with shape [N,C,H,W]. The input feature map. anchor_sizes(float32|list|tuple, optional): The anchor sizes of generated anchors, given in absolute pixels e.g. [64., 128., 256., 512.]. For instance, the anchor size of 64 means the area of this anchor equals to 64**2. None by default. aspect_ratios(float32|list|tuple, optional): The height / width ratios of generated anchors, e.g. [0.5, 1.0, 2.0]. None by default. variance(list|tuple, optional): The variances to be used in box regression deltas. The data type is float32, [0.1, 0.1, 0.2, 0.2] by default. stride(list|tuple, optional): The anchors stride across width and height. The data type is float32. e.g. [16.0, 16.0]. None by default. offset(float32, optional): Prior boxes center offset. 0.5 by default. Returns: Tuple: Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. H is the height of input, W is the width of input, num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. Variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. H is the height of input, W is the width of input num_anchors is the box count of each position. Each variance is in (xcenter, ycenter, w, h) format. Examples: .. code-block:: python import paddle.fluid as fluid conv1 = fluid.data(name='conv1', shape=[None, 48, 16, 16], dtype='float32') anchor, var = fluid.layers.anchor_generator( input=conv1, anchor_sizes=[64, 128, 256, 512], aspect_ratios=[0.5, 1.0, 2.0], variance=[0.1, 0.1, 0.2, 0.2], stride=[16.0, 16.0], offset=0.5) """ def _is_list_or_tuple_(data): return (isinstance(data, list) or isinstance(data, tuple)) if not _is_list_or_tuple_(anchor_sizes): anchor_sizes = [anchor_sizes] if not _is_list_or_tuple_(aspect_ratios): aspect_ratios = [aspect_ratios] if not (_is_list_or_tuple_(stride) and len(stride) == 2): raise ValueError('stride should be a list or tuple ', 'with length 2, (stride_width, stride_height).') anchor_sizes = list(map(float, anchor_sizes)) aspect_ratios = list(map(float, aspect_ratios)) stride = list(map(float, stride)) if in_dynamic_mode(): attrs = ('anchor_sizes', anchor_sizes, 'aspect_ratios', aspect_ratios, 'variances', variance, 'stride', stride, 'offset', offset) anchor, var = C_ops.anchor_generator(input, *attrs) return anchor, var helper = LayerHelper("anchor_generator", **locals()) dtype = helper.input_dtype() attrs = { 'anchor_sizes': anchor_sizes, 'aspect_ratios': aspect_ratios, 'variances': variance, 'stride': stride, 'offset': offset } anchor = helper.create_variable_for_type_inference(dtype) var = helper.create_variable_for_type_inference(dtype) helper.append_op( type="anchor_generator", inputs={"Input": input}, outputs={"Anchors": anchor, "Variances": var}, attrs=attrs, ) anchor.stop_gradient = True var.stop_gradient = True return anchor, var @paddle.jit.not_to_static def distribute_fpn_proposals(fpn_rois, min_level, max_level, refer_level, refer_scale, pixel_offset=False, rois_num=None, name=None): r""" **This op only takes LoDTensor as input.** In Feature Pyramid Networks (FPN) models, it is needed to distribute all proposals into different FPN level, with respect to scale of the proposals, the referring scale and the referring level. Besides, to restore the order of proposals, we return an array which indicates the original index of rois in current proposals. 
To compute FPN level for each roi, the formula is given as follows: .. math:: roi\_scale = \sqrt{BBoxArea(fpn\_roi)}, \qquad level = \left\lfloor \log_{2}\left(\frac{roi\_scale}{refer\_scale}\right) + refer\_level \right\rfloor where BBoxArea is a function to compute the area of each roi. Args: fpn_rois(Variable): 2-D Tensor with shape [N, 4] and data type is float32 or float64. The input fpn_rois. min_level(int32): The lowest level of FPN layer where the proposals come from. max_level(int32): The highest level of FPN layer where the proposals come from. refer_level(int32): The referring level of FPN layer with specified scale. refer_scale(int32): The referring scale of FPN layer with specified level. rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. The shape is [B] and data type is int32. B is the number of images. If it is not None then return a list of 1-D Tensor. Each element is the output RoIs' number of each image on the corresponding level and the shape is [B]. None by default. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually there is no need to set name; None by default. Returns: Tuple: multi_rois(List) : A list of 2-D LoDTensor with shape [M, 4] and data type of float32 or float64. The length is max_level-min_level+1. The proposals in each FPN level. restore_ind(Variable): A 2-D Tensor with shape [N, 1], N is the number of total rois. The data type is int32. It is used to restore the order of fpn_rois. rois_num_per_level(List): A list of 1-D Tensor and each Tensor is the RoIs' number in each image on the corresponding level. The shape is [B] and data type of int32. B is the number of images. Examples: .. code-block:: python import paddle from ppdet.modeling import ops paddle.enable_static() fpn_rois = paddle.static.data( name='data', shape=[None, 4], dtype='float32', lod_level=1) multi_rois, restore_ind = ops.distribute_fpn_proposals( fpn_rois=fpn_rois, min_level=2, max_level=5, refer_level=4, refer_scale=224) """ num_lvl = max_level - min_level + 1 if in_dynamic_mode(): assert rois_num is not None, "rois_num should not be None in dygraph mode."
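# NOTE (illustration added for readability; not part of the original source):
# the legacy C++ op invoked below implements the FPN level rule from the
# docstring. A minimal numpy sketch of that rule, assuming the defaults
# min_level=2, max_level=5, refer_level=4, refer_scale=224 (the helper name
# `fpn_level` is hypothetical):
#
#     import numpy as np
#
#     def fpn_level(box, min_level=2, max_level=5, refer_level=4,
#                   refer_scale=224):
#         w, h = box[2] - box[0], box[3] - box[1]
#         roi_scale = np.sqrt(w * h)
#         lvl = np.floor(np.log2(roi_scale / refer_scale + 1e-8) + refer_level)
#         return int(np.clip(lvl, min_level, max_level))
#
#     fpn_level([0., 0., 224., 224.])  # -> 4: a 224x224 RoI maps to the refer level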
attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level', refer_level, 'refer_scale', refer_scale, 'pixel_offset', pixel_offset) multi_rois, restore_ind, rois_num_per_level = C_ops.distribute_fpn_proposals( fpn_rois, rois_num, num_lvl, num_lvl, *attrs) return multi_rois, restore_ind, rois_num_per_level else: check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'], 'distribute_fpn_proposals') helper = LayerHelper('distribute_fpn_proposals', **locals()) dtype = helper.input_dtype('fpn_rois') multi_rois = [ helper.create_variable_for_type_inference(dtype) for i in range(num_lvl) ] restore_ind = helper.create_variable_for_type_inference(dtype='int32') inputs = {'FpnRois': fpn_rois} outputs = { 'MultiFpnRois': multi_rois, 'RestoreIndex': restore_ind, } if rois_num is not None: inputs['RoisNum'] = rois_num rois_num_per_level = [ helper.create_variable_for_type_inference(dtype='int32') for i in range(num_lvl) ] outputs['MultiLevelRoIsNum'] = rois_num_per_level else: rois_num_per_level = None helper.append_op( type='distribute_fpn_proposals', inputs=inputs, outputs=outputs, attrs={ 'min_level': min_level, 'max_level': max_level, 'refer_level': refer_level, 'refer_scale': refer_scale, 'pixel_offset': pixel_offset }) return multi_rois, restore_ind, rois_num_per_level @paddle.jit.not_to_static def prior_box(input, image, min_sizes, max_sizes=None, aspect_ratios=[1.], variance=[0.1, 0.1, 0.2, 0.2], flip=False, clip=False, steps=[0.0, 0.0], offset=0.5, min_max_aspect_ratios_order=False, name=None): """ This op generates prior boxes for SSD(Single Shot MultiBox Detector) algorithm. Each position of the input produce N prior boxes, N is determined by the count of min_sizes, max_sizes and aspect_ratios, The size of the box is in range(min_size, max_size) interval, which is generated in sequence according to the aspect_ratios. Parameters: input(Tensor): 4-D tensor(NCHW), the data type should be float32 or float64. image(Tensor): 4-D tensor(NCHW), the input image data of PriorBoxOp, the data type should be float32 or float64. min_sizes(list|tuple|float): the min sizes of generated prior boxes. max_sizes(list|tuple|None): the max sizes of generated prior boxes. Default: None. aspect_ratios(list|tuple|float): the aspect ratios of generated prior boxes. Default: [1.]. variance(list|tuple): the variances to be encoded in prior boxes. Default:[0.1, 0.1, 0.2, 0.2]. flip(bool): Whether to flip aspect ratios. Default:False. clip(bool): Whether to clip out-of-boundary boxes. Default: False. step(list|tuple): Prior boxes step across width and height, If step[0] equals to 0.0 or step[1] equals to 0.0, the prior boxes step across height or weight of the input will be automatically calculated. Default: [0., 0.] offset(float): Prior boxes center offset. Default: 0.5 min_max_aspect_ratios_order(bool): If set True, the output prior box is in order of [min, max, aspect_ratios], which is consistent with Caffe. Please note, this order affects the weights order of convolution layer followed by and does not affect the final detection results. Default: False. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: Tuple: A tuple with two Variable (boxes, variances) boxes(Tensor): the output prior boxes of PriorBox. 4-D tensor, the layout is [H, W, num_priors, 4]. H is the height of input, W is the width of input, num_priors is the total box count of each position of input. 
variances(Tensor): the expanded variances of PriorBox. 4-D tensor, the layout is [H, W, num_priors, 4]. H is the height of input, W is the width of input, num_priors is the total box count of each position of input. Examples: .. code-block:: python import paddle from ppdet.modeling import ops paddle.enable_static() input = paddle.static.data(name="input", shape=[None,3,6,9]) image = paddle.static.data(name="image", shape=[None,3,9,12]) box, var = ops.prior_box( input=input, image=image, min_sizes=[100.], clip=True, flip=True) """ return paddle.vision.ops.prior_box( input, image, min_sizes, max_sizes, aspect_ratios, variance, flip, clip, steps, offset, min_max_aspect_ratios_order, name, ) @paddle.jit.not_to_static def multiclass_nms(bboxes, scores, score_threshold, nms_top_k, keep_top_k, nms_threshold=0.3, normalized=True, nms_eta=1., background_label=-1, return_index=False, return_rois_num=True, rois_num=None, name=None): """ This operator is to do multi-class non maximum suppression (NMS) on boxes and scores. In the NMS step, this operator greedily selects a subset of detection bounding boxes whose scores are larger than score_threshold (if this threshold is provided), then keeps the top nms_top_k boxes by confidence if nms_top_k is larger than -1. Then this operator prunes away boxes that have high IOU (intersection over union) overlap with already selected boxes by adaptive threshold NMS based on parameters of nms_threshold and nms_eta. After the NMS step, at most keep_top_k number of total bboxes are to be kept per image if keep_top_k is larger than -1. Args: bboxes (Tensor): Two types of bboxes are supported: 1. (Tensor) A 3-D Tensor with shape [N, M, K] (K being 4, 8, 16, 24 or 32) represents the predicted locations of M bounding bboxes, N is the batch size. Each bounding box has four coordinate values and the layout is [xmin, ymin, xmax, ymax], when box size equals to 4. 2. (LoDTensor) A 3-D Tensor with shape [M, C, 4] M is the number of bounding boxes, C is the class number scores (Tensor): Two types of scores are supported: 1. (Tensor) A 3-D Tensor with shape [N, C, M] represents the predicted confidence predictions. N is the batch size, C is the class number, M is the number of bounding boxes. For each category there are total M scores corresponding to the M bounding boxes. Please note, M is equal to the 2nd dimension of BBoxes. 2. (LoDTensor) A 2-D LoDTensor with shape [M, C]. M is the number of bbox, C is the class number. In this case, input BBoxes should be the second case with shape [M, C, 4]. background_label (int): The index of background label, the background label will be ignored. If set to -1, then all categories will be considered. Default: -1 score_threshold (float): Threshold to filter out bounding boxes with low confidence score. If not provided, consider all boxes. nms_top_k (int): Maximum number of detections to be kept according to the confidences after the filtering detections based on score_threshold. nms_threshold (float): The threshold to be used in NMS. Default: 0.3 nms_eta (float): The coefficient for adaptive NMS threshold decay. Default: 1.0 keep_top_k (int): Number of total bboxes to be kept per image after NMS step. -1 means keeping all bboxes after NMS step. normalized (bool): Whether detections are normalized. Default: True return_index(bool): Whether return selected index. Default: False rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. The shape is [B] and data type is int32. B is the number of images. If it is not None then return a list of 1-D Tensor.
Each element is the output RoIs' number of each image on the corresponding level and the shape is [B]. None by default. name(str): Name of the multiclass nms op. Default: None. Returns: A tuple with two Variables: (Out, Index) if return_index is True, otherwise, a tuple with one Variable(Out) is returned. Out: A 2-D LoDTensor with shape [No, 6] represents the detections. Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] or A 2-D LoDTensor with shape [No, 10] represents the detections. Each row has 10 values: [label, confidence, x1, y1, x2, y2, x3, y3, x4, y4]. No is the total number of detections. If all images have not detected results, all elements in LoD will be 0, and output tensor is empty (None). Index: Only return when return_index is True. A 2-D LoDTensor with shape [No, 1] represents the selected index which type is Integer. The index is the absolute value cross batches. No is the same number as Out. If the index is used to gather other attribute such as age, one needs to reshape the input(N, M, 1) to (N * M, 1) as first, where N is the batch size and M is the number of boxes. Examples: .. code-block:: python import paddle from ppdet.modeling import ops boxes = paddle.static.data(name='bboxes', shape=[81, 4], dtype='float32', lod_level=1) scores = paddle.static.data(name='scores', shape=[81], dtype='float32', lod_level=1) out, index = ops.multiclass_nms(bboxes=boxes, scores=scores, background_label=0, score_threshold=0.5, nms_top_k=400, nms_threshold=0.3, keep_top_k=200, normalized=False, return_index=True) """ helper = LayerHelper('multiclass_nms3', **locals()) if HAVE_PIR and in_dynamic_or_pir_mode(): # https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/phi/ops/yaml/ops.yaml#L3175 attrs = (score_threshold, nms_top_k, keep_top_k, nms_threshold, normalized, nms_eta, background_label, ) output, index, nms_rois_num = paddle._C_ops.multiclass_nms3(bboxes, scores, rois_num, *attrs) if not return_index: index = None return output, nms_rois_num, index elif in_dynamic_mode(): attrs = ('background_label', background_label, 'score_threshold', score_threshold, 'nms_top_k', nms_top_k, 'nms_threshold', nms_threshold, 'keep_top_k', keep_top_k, 'nms_eta', nms_eta, 'normalized', normalized) output, index, nms_rois_num = C_ops.multiclass_nms3(bboxes, scores, rois_num, *attrs) if not return_index: index = None return output, nms_rois_num, index else: output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) index = helper.create_variable_for_type_inference(dtype='int32') inputs = {'BBoxes': bboxes, 'Scores': scores} outputs = {'Out': output, 'Index': index} if rois_num is not None: inputs['RoisNum'] = rois_num if return_rois_num: nms_rois_num = helper.create_variable_for_type_inference( dtype='int32') outputs['NmsRoisNum'] = nms_rois_num helper.append_op( type="multiclass_nms3", inputs=inputs, attrs={ 'background_label': background_label, 'score_threshold': score_threshold, 'nms_top_k': nms_top_k, 'nms_threshold': nms_threshold, 'keep_top_k': keep_top_k, 'nms_eta': nms_eta, 'normalized': normalized }, outputs=outputs) output.stop_gradient = True index.stop_gradient = True if not return_index: index = None if not return_rois_num: nms_rois_num = None return output, nms_rois_num, index @paddle.jit.not_to_static def matrix_nms(bboxes, scores, score_threshold, post_threshold, nms_top_k, keep_top_k, use_gaussian=False, gaussian_sigma=2., background_label=0, normalized=True, return_index=False, return_rois_num=True, name=None): """ **Matrix NMS** This operator does 
matrix non maximum suppression (NMS). First selects a subset of candidate bounding boxes that have higher scores than score_threshold (if provided), then the top nms_top_k candidates are kept if nms_top_k is larger than -1. Scores of the remaining candidates are then decayed according to the Matrix NMS scheme. After the NMS step, at most keep_top_k number of total bboxes are to be kept per image if keep_top_k is larger than -1. Args: bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the predicted locations of M bounding bboxes, N is the batch size. Each bounding box has four coordinate values and the layout is [xmin, ymin, xmax, ymax], when box size equals to 4. The data type is float32 or float64. scores (Tensor): A 3-D Tensor with shape [N, C, M] represents the predicted confidence predictions. N is the batch size, C is the class number, M is the number of bounding boxes. For each category there are total M scores corresponding to the M bounding boxes. Please note, M is equal to the 2nd dimension of BBoxes. The data type is float32 or float64. score_threshold (float): Threshold to filter out bounding boxes with low confidence score. post_threshold (float): Threshold to filter out bounding boxes with low confidence score AFTER decaying. nms_top_k (int): Maximum number of detections to be kept according to the confidences after the filtering detections based on score_threshold. keep_top_k (int): Number of total bboxes to be kept per image after NMS step. -1 means keeping all bboxes after NMS step. use_gaussian (bool): Use Gaussian as the decay function. Default: False gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0 background_label (int): The index of background label, the background label will be ignored. If set to -1, then all categories will be considered. Default: 0 normalized (bool): Whether detections are normalized. Default: True return_index(bool): Whether return selected index. Default: False return_rois_num(bool): whether return rois_num. Default: True name(str): Name of the matrix nms op. Default: None. Returns: A tuple with three Tensor: (Out, Index, RoisNum) if return_index is True, otherwise, a tuple with two Tensor (Out, RoisNum) is returned. Out (Tensor): A 2-D Tensor with shape [No, 6] containing the detection results. Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] (After version 1.3, when no boxes detected, the lod is changed from {0} to {1}) Index (Tensor): A 2-D Tensor with shape [No, 1] containing the selected indices, which are absolute indices across batches. rois_num (Tensor): A 1-D Tensor with shape [N] containing the number of detected boxes in each image. Examples: ..
code-block:: python import paddle from ppdet.modeling import ops boxes = paddle.static.data(name='bboxes', shape=[None,81, 4], dtype='float32', lod_level=1) scores = paddle.static.data(name='scores', shape=[None,81], dtype='float32', lod_level=1) out = ops.matrix_nms(bboxes=boxes, scores=scores, background_label=0, score_threshold=0.5, post_threshold=0.1, nms_top_k=400, keep_top_k=200, normalized=False) """ check_variable_and_dtype(bboxes, 'BBoxes', ['float32', 'float64'], 'matrix_nms') check_variable_and_dtype(scores, 'Scores', ['float32', 'float64'], 'matrix_nms') check_type(score_threshold, 'score_threshold', float, 'matrix_nms') check_type(post_threshold, 'post_threshold', float, 'matrix_nms') check_type(nms_top_k, 'nms_top_k', int, 'matrix_nms') check_type(keep_top_k, 'keep_top_k', int, 'matrix_nms') check_type(normalized, 'normalized', bool, 'matrix_nms') check_type(use_gaussian, 'use_gaussian', bool, 'matrix_nms') check_type(gaussian_sigma, 'gaussian_sigma', float, 'matrix_nms') check_type(background_label, 'background_label', int, 'matrix_nms') if in_dynamic_mode(): attrs = ('background_label', background_label, 'score_threshold', score_threshold, 'post_threshold', post_threshold, 'nms_top_k', nms_top_k, 'gaussian_sigma', gaussian_sigma, 'use_gaussian', use_gaussian, 'keep_top_k', keep_top_k, 'normalized', normalized) out, index, rois_num = C_ops.matrix_nms(bboxes, scores, *attrs) if not return_index: index = None if not return_rois_num: rois_num = None return out, rois_num, index else: helper = LayerHelper('matrix_nms', **locals()) output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) index = helper.create_variable_for_type_inference(dtype='int32') outputs = {'Out': output, 'Index': index} if return_rois_num: rois_num = helper.create_variable_for_type_inference(dtype='int32') outputs['RoisNum'] = rois_num helper.append_op( type="matrix_nms", inputs={'BBoxes': bboxes, 'Scores': scores}, attrs={ 'background_label': background_label, 'score_threshold': score_threshold, 'post_threshold': post_threshold, 'nms_top_k': nms_top_k, 'gaussian_sigma': gaussian_sigma, 'use_gaussian': use_gaussian, 'keep_top_k': keep_top_k, 'normalized': normalized }, outputs=outputs) output.stop_gradient = True if not return_index: index = None if not return_rois_num: rois_num = None return output, rois_num, index @paddle.jit.not_to_static def box_coder(prior_box, prior_box_var, target_box, code_type="encode_center_size", box_normalized=True, axis=0, name=None): r""" **Box Coder Layer** Encode/Decode the target bounding box with the priorbox information. The encoding schema is described below: .. math:: ox = (tx - px) / pw / pxv oy = (ty - py) / ph / pyv ow = \log(|tw / pw|) / pwv oh = \log(|th / ph|) / phv The decoding schema is described below: .. math:: ox = pw * pxv * tx + px oy = ph * pyv * ty + py ow = \exp(pwv * tw) * pw oh = \exp(phv * th) * ph and the decoded box is returned in corner form as [ox - ow / 2, oy - oh / 2, ox + ow / 2, oy + oh / 2]. Here `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width and height when encoding (when decoding they are the encoded deltas). Similarly, `px`, `py`, `pw`, `ph` denote the priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the encoded/decoded coordinates, width and height. During Box Decoding, two modes for broadcast are supported. Say target box has shape [N, M, 4], and the shape of prior box can be [N, 4] or [M, 4]. Then prior box will broadcast to target box along the assigned axis.
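As a quick numeric check of the decoding schema: with pxv = 0.1, pwv = 0.2, a prior box of center px = 50 and width pw = 20, and deltas tx = 1.0, tw = 0.0, the decoded center is 20 * 0.1 * 1.0 + 50 = 52 and the decoded width is exp(0.2 * 0.0) * 20 = 20, i.e. corners [42, 62] along x.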
Args: prior_box(Tensor): Box list prior_box is a 2-D Tensor with shape [M, 4] holds M boxes and data type is float32 or float64. Each box is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate of the anchor box, if the input is image feature map, they are close to the origin of the coordinate system. [xmax, ymax] is the right bottom coordinate of the anchor box. prior_box_var(List|Tensor|None): prior_box_var supports three types of input. One is Tensor with shape [M, 4] which holds M group and data type is float32 or float64. The second is list consist of 4 elements shared by all boxes and data type is float32 or float64. Other is None and not involved in calculation. target_box(Tensor): This input can be a 2-D LoDTensor with shape [N, 4] when code_type is 'encode_center_size'. This input also can be a 3-D Tensor with shape [N, M, 4] when code_type is 'decode_center_size'. Each box is represented as [xmin, ymin, xmax, ymax]. The data type is float32 or float64. code_type(str): The code type used with the target box. It can be `encode_center_size` or `decode_center_size`. `encode_center_size` by default. box_normalized(bool): Whether treat the priorbox as a normalized box. Set true by default. axis(int): Which axis in PriorBox to broadcast for box decode, for example, if axis is 0 and TargetBox has shape [N, M, 4] and PriorBox has shape [M, 4], then PriorBox will broadcast to [N, M, 4] for decoding. It is only valid when code type is `decode_center_size`. Set 0 by default. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: Tensor: output_box(Tensor): When code_type is 'encode_center_size', the output tensor of box_coder_op with shape [N, M, 4] representing the result of N target boxes encoded with M Prior boxes and variances. When code_type is 'decode_center_size', N represents the batch size and M represents the number of decoded boxes. Examples: .. 
code-block:: python import paddle from ppdet.modeling import ops paddle.enable_static() # For encode prior_box_encode = paddle.static.data(name='prior_box_encode', shape=[512, 4], dtype='float32') target_box_encode = paddle.static.data(name='target_box_encode', shape=[81, 4], dtype='float32') output_encode = ops.box_coder(prior_box=prior_box_encode, prior_box_var=[0.1,0.1,0.2,0.2], target_box=target_box_encode, code_type="encode_center_size") # For decode prior_box_decode = paddle.static.data(name='prior_box_decode', shape=[512, 4], dtype='float32') target_box_decode = paddle.static.data(name='target_box_decode', shape=[512, 81, 4], dtype='float32') output_decode = ops.box_coder(prior_box=prior_box_decode, prior_box_var=[0.1,0.1,0.2,0.2], target_box=target_box_decode, code_type="decode_center_size", box_normalized=False, axis=1) """ check_variable_and_dtype(prior_box, 'prior_box', ['float32', 'float64'], 'box_coder') check_variable_and_dtype(target_box, 'target_box', ['float32', 'float64'], 'box_coder') if in_dynamic_mode(): if isinstance(prior_box_var, Variable): output_box = C_ops.box_coder( prior_box, prior_box_var, target_box, "code_type", code_type, "box_normalized", box_normalized, "axis", axis) elif isinstance(prior_box_var, list): output_box = C_ops.box_coder( prior_box, None, target_box, "code_type", code_type, "box_normalized", box_normalized, "axis", axis, "variance", prior_box_var) else: raise TypeError( "Input variance of box_coder must be Variable or list") return output_box else: helper = LayerHelper("box_coder", **locals()) output_box = helper.create_variable_for_type_inference( dtype=prior_box.dtype) inputs = {"PriorBox": prior_box, "TargetBox": target_box} attrs = { "code_type": code_type, "box_normalized": box_normalized, "axis": axis } if isinstance(prior_box_var, Variable): inputs['PriorBoxVar'] = prior_box_var elif isinstance(prior_box_var, list): attrs['variance'] = prior_box_var else: raise TypeError( "Input variance of box_coder must be Variable or list") helper.append_op( type="box_coder", inputs=inputs, attrs=attrs, outputs={"OutputBox": output_box}) return output_box @paddle.jit.not_to_static def generate_proposals(scores, bbox_deltas, im_shape, anchors, variances, pre_nms_top_n=6000, post_nms_top_n=1000, nms_thresh=0.5, min_size=0.1, eta=1.0, pixel_offset=False, return_rois_num=False, name=None): """ **Generate proposal Faster-RCNN** This operation proposes RoIs according to each box with their probability to be a foreground object and the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals could be used to train detection net. For generating proposals, this operation performs following steps: 1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) 2. Calculate box locations as proposals candidates. 3. Clip boxes to image 4. Remove predicted boxes with small area. 5. Apply NMS to get final proposals as output. Args: scores(Tensor): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object. N is batch size, A is number of anchors, H and W are height and width of the feature map. The data type must be float32. bbox_deltas(Tensor): A 4-D Tensor with shape [N, 4*A, H, W] represents the difference between predicted box location and anchor location. The data type must be float32. im_shape(Tensor): A 2-D Tensor with shape [N, 2] represents H, W, the origin image size or input size. The data type can be float32 or float64. 
anchors(Tensor): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map, num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. The data type must be float32. variances(Tensor): A 4-D Tensor. The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format. The data type must be float32. pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. The data type must be float32. `6000` by default. post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. The data type must be float32. `1000` by default. nms_thresh(float): Threshold in NMS. The data type must be float32. `0.5` by default. min_size(float): Remove predicted boxes with either height or width < min_size. The data type must be float32. `0.1` by default. eta(float): Apply in adaptive NMS, if adaptive `threshold > 0.5`, `adaptive_threshold = adaptive_threshold * eta` in each iteration. return_rois_num(bool): When setting True, it will return a 1D Tensor with shape [N, ] that includes Rois's num of each image in one batch. The N is the image's num. For example, the tensor has values [4,5] that represents the first image has 4 Rois, the second image has 5 Rois. It only used in rcnn model. 'False' by default. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: tuple: A tuple with format ``(rpn_rois, rpn_roi_probs)``. - **rpn_rois**: The generated RoIs. 2-D Tensor with shape ``[N, 4]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``. - **rpn_roi_probs**: The scores of generated RoIs. 2-D Tensor with shape ``[N, 1]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``. Examples: .. code-block:: python import paddle from ppdet.modeling import ops paddle.enable_static() scores = paddle.static.data(name='scores', shape=[None, 4, 5, 5], dtype='float32') bbox_deltas = paddle.static.data(name='bbox_deltas', shape=[None, 16, 5, 5], dtype='float32') im_shape = paddle.static.data(name='im_shape', shape=[None, 2], dtype='float32') anchors = paddle.static.data(name='anchors', shape=[None, 5, 4, 4], dtype='float32') variances = paddle.static.data(name='variances', shape=[None, 5, 10, 4], dtype='float32') rois, roi_probs = ops.generate_proposals(scores, bbox_deltas, im_shape, anchors, variances) """ if in_dynamic_mode(): assert return_rois_num, "return_rois_num should be True in dygraph mode." 
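# NOTE (illustration added for readability; not part of the original source):
# steps 3-4 of the pipeline described in the docstring (clip proposals to the
# image, drop boxes smaller than min_size) reduce to a few array ops. A hedged
# numpy sketch (the helper name `clip_and_filter` is hypothetical):
#
#     import numpy as np
#
#     def clip_and_filter(boxes, im_shape, min_size=0.1):
#         h, w = im_shape                               # image height, width
#         boxes[:, 0::2] = boxes[:, 0::2].clip(0, w)    # clip x1, x2 to [0, W]
#         boxes[:, 1::2] = boxes[:, 1::2].clip(0, h)    # clip y1, y2 to [0, H]
#         ws = boxes[:, 2] - boxes[:, 0]
#         hs = boxes[:, 3] - boxes[:, 1]
#         return boxes[(ws >= min_size) & (hs >= min_size)]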
attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n, 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta, 'pixel_offset', pixel_offset) rpn_rois, rpn_roi_probs, rpn_rois_num = C_ops.generate_proposals_v2( scores, bbox_deltas, im_shape, anchors, variances, *attrs) if not return_rois_num: rpn_rois_num = None return rpn_rois, rpn_roi_probs, rpn_rois_num else: helper = LayerHelper('generate_proposals_v2', **locals()) check_variable_and_dtype(scores, 'scores', ['float32'], 'generate_proposals_v2') check_variable_and_dtype(bbox_deltas, 'bbox_deltas', ['float32'], 'generate_proposals_v2') check_variable_and_dtype(im_shape, 'im_shape', ['float32', 'float64'], 'generate_proposals_v2') check_variable_and_dtype(anchors, 'anchors', ['float32'], 'generate_proposals_v2') check_variable_and_dtype(variances, 'variances', ['float32'], 'generate_proposals_v2') rpn_rois = helper.create_variable_for_type_inference( dtype=bbox_deltas.dtype) rpn_roi_probs = helper.create_variable_for_type_inference( dtype=scores.dtype) outputs = { 'RpnRois': rpn_rois, 'RpnRoiProbs': rpn_roi_probs, } if return_rois_num: rpn_rois_num = helper.create_variable_for_type_inference( dtype='int32') rpn_rois_num.stop_gradient = True outputs['RpnRoisNum'] = rpn_rois_num helper.append_op( type="generate_proposals_v2", inputs={ 'Scores': scores, 'BboxDeltas': bbox_deltas, 'ImShape': im_shape, 'Anchors': anchors, 'Variances': variances }, attrs={ 'pre_nms_topN': pre_nms_top_n, 'post_nms_topN': post_nms_top_n, 'nms_thresh': nms_thresh, 'min_size': min_size, 'eta': eta, 'pixel_offset': pixel_offset }, outputs=outputs) rpn_rois.stop_gradient = True rpn_roi_probs.stop_gradient = True if not return_rois_num: rpn_rois_num = None return rpn_rois, rpn_roi_probs, rpn_rois_num def sigmoid_cross_entropy_with_logits(input, label, ignore_index=-100, normalize=False): output = F.binary_cross_entropy_with_logits(input, label, reduction='none') mask_tensor = paddle.cast(label != ignore_index, 'float32') output = paddle.multiply(output, mask_tensor) if normalize: sum_valid_mask = paddle.sum(mask_tensor) output = output / sum_valid_mask return output def smooth_l1(input, label, inside_weight=None, outside_weight=None, sigma=None): input_new = paddle.multiply(input, inside_weight) label_new = paddle.multiply(label, inside_weight) delta = 1 / (sigma * sigma) out = F.smooth_l1_loss(input_new, label_new, reduction='none', delta=delta) out = paddle.multiply(out, outside_weight) out = out / delta out = paddle.reshape(out, shape=[out.shape[0], -1]) out = paddle.sum(out, axis=1) return out def channel_shuffle(x, groups): batch_size, num_channels, height, width = x.shape[0:4] assert num_channels % groups == 0, 'num_channels should be divisible by groups' channels_per_group = num_channels // groups x = paddle.reshape( x=x, shape=[batch_size, groups, channels_per_group, height, width]) x = paddle.transpose(x=x, perm=[0, 2, 1, 3, 4]) x = paddle.reshape(x=x, shape=[batch_size, num_channels, height, width]) return x def get_static_shape(tensor): shape = paddle.shape(tensor) shape.stop_gradient = True return shape ================================================ FILE: ppdet/modeling/post_process.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling.bbox_utils import nonempty_bbox from .transformers import bbox_cxcywh_to_xyxy try: from collections.abc import Sequence except Exception: from collections import Sequence __all__ = [ 'BBoxPostProcess', 'MaskPostProcess', 'JDEBBoxPostProcess', 'CenterNetPostProcess', 'DETRPostProcess', 'SparsePostProcess', 'DETRBBoxSemiPostProcess' ] @register class BBoxPostProcess(object): __shared__ = ['num_classes', 'export_onnx', 'export_eb'] __inject__ = ['decode', 'nms'] def __init__(self, num_classes=80, decode=None, nms=None, export_onnx=False, export_eb=False): super(BBoxPostProcess, self).__init__() self.num_classes = num_classes self.decode = decode self.nms = nms self.export_onnx = export_onnx self.export_eb = export_eb def __call__(self, head_out, rois, im_shape, scale_factor): """ Decode the bbox and do NMS if needed. Args: head_out (tuple): bbox_pred and cls_prob of bbox_head output. rois (tuple): roi and rois_num of rpn_head output. im_shape (Tensor): The shape of the input image. scale_factor (Tensor): The scale factor of the input image. export_onnx (bool): whether export model to onnx Returns: bbox_pred (Tensor): The output prediction with shape [N, 6], including labels, scores and bboxes. The size of bboxes are corresponding to the input image, the bboxes may be used in other branch. bbox_num (Tensor): The number of prediction boxes of each batch with shape [1], and is N. """ if self.nms is not None: bboxes, score = self.decode(head_out, rois, im_shape, scale_factor) bbox_pred, bbox_num, before_nms_indexes = self.nms(bboxes, score, self.num_classes) else: bbox_pred, bbox_num = self.decode(head_out, rois, im_shape, scale_factor) if self.export_onnx: # add fake box after postprocess when exporting onnx fake_bboxes = paddle.to_tensor( np.array( [[0., 0.0, 0.0, 0.0, 1.0, 1.0]], dtype='float32')) bbox_pred = paddle.concat([bbox_pred, fake_bboxes]) bbox_num = bbox_num + 1 if self.nms is not None: return bbox_pred, bbox_num, before_nms_indexes else: return bbox_pred, bbox_num def get_pred(self, bboxes, bbox_num, im_shape, scale_factor): """ Rescale, clip and filter the bbox from the output of NMS to get final prediction. Notes: Currently only support bs = 1. Args: bboxes (Tensor): The output bboxes with shape [N, 6] after decode and NMS, including labels, scores and bboxes. bbox_num (Tensor): The number of prediction boxes of each batch with shape [1], and is N. im_shape (Tensor): The shape of the input image. scale_factor (Tensor): The scale factor of the input image. Returns: pred_result (Tensor): The final prediction results with shape [N, 6] including labels, scores and bboxes. """ if self.export_eb: # enable rcnn models for edgeboard hw to skip the following postprocess. 
return bboxes, bboxes, bbox_num if not self.export_onnx: bboxes_list = [] bbox_num_list = [] id_start = 0 fake_bboxes = paddle.to_tensor( np.array( [[0., 0.0, 0.0, 0.0, 1.0, 1.0]], dtype='float32')) fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) # add fake bbox when output is empty for each batch for i in range(bbox_num.shape[0]): if bbox_num[i] == 0: bboxes_i = fake_bboxes bbox_num_i = fake_bbox_num else: bboxes_i = bboxes[id_start:id_start + bbox_num[i], :] bbox_num_i = bbox_num[i:i + 1] # id_start: 0-dim, bbox_num: 1-dim. Use bbox_num[i] instead of bbox_num[i:i+1] in pir. id_start += bbox_num[i] bboxes_list.append(bboxes_i) bbox_num_list.append(bbox_num_i) bboxes = paddle.concat(bboxes_list) bbox_num = paddle.concat(bbox_num_list) origin_shape = paddle.floor(im_shape / scale_factor + 0.5) if not self.export_onnx: origin_shape_list = [] scale_factor_list = [] # scale_factor: scale_y, scale_x for i in range(bbox_num.shape[0]): expand_shape = paddle.expand(origin_shape[i:i + 1, :], [bbox_num[i:i + 1], 2]) scale_y, scale_x = scale_factor[i, 0], scale_factor[i, 1] # TODO(PIR): something wrong with slice op, remove unsqueeze in the future. scale_y = paddle.unsqueeze(scale_y, 0) scale_x = paddle.unsqueeze(scale_x, 0) scale = paddle.concat([scale_x, scale_y, scale_x, scale_y]) expand_scale = paddle.expand(scale, [bbox_num[i:i + 1], 4]) origin_shape_list.append(expand_shape) scale_factor_list.append(expand_scale) self.origin_shape_list = paddle.concat(origin_shape_list) scale_factor_list = paddle.concat(scale_factor_list) else: # simplify the computation for bs=1 when exporting onnx scale_y, scale_x = scale_factor[0][0], scale_factor[0][1] scale = paddle.concat( [scale_x, scale_y, scale_x, scale_y]).unsqueeze(0) self.origin_shape_list = paddle.expand(origin_shape, [bbox_num[0:1], 2]) scale_factor_list = paddle.expand(scale, [bbox_num[0:1], 4]) # bboxes: [N, 6], label, score, bbox pred_label = bboxes[:, 0:1] pred_score = bboxes[:, 1:2] pred_bbox = bboxes[:, 2:] # rescale bbox to original image scaled_bbox = pred_bbox / scale_factor_list origin_h = self.origin_shape_list[:, 0] origin_w = self.origin_shape_list[:, 1] zeros = paddle.zeros_like(origin_h) # clip bbox to [0, original_size] x1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 0], origin_w), zeros) y1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 1], origin_h), zeros) x2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 2], origin_w), zeros) y2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 3], origin_h), zeros) pred_bbox = paddle.stack([x1, y1, x2, y2], axis=-1) # filter empty bbox keep_mask = nonempty_bbox(pred_bbox, return_mask=True) keep_mask = paddle.unsqueeze(keep_mask, [1]) pred_label = paddle.where(keep_mask, pred_label, paddle.ones_like(pred_label) * -1) pred_result = paddle.concat([pred_label, pred_score, pred_bbox], axis=1) return bboxes, pred_result, bbox_num def get_origin_shape(self, ): return self.origin_shape_list @register class MaskPostProcess(object): __shared__ = ['export_onnx', 'assign_on_cpu'] """ refer to: https://github.com/facebookresearch/detectron2/layers/mask_ops.py Get Mask output according to the output from model """ def __init__(self, binary_thresh=0.5, export_onnx=False, assign_on_cpu=False): super(MaskPostProcess, self).__init__() self.binary_thresh = binary_thresh self.export_onnx = export_onnx self.assign_on_cpu = assign_on_cpu def __call__(self, mask_out, bboxes, bbox_num, origin_shape): """ Decode the mask_out and paste the mask to the origin image. 
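Each low-resolution mask is resampled into its box region at the original image resolution (see paste_mask below) and then binarized with binary_thresh.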
Args: mask_out (Tensor): mask_head output with shape [N, 28, 28]. bbox_pred (Tensor): The output bboxes with shape [N, 6] after decode and NMS, including labels, scores and bboxes. bbox_num (Tensor): The number of prediction boxes of each batch with shape [1], and is N. origin_shape (Tensor): The origin shape of the input image, the tensor shape is [N, 2], and each row is [h, w]. Returns: pred_result (Tensor): The final prediction mask results with shape [N, h, w] in binary mask style. """ num_mask = mask_out.shape[0] origin_shape = paddle.cast(origin_shape, 'int32') device = paddle.device.get_device() if self.export_onnx: h, w = origin_shape[0][0], origin_shape[0][1] mask_onnx = paste_mask(mask_out[:, None, :, :], bboxes[:, 2:], h, w, self.assign_on_cpu) mask_onnx = mask_onnx >= self.binary_thresh pred_result = paddle.cast(mask_onnx, 'int32') else: max_h = paddle.max(origin_shape[:, 0]) max_w = paddle.max(origin_shape[:, 1]) pred_result = paddle.zeros( [num_mask, max_h, max_w], dtype='int32') - 1 id_start = 0 for i in range(bbox_num.shape[0]): bboxes_i = bboxes[id_start:id_start + bbox_num[i], :] mask_out_i = mask_out[id_start:id_start + bbox_num[i], :, :] im_h = origin_shape[i, 0] im_w = origin_shape[i, 1] pred_mask = paste_mask(mask_out_i[:, None, :, :], bboxes_i[:, 2:], im_h, im_w, self.assign_on_cpu) pred_mask = paddle.cast(pred_mask >= self.binary_thresh, 'int32') pred_result[id_start:id_start + bbox_num[i], :im_h, : im_w] = pred_mask id_start += bbox_num[i] if self.assign_on_cpu: paddle.set_device(device) return pred_result @register class JDEBBoxPostProcess(nn.Layer): __shared__ = ['num_classes'] __inject__ = ['decode', 'nms'] def __init__(self, num_classes=1, decode=None, nms=None, return_idx=True): super(JDEBBoxPostProcess, self).__init__() self.num_classes = num_classes self.decode = decode self.nms = nms self.return_idx = return_idx self.fake_bbox_pred = paddle.to_tensor( np.array( [[-1, 0.0, 0.0, 0.0, 0.0, 0.0]], dtype='float32')) self.fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) self.fake_nms_keep_idx = paddle.to_tensor( np.array( [[0]], dtype='int32')) self.fake_yolo_boxes_out = paddle.to_tensor( np.array( [[[0.0, 0.0, 0.0, 0.0]]], dtype='float32')) self.fake_yolo_scores_out = paddle.to_tensor( np.array( [[[0.0]]], dtype='float32')) self.fake_boxes_idx = paddle.to_tensor(np.array([[0]], dtype='int64')) def forward(self, head_out, anchors): """ Decode the bbox and do NMS for JDE model. Args: head_out (list): Bbox_pred and cls_prob of bbox_head output. anchors (list): Anchors of JDE model. Returns: boxes_idx (Tensor): The index of kept bboxes after decode 'JDEBox'. bbox_pred (Tensor): The output is the prediction with shape [N, 6] including labels, scores and bboxes. bbox_num (Tensor): The number of prediction of each batch with shape [N]. nms_keep_idx (Tensor): The index of kept bboxes after NMS. 
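Note: when no boxes survive decoding or NMS, fixed fake outputs (a single [-1, 0, 0, 0, 0, 0] box) are returned so that downstream shapes stay valid.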
""" boxes_idx, yolo_boxes_scores = self.decode(head_out, anchors) if len(boxes_idx) == 0: boxes_idx = self.fake_boxes_idx yolo_boxes_out = self.fake_yolo_boxes_out yolo_scores_out = self.fake_yolo_scores_out else: yolo_boxes = paddle.gather_nd(yolo_boxes_scores, boxes_idx) # TODO: only support bs=1 now yolo_boxes_out = paddle.reshape( yolo_boxes[:, :4], shape=[1, len(boxes_idx), 4]) yolo_scores_out = paddle.reshape( yolo_boxes[:, 4:5], shape=[1, 1, len(boxes_idx)]) boxes_idx = boxes_idx[:, 1:] if self.return_idx: bbox_pred, bbox_num, nms_keep_idx = self.nms( yolo_boxes_out, yolo_scores_out, self.num_classes) if bbox_pred.shape[0] == 0: bbox_pred = self.fake_bbox_pred bbox_num = self.fake_bbox_num nms_keep_idx = self.fake_nms_keep_idx return boxes_idx, bbox_pred, bbox_num, nms_keep_idx else: bbox_pred, bbox_num, _ = self.nms(yolo_boxes_out, yolo_scores_out, self.num_classes) if bbox_pred.shape[0] == 0: bbox_pred = self.fake_bbox_pred bbox_num = self.fake_bbox_num return _, bbox_pred, bbox_num, _ @register class CenterNetPostProcess(object): """ Postprocess the model outputs to get final prediction: 1. Do NMS for heatmap to get top `max_per_img` bboxes. 2. Decode bboxes using center offset and box size. 3. Rescale decoded bboxes reference to the origin image shape. Args: max_per_img(int): the maximum number of predicted objects in a image, 500 by default. down_ratio(int): the down ratio from images to heatmap, 4 by default. regress_ltrb (bool): whether to regress left/top/right/bottom or width/height for a box, true by default. """ __shared__ = ['down_ratio'] def __init__(self, max_per_img=500, down_ratio=4, regress_ltrb=True): super(CenterNetPostProcess, self).__init__() self.max_per_img = max_per_img self.down_ratio = down_ratio self.regress_ltrb = regress_ltrb # _simple_nms() _topk() are same as TTFBox in ppdet/modeling/layers.py def _simple_nms(self, heat, kernel=3): """ Use maxpool to filter the max score, get local peaks. """ pad = (kernel - 1) // 2 hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad) keep = paddle.cast(hmax == heat, 'float32') return heat * keep def _topk(self, scores): """ Select top k scores and decode to get xy coordinates. 
""" k = self.max_per_img shape_fm = paddle.shape(scores) shape_fm.stop_gradient = True cat, height, width = shape_fm[1], shape_fm[2], shape_fm[3] # batch size is 1 scores_r = paddle.reshape(scores, [cat, -1]) topk_scores, topk_inds = paddle.topk(scores_r, k) topk_ys = topk_inds // width topk_xs = topk_inds % width topk_score_r = paddle.reshape(topk_scores, [-1]) topk_score, topk_ind = paddle.topk(topk_score_r, k) k_t = paddle.full(topk_ind.shape, k, dtype='int64') topk_clses = paddle.cast(paddle.floor_divide(topk_ind, k_t), 'float32') topk_inds = paddle.reshape(topk_inds, [-1]) topk_ys = paddle.reshape(topk_ys, [-1, 1]) topk_xs = paddle.reshape(topk_xs, [-1, 1]) topk_inds = paddle.gather(topk_inds, topk_ind) topk_ys = paddle.gather(topk_ys, topk_ind) topk_xs = paddle.gather(topk_xs, topk_ind) return topk_score, topk_inds, topk_clses, topk_ys, topk_xs def __call__(self, hm, wh, reg, im_shape, scale_factor): # 1.get clses and scores, note that hm had been done sigmoid heat = self._simple_nms(hm) scores, inds, topk_clses, ys, xs = self._topk(heat) clses = topk_clses.unsqueeze(1) scores = scores.unsqueeze(1) # 2.get bboxes, note only support batch_size=1 now reg_t = paddle.transpose(reg, [0, 2, 3, 1]) reg = paddle.reshape(reg_t, [-1, reg_t.shape[-1]]) reg = paddle.gather(reg, inds) xs = paddle.cast(xs, 'float32') ys = paddle.cast(ys, 'float32') xs = xs + reg[:, 0:1] ys = ys + reg[:, 1:2] wh_t = paddle.transpose(wh, [0, 2, 3, 1]) wh = paddle.reshape(wh_t, [-1, wh_t.shape[-1]]) wh = paddle.gather(wh, inds) if self.regress_ltrb: x1 = xs - wh[:, 0:1] y1 = ys - wh[:, 1:2] x2 = xs + wh[:, 2:3] y2 = ys + wh[:, 3:4] else: x1 = xs - wh[:, 0:1] / 2 y1 = ys - wh[:, 1:2] / 2 x2 = xs + wh[:, 0:1] / 2 y2 = ys + wh[:, 1:2] / 2 n, c, feat_h, feat_w = paddle.shape(hm) padw = (feat_w * self.down_ratio - im_shape[0, 1]) / 2 padh = (feat_h * self.down_ratio - im_shape[0, 0]) / 2 x1 = x1 * self.down_ratio y1 = y1 * self.down_ratio x2 = x2 * self.down_ratio y2 = y2 * self.down_ratio x1 = x1 - padw y1 = y1 - padh x2 = x2 - padw y2 = y2 - padh bboxes = paddle.concat([x1, y1, x2, y2], axis=1) scale_y = scale_factor[:, 0:1] scale_x = scale_factor[:, 1:2] scale_expand = paddle.concat( [scale_x, scale_y, scale_x, scale_y], axis=1) boxes_shape = bboxes.shape[:] scale_expand = paddle.expand(scale_expand, shape=boxes_shape) bboxes = paddle.divide(bboxes, scale_expand) results = paddle.concat([clses, scores, bboxes], axis=1) return results, paddle.shape(results)[0:1], inds, topk_clses, ys, xs @register class DETRPostProcess(object): __shared__ = ['num_classes', 'use_focal_loss', 'with_mask'] __inject__ = [] def __init__(self, num_classes=80, num_top_queries=100, dual_queries=False, dual_groups=0, use_focal_loss=False, with_mask=False, mask_stride=4, mask_threshold=0.5, use_avg_mask_score=False, bbox_decode_type='origin'): super(DETRPostProcess, self).__init__() assert bbox_decode_type in ['origin', 'pad'] self.num_classes = num_classes self.num_top_queries = num_top_queries self.dual_queries = dual_queries self.dual_groups = dual_groups self.use_focal_loss = use_focal_loss self.with_mask = with_mask self.mask_stride = mask_stride self.mask_threshold = mask_threshold self.use_avg_mask_score = use_avg_mask_score self.bbox_decode_type = bbox_decode_type def _mask_postprocess(self, mask_pred, score_pred): mask_score = F.sigmoid(mask_pred) mask_pred = (mask_score > self.mask_threshold).astype(mask_score.dtype) if self.use_avg_mask_score: avg_mask_score = (mask_pred * mask_score).sum([-2, -1]) / ( mask_pred.sum([-2, -1]) + 1e-6) 
score_pred *= avg_mask_score return mask_pred.flatten(0, 1).astype('int32'), score_pred def __call__(self, head_out, im_shape, scale_factor, pad_shape): """ Decode the bbox and mask. Args: head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output. im_shape (Tensor): The shape of the input image without padding. scale_factor (Tensor): The scale factor of the input image. pad_shape (Tensor): The shape of the input image with padding. Returns: bbox_pred (Tensor): The output prediction with shape [N, 6], including labels, scores and bboxes. The size of bboxes are corresponding to the input image, the bboxes may be used in other branch. bbox_num (Tensor): The number of prediction boxes of each batch with shape [bs], and is N. """ bboxes, logits, masks = head_out if self.dual_queries: num_queries = logits.shape[1] logits, bboxes = logits[:, :int(num_queries // (self.dual_groups + 1)), :], \ bboxes[:, :int(num_queries // (self.dual_groups + 1)), :] bbox_pred = bbox_cxcywh_to_xyxy(bboxes) # calculate the original shape of the image origin_shape = paddle.floor(im_shape / scale_factor + 0.5) img_h, img_w = paddle.split(origin_shape, 2, axis=-1) if self.bbox_decode_type == 'pad': # calculate the shape of the image with padding out_shape = pad_shape / im_shape * origin_shape out_shape = out_shape.flip(1).tile([1, 2]).unsqueeze(1) elif self.bbox_decode_type == 'origin': out_shape = origin_shape.flip(1).tile([1, 2]).unsqueeze(1) else: raise Exception( f'Wrong `bbox_decode_type`: {self.bbox_decode_type}.') bbox_pred *= out_shape scores = F.sigmoid(logits) if self.use_focal_loss else F.softmax( logits)[:, :, :-1] if not self.use_focal_loss: scores, labels = scores.max(-1), scores.argmax(-1) if scores.shape[1] > self.num_top_queries: scores, index = paddle.topk( scores, self.num_top_queries, axis=-1) batch_ind = paddle.arange( end=scores.shape[0]).unsqueeze(-1).tile( [1, self.num_top_queries]) index = paddle.stack([batch_ind, index], axis=-1) labels = paddle.gather_nd(labels, index) bbox_pred = paddle.gather_nd(bbox_pred, index) else: scores, index = paddle.topk( scores.flatten(1), self.num_top_queries, axis=-1) labels = index % self.num_classes index = index // self.num_classes batch_ind = paddle.arange(end=scores.shape[0]).unsqueeze(-1).tile( [1, self.num_top_queries]) index = paddle.stack([batch_ind, index], axis=-1) bbox_pred = paddle.gather_nd(bbox_pred, index) mask_pred = None if self.with_mask: assert masks is not None assert masks.shape[0] == 1 masks = paddle.gather_nd(masks, index) if self.bbox_decode_type == 'pad': masks = F.interpolate( masks, scale_factor=self.mask_stride, mode="bilinear", align_corners=False) # TODO: Support prediction with bs>1. # remove padding for input image h, w = im_shape.astype('int32')[0] masks = masks[..., :h, :w] # get pred_mask in the original resolution. 
img_h = img_h[0].astype('int32') img_w = img_w[0].astype('int32') masks = F.interpolate( masks, size=[img_h, img_w], mode="bilinear", align_corners=False) mask_pred, scores = self._mask_postprocess(masks, scores) bbox_pred = paddle.concat( [ labels.unsqueeze(-1).astype('float32'), scores.unsqueeze(-1), bbox_pred ], axis=-1) bbox_num = paddle.to_tensor( self.num_top_queries, dtype='int32').tile([bbox_pred.shape[0]]) bbox_pred = bbox_pred.reshape([-1, 6]) return bbox_pred, bbox_num, mask_pred @register class SparsePostProcess(object): __shared__ = ['num_classes', 'assign_on_cpu'] def __init__(self, num_proposals, num_classes=80, binary_thresh=0.5, assign_on_cpu=False): super(SparsePostProcess, self).__init__() self.num_classes = num_classes self.num_proposals = num_proposals self.binary_thresh = binary_thresh self.assign_on_cpu = assign_on_cpu def __call__(self, scores, bboxes, scale_factor, ori_shape, masks=None): assert len(scores) == len(bboxes) == \ len(ori_shape) == len(scale_factor) device = paddle.device.get_device() batch_size = len(ori_shape) scores = F.sigmoid(scores) has_mask = masks is not None if has_mask: masks = F.sigmoid(masks) masks = masks.reshape([batch_size, -1, *masks.shape[1:]]) bbox_pred = [] mask_pred = [] if has_mask else None bbox_num = paddle.zeros([batch_size], dtype='int32') for i in range(batch_size): score = scores[i] bbox = bboxes[i] score, indices = score.flatten(0, 1).topk( self.num_proposals, sorted=False) label = indices % self.num_classes if has_mask: mask = masks[i] mask = mask.flatten(0, 1)[indices] H, W = ori_shape[i][0], ori_shape[i][1] bbox = bbox[paddle.cast(indices / self.num_classes, indices.dtype)] bbox /= scale_factor[i] bbox[:, 0::2] = paddle.clip(bbox[:, 0::2], 0, W) bbox[:, 1::2] = paddle.clip(bbox[:, 1::2], 0, H) keep = ((bbox[:, 2] - bbox[:, 0]).numpy() > 1.) & \ ((bbox[:, 3] - bbox[:, 1]).numpy() > 1.) if keep.sum() == 0: bbox = paddle.zeros([1, 6], dtype='float32') if has_mask: mask = paddle.zeros([1, H, W], dtype='uint8') else: label = paddle.to_tensor(label.numpy()[keep]).astype( 'float32').unsqueeze(-1) score = paddle.to_tensor(score.numpy()[keep]).astype( 'float32').unsqueeze(-1) bbox = paddle.to_tensor(bbox.numpy()[keep]).astype('float32') if has_mask: mask = paddle.to_tensor(mask.numpy()[keep]).astype( 'float32').unsqueeze(1) mask = paste_mask(mask, bbox, H, W, self.assign_on_cpu) mask = paddle.cast(mask >= self.binary_thresh, 'uint8') bbox = paddle.concat([label, score, bbox], axis=-1) bbox_num[i] = bbox.shape[0] bbox_pred.append(bbox) if has_mask: mask_pred.append(mask) bbox_pred = paddle.concat(bbox_pred) mask_pred = paddle.concat(mask_pred) if has_mask else None if self.assign_on_cpu: paddle.set_device(device) if has_mask: return bbox_pred, bbox_num, mask_pred else: return bbox_pred, bbox_num def paste_mask(masks, boxes, im_h, im_w, assign_on_cpu=False): """ Paste the mask prediction to the original image. 
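Image pixel centers are mapped into each box's normalized [-1, 1] coordinate frame and the low-resolution masks are bilinearly resampled there with F.grid_sample.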
""" x0_int, y0_int = 0, 0 x1_int, y1_int = im_w, im_h x0, y0, x1, y1 = paddle.split(boxes, 4, axis=1) N = masks.shape[0] img_y = paddle.arange(y0_int, y1_int) + 0.5 img_x = paddle.arange(x0_int, x1_int) + 0.5 img_y = (img_y - y0) / (y1 - y0) * 2 - 1 img_x = (img_x - x0) / (x1 - x0) * 2 - 1 # img_x, img_y have shapes (N, w), (N, h) if assign_on_cpu: paddle.set_device('cpu') gx = img_x[:, None, :].expand( [N, img_y.shape[1], img_x.shape[1]]) gy = img_y[:, :, None].expand( [N, img_y.shape[1], img_x.shape[1]]) grid = paddle.stack([gx, gy], axis=3) img_masks = F.grid_sample(masks, grid, align_corners=False) return img_masks[:, 0] def multiclass_nms(bboxs, num_classes, match_threshold=0.6, match_metric='iou'): final_boxes = [] for c in range(num_classes): idxs = bboxs[:, 0] == c if np.count_nonzero(idxs) == 0: continue r = nms(bboxs[idxs, 1:], match_threshold, match_metric) final_boxes.append(np.concatenate([np.full((r.shape[0], 1), c), r], 1)) return final_boxes def nms(dets, match_threshold=0.6, match_metric='iou'): """ Apply NMS to avoid detecting too many overlapping bounding boxes. Args: dets: shape [N, 5], [score, x1, y1, x2, y2] match_metric: 'iou' or 'ios' match_threshold: overlap thresh for match metric. """ if dets.shape[0] == 0: return dets[[], :] scores = dets[:, 0] x1 = dets[:, 1] y1 = dets[:, 2] x2 = dets[:, 3] y2 = dets[:, 4] areas = (x2 - x1 + 1) * (y2 - y1 + 1) order = scores.argsort()[::-1] keep = [] while order.size > 0: i = order[0] keep.append(i) xx1 = np.maximum(x1[i], x1[order[1:]]) yy1 = np.maximum(y1[i], y1[order[1:]]) xx2 = np.minimum(x2[i], x2[order[1:]]) yy2 = np.minimum(y2[i], y2[order[1:]]) w = np.maximum(0.0, xx2 - xx1 + 1) h = np.maximum(0.0, yy2 - yy1 + 1) inter = w * h if match_metric == 'iou': union = areas[i] + areas[order[1:]] - inter match_value = inter / union elif match_metric == 'ios': smaller = np.minimum(areas[i], areas[order[1:]]) match_value = inter / smaller else: raise ValueError() inds = np.where(match_value < match_threshold)[0] order = order[inds + 1] dets = dets[keep, :] return dets @register class DETRBBoxSemiPostProcess(object): __shared__ = ['num_classes', 'use_focal_loss'] __inject__ = [] def __init__(self, num_classes=80, num_top_queries=100, use_focal_loss=False): super(DETRBBoxSemiPostProcess, self).__init__() self.num_classes = num_classes self.num_top_queries = num_top_queries self.use_focal_loss = use_focal_loss def __call__(self, head_out): """ Decode the bbox. Args: head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output. im_shape (Tensor): The shape of the input image. scale_factor (Tensor): The scale factor of the input image. Returns: bbox_pred (Tensor): The output prediction with shape [N, 6], including labels, scores and bboxes. The size of bboxes are corresponding to the input image, the bboxes may be used in other branch. bbox_num (Tensor): The number of prediction boxes of each batch with shape [bs], and is N. 
""" bboxes, logits, masks = head_out bbox_pred = bboxes scores = F.softmax(logits, axis=2) import copy soft_scores = copy.deepcopy(scores) scores, index = paddle.topk(scores.max(-1), 300, axis=-1) batch_ind = paddle.arange(end=scores.shape[0]).unsqueeze(-1).tile( [1, 300]) index = paddle.stack([batch_ind, index], axis=-1) labels = paddle.gather_nd(soft_scores.argmax(-1), index).astype('int32') score_class = paddle.gather_nd(soft_scores, index) bbox_pred = paddle.gather_nd(bbox_pred, index) bbox_pred = paddle.concat( [ labels.unsqueeze(-1).astype('float32'), score_class, scores.unsqueeze(-1), bbox_pred ], axis=-1) bbox_num = paddle.to_tensor( bbox_pred.shape[1], dtype='int32').tile([bbox_pred.shape[0]]) bbox_pred = bbox_pred.reshape([-1, bbox_pred.shape[-1]]) return bbox_pred, bbox_num ================================================ FILE: ppdet/modeling/proposal_generator/__init__.py ================================================ from . import rpn_head from . import embedding_rpn_head from .rpn_head import * from .embedding_rpn_head import * ================================================ FILE: ppdet/modeling/proposal_generator/anchor_generator.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The code is based on # https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/anchor_generator.py import math import paddle import paddle.nn as nn import numpy as np from ppdet.core.workspace import register __all__ = ['AnchorGenerator', 'RetinaAnchorGenerator', 'S2ANetAnchorGenerator'] @register class AnchorGenerator(nn.Layer): """ Generate anchors according to the feature maps Args: anchor_sizes (list[float] | list[list[float]]): The anchor sizes at each feature point. list[float] means all feature levels share the same sizes. list[list[float]] means the anchor sizes for each level. The sizes stand for the scale of input size. aspect_ratios (list[float] | list[list[float]]): The aspect ratios at each feature point. list[float] means all feature levels share the same ratios. list[list[float]] means the aspect ratios for each level. strides (list[float]): The strides of feature maps which generate anchors offset (float): The offset of the coordinate of anchors, default 0. 
""" def __init__(self, anchor_sizes=[32, 64, 128, 256, 512], aspect_ratios=[0.5, 1.0, 2.0], strides=[16.0], variance=[1.0, 1.0, 1.0, 1.0], offset=0.): super(AnchorGenerator, self).__init__() self.anchor_sizes = anchor_sizes self.aspect_ratios = aspect_ratios self.strides = strides self.variance = variance self.cell_anchors = self._calculate_anchors(len(strides)) self.offset = offset def _broadcast_params(self, params, num_features): if not isinstance(params[0], (list, tuple)): # list[float] return [params] * num_features if len(params) == 1: return list(params) * num_features return params def generate_cell_anchors(self, sizes, aspect_ratios): anchors = [] for size in sizes: area = size**2.0 for aspect_ratio in aspect_ratios: w = math.sqrt(area / aspect_ratio) h = aspect_ratio * w x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0 anchors.append([x0, y0, x1, y1]) return paddle.to_tensor(anchors, dtype='float32') def _calculate_anchors(self, num_features): sizes = self._broadcast_params(self.anchor_sizes, num_features) aspect_ratios = self._broadcast_params(self.aspect_ratios, num_features) cell_anchors = [ self.generate_cell_anchors(s, a) for s, a in zip(sizes, aspect_ratios) ] [ self.register_buffer( t.name, t, persistable=False) for t in cell_anchors ] return cell_anchors def _create_grid_offsets(self, size, stride, offset): grid_height, grid_width = size[0], size[1] shifts_x = paddle.arange( offset * stride, grid_width * stride, step=stride, dtype='float32') shifts_y = paddle.arange( offset * stride, grid_height * stride, step=stride, dtype='float32') shift_y, shift_x = paddle.meshgrid(shifts_y, shifts_x) shift_x = paddle.reshape(shift_x, [-1]) shift_y = paddle.reshape(shift_y, [-1]) return shift_x, shift_y def _grid_anchors(self, grid_sizes): anchors = [] for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors): shift_x, shift_y = self._create_grid_offsets(size, stride, self.offset) shifts = paddle.stack((shift_x, shift_y, shift_x, shift_y), axis=1) shifts = paddle.reshape(shifts, [-1, 1, 4]) base_anchors = paddle.reshape(base_anchors, [1, -1, 4]) anchors.append(paddle.reshape(shifts + base_anchors, [-1, 4])) return anchors def forward(self, input): grid_sizes = [feature_map.shape[-2:] for feature_map in input] anchors_over_all_feature_maps = self._grid_anchors(grid_sizes) return anchors_over_all_feature_maps @property def num_anchors(self): """ Returns: int: number of anchors at every pixel location, on that feature map. For example, if at every pixel we use anchors of 3 aspect ratios and 5 sizes, the number of anchors is 15. For FPN models, `num_anchors` on every feature map is the same. 
""" return len(self.cell_anchors[0]) @register class RetinaAnchorGenerator(AnchorGenerator): def __init__(self, octave_base_scale=4, scales_per_octave=3, aspect_ratios=[0.5, 1.0, 2.0], strides=[8.0, 16.0, 32.0, 64.0, 128.0], variance=[1.0, 1.0, 1.0, 1.0], offset=0.0): anchor_sizes = [] for s in strides: anchor_sizes.append([ s * octave_base_scale * 2**(i/scales_per_octave) \ for i in range(scales_per_octave)]) super(RetinaAnchorGenerator, self).__init__( anchor_sizes=anchor_sizes, aspect_ratios=aspect_ratios, strides=strides, variance=variance, offset=offset) @register class S2ANetAnchorGenerator(nn.Layer): """ AnchorGenerator by paddle """ def __init__(self, base_size, scales, ratios, scale_major=True, ctr=None): super(S2ANetAnchorGenerator, self).__init__() self.base_size = base_size self.scales = paddle.to_tensor(scales) self.ratios = paddle.to_tensor(ratios) self.scale_major = scale_major self.ctr = ctr self.base_anchors = self.gen_base_anchors() @property def num_base_anchors(self): return self.base_anchors.shape[0] def gen_base_anchors(self): w = self.base_size h = self.base_size if self.ctr is None: x_ctr = 0.5 * (w - 1) y_ctr = 0.5 * (h - 1) else: x_ctr, y_ctr = self.ctr h_ratios = paddle.sqrt(self.ratios) w_ratios = 1 / h_ratios if self.scale_major: ws = (w * w_ratios[:] * self.scales[:].astype(w_ratios.dtype)).reshape([-1]) hs = (h * h_ratios[:] * self.scales[:].astype(h_ratios.dtype)).reshape([-1]) else: ws = (w * self.scales[:].astype(w_ratios.dtype) * w_ratios[:]).reshape([-1]) hs = (h * self.scales[:].astype(h_ratios.dtype) * h_ratios[:]).reshape([-1]) base_anchors = paddle.stack( [ x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1), x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1) ], axis=-1) base_anchors = paddle.round(base_anchors) return base_anchors def _meshgrid(self, x, y, row_major=True): yy, xx = paddle.meshgrid(y, x) yy = yy.reshape([-1]) xx = xx.reshape([-1]) if row_major: return xx, yy else: return yy, xx def forward(self, featmap_size, stride=16): # featmap_size*stride project it to original area feat_h = featmap_size[0] feat_w = featmap_size[1] shift_x = paddle.arange(0, feat_w, 1, 'int32') * stride shift_y = paddle.arange(0, feat_h, 1, 'int32') * stride shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) shifts = paddle.stack([shift_xx, shift_yy, shift_xx, shift_yy], axis=-1) all_anchors = self.base_anchors[:, :] + shifts[:, :].astype(self.base_anchors.dtype) all_anchors = all_anchors.cast(paddle.float32).reshape( [feat_h * feat_w, 4]) all_anchors = self.rect2rbox(all_anchors) return all_anchors def valid_flags(self, featmap_size, valid_size): feat_h, feat_w = featmap_size valid_h, valid_w = valid_size assert valid_h <= feat_h and valid_w <= feat_w valid_x = paddle.zeros([feat_w], dtype='int32') valid_y = paddle.zeros([feat_h], dtype='int32') valid_x[:valid_w] = 1 valid_y[:valid_h] = 1 valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) valid = valid_xx & valid_yy valid = paddle.reshape(valid, [-1, 1]) valid = paddle.expand(valid, [-1, self.num_base_anchors]).reshape([-1]) return valid def rect2rbox(self, bboxes): """ :param bboxes: shape (L, 4) (xmin, ymin, xmax, ymax) :return: dbboxes: shape (L, 5) (x_ctr, y_ctr, w, h, angle) """ x1, y1, x2, y2 = paddle.split(bboxes, 4, axis=-1) x_ctr = (x1 + x2) / 2.0 y_ctr = (y1 + y2) / 2.0 edges1 = paddle.abs(x2 - x1) edges2 = paddle.abs(y2 - y1) rbox_w = paddle.maximum(edges1, edges2) rbox_h = paddle.minimum(edges1, edges2) # set angle inds = edges1 < edges2 inds = paddle.cast(inds, paddle.float32) rboxes_angle = inds * 
np.pi / 2.0
        rboxes = paddle.concat(
            (x_ctr, y_ctr, rbox_w, rbox_h, rboxes_angle), axis=-1)
        return rboxes


================================================ FILE: ppdet/modeling/proposal_generator/embedding_rpn_head.py ================================================

# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This code is referenced from: https://github.com/open-mmlab/mmdetection

import paddle
from paddle import nn

from ppdet.core.workspace import register

__all__ = ['EmbeddingRPNHead']


@register
class EmbeddingRPNHead(nn.Layer):
    __shared__ = ['proposal_embedding_dim']

    def __init__(self, num_proposals, proposal_embedding_dim=256):
        super(EmbeddingRPNHead, self).__init__()
        self.num_proposals = num_proposals
        self.proposal_embedding_dim = proposal_embedding_dim
        self._init_layers()
        self._init_weights()

    def _init_layers(self):
        self.init_proposal_bboxes = nn.Embedding(self.num_proposals, 4)
        self.init_proposal_features = nn.Embedding(self.num_proposals,
                                                   self.proposal_embedding_dim)

    def _init_weights(self):
        init_bboxes = paddle.empty_like(self.init_proposal_bboxes.weight)
        init_bboxes[:, :2] = 0.5
        init_bboxes[:, 2:] = 1.0
        self.init_proposal_bboxes.weight.set_value(init_bboxes)

    @staticmethod
    def bbox_cxcywh_to_xyxy(x):
        cxcy, wh = paddle.split(x, 2, axis=-1)
        return paddle.concat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1)

    def forward(self, img_whwh):
        proposal_bboxes = self.init_proposal_bboxes.weight.clone()
        proposal_bboxes = self.bbox_cxcywh_to_xyxy(proposal_bboxes)
        proposal_bboxes = proposal_bboxes.unsqueeze(0) * img_whwh.unsqueeze(1)

        proposal_features = self.init_proposal_features.weight.clone()
        proposal_features = proposal_features.unsqueeze(0).tile(
            [img_whwh.shape[0], 1, 1])
        return proposal_bboxes, proposal_features


================================================ FILE: ppdet/modeling/proposal_generator/proposal_generator.py ================================================

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle

from ppdet.core.workspace import register, serializable
from .. import ops


@register
@serializable
class ProposalGenerator(object):
    """
    Proposal generation module

    For more details, please refer to the document of generate_proposals
    in ppdet/modeling/ops.py

    Args:
        pre_nms_top_n (int): Number of total bboxes to be kept per
            image before NMS. default 12000
        post_nms_top_n (int): Number of total bboxes to be kept per
            image after NMS. default 2000
        nms_thresh (float): Threshold in NMS. default 0.5
        min_size (float): Remove predicted boxes with either height or
            width < min_size. default 0.1
        eta (float): Apply in adaptive NMS: if `adaptive_threshold > 0.5`,
            `adaptive_threshold = adaptive_threshold * eta` in each
            iteration. default 1.
        topk_after_collect (bool): whether to adopt topk after batch
            collection. If topk_after_collect is true, box filter will not be
            used after NMS at each image in proposal generation. default false
    """

    def __init__(self,
                 pre_nms_top_n=12000,
                 post_nms_top_n=2000,
                 nms_thresh=.5,
                 min_size=.1,
                 eta=1.,
                 topk_after_collect=False):
        super(ProposalGenerator, self).__init__()
        self.pre_nms_top_n = pre_nms_top_n
        self.post_nms_top_n = post_nms_top_n
        self.nms_thresh = nms_thresh
        self.min_size = min_size
        self.eta = eta
        self.topk_after_collect = topk_after_collect

    def __call__(self, scores, bbox_deltas, anchors, im_shape):
        top_n = self.pre_nms_top_n if self.topk_after_collect else self.post_nms_top_n
        variances = paddle.ones_like(anchors)
        if hasattr(paddle.vision.ops, "generate_proposals"):
            generate_proposals = getattr(paddle.vision.ops,
                                         "generate_proposals")
        else:
            generate_proposals = ops.generate_proposals
        rpn_rois, rpn_rois_prob, rpn_rois_num = generate_proposals(
            scores,
            bbox_deltas,
            im_shape,
            anchors,
            variances,
            pre_nms_top_n=self.pre_nms_top_n,
            post_nms_top_n=top_n,
            nms_thresh=self.nms_thresh,
            min_size=self.min_size,
            eta=self.eta,
            return_rois_num=True)

        return rpn_rois, rpn_rois_prob, rpn_rois_num, self.post_nms_top_n


================================================ FILE: ppdet/modeling/proposal_generator/rpn_head.py ================================================

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
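# A minimal usage sketch (not part of the original sources) for the
# ProposalGenerator defined above, assuming single-level RPN outputs with
# illustrative shapes: scores [N, A, H, W], bbox_deltas [N, 4*A, H, W],
# anchors [H*W*A, 4], im_shape [N, 2]:
#
#     gen = ProposalGenerator(pre_nms_top_n=12000, post_nms_top_n=2000)
#     rois, roi_probs, roi_num, post_nms_top_n = gen(
#         scores, bbox_deltas, anchors, im_shape)
#
# Internally it dispatches to paddle.vision.ops.generate_proposals when that
# op is available and falls back to ppdet's ops.generate_proposals otherwise.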
import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal from ppdet.core.workspace import register from .anchor_generator import AnchorGenerator from .target_layer import RPNTargetAssign from .proposal_generator import ProposalGenerator from ..cls_utils import _get_class_default_kwargs class RPNFeat(nn.Layer): """ Feature extraction in RPN head Args: in_channel (int): Input channel out_channel (int): Output channel """ def __init__(self, in_channel=1024, out_channel=1024): super(RPNFeat, self).__init__() # rpn feat is shared with each level self.rpn_conv = nn.Conv2D( in_channels=in_channel, out_channels=out_channel, kernel_size=3, padding=1, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0., std=0.01))) self.rpn_conv.skip_quant = True def forward(self, feats): rpn_feats = [] for feat in feats: rpn_feats.append(F.relu(self.rpn_conv(feat))) return rpn_feats @register class RPNHead(nn.Layer): """ Region Proposal Network Args: anchor_generator (dict): configure of anchor generation rpn_target_assign (dict): configure of rpn targets assignment train_proposal (dict): configure of proposals generation at the stage of training test_proposal (dict): configure of proposals generation at the stage of prediction in_channel (int): channel of input feature maps which can be derived by from_config """ __shared__ = ['export_onnx'] __inject__ = ['loss_rpn_bbox'] def __init__(self, anchor_generator=_get_class_default_kwargs(AnchorGenerator), rpn_target_assign=_get_class_default_kwargs(RPNTargetAssign), train_proposal=_get_class_default_kwargs(ProposalGenerator, 12000, 2000), test_proposal=_get_class_default_kwargs(ProposalGenerator), in_channel=1024, export_onnx=False, loss_rpn_bbox=None): super(RPNHead, self).__init__() self.anchor_generator = anchor_generator self.rpn_target_assign = rpn_target_assign self.train_proposal = train_proposal self.test_proposal = test_proposal self.export_onnx = export_onnx if isinstance(anchor_generator, dict): self.anchor_generator = AnchorGenerator(**anchor_generator) if isinstance(rpn_target_assign, dict): self.rpn_target_assign = RPNTargetAssign(**rpn_target_assign) if isinstance(train_proposal, dict): self.train_proposal = ProposalGenerator(**train_proposal) if isinstance(test_proposal, dict): self.test_proposal = ProposalGenerator(**test_proposal) self.loss_rpn_bbox = loss_rpn_bbox num_anchors = self.anchor_generator.num_anchors self.rpn_feat = RPNFeat(in_channel, in_channel) # rpn head is shared with each level # rpn roi classification scores self.rpn_rois_score = nn.Conv2D( in_channels=in_channel, out_channels=num_anchors, kernel_size=1, padding=0, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0., std=0.01))) self.rpn_rois_score.skip_quant = True # rpn roi bbox regression deltas self.rpn_rois_delta = nn.Conv2D( in_channels=in_channel, out_channels=4 * num_anchors, kernel_size=1, padding=0, weight_attr=paddle.ParamAttr(initializer=Normal( mean=0., std=0.01))) self.rpn_rois_delta.skip_quant = True @classmethod def from_config(cls, cfg, input_shape): # FPN share same rpn head if isinstance(input_shape, (list, tuple)): input_shape = input_shape[0] return {'in_channel': input_shape.channels} def forward(self, feats, inputs): rpn_feats = self.rpn_feat(feats) scores = [] deltas = [] for rpn_feat in rpn_feats: rrs = self.rpn_rois_score(rpn_feat) rrd = self.rpn_rois_delta(rpn_feat) scores.append(rrs) deltas.append(rrd) anchors = self.anchor_generator(rpn_feats) rois, rois_num = self._gen_proposal(scores, 
deltas, anchors, inputs) if self.training: loss = self.get_loss(scores, deltas, anchors, inputs) return rois, rois_num, loss else: return rois, rois_num, None def _gen_proposal(self, scores, bbox_deltas, anchors, inputs): """ scores (list[Tensor]): Multi-level scores prediction bbox_deltas (list[Tensor]): Multi-level deltas prediction anchors (list[Tensor]): Multi-level anchors inputs (dict): ground truth info """ prop_gen = self.train_proposal if self.training else self.test_proposal im_shape = inputs['im_shape'] # Collect multi-level proposals for each batch # Get 'topk' of them as final output if self.export_onnx: # bs = 1 when exporting onnx onnx_rpn_rois_list = [] onnx_rpn_prob_list = [] onnx_rpn_rois_num_list = [] for rpn_score, rpn_delta, anchor in zip(scores, bbox_deltas, anchors): onnx_rpn_rois, onnx_rpn_rois_prob, onnx_rpn_rois_num, onnx_post_nms_top_n = prop_gen( scores=rpn_score[0:1], bbox_deltas=rpn_delta[0:1], anchors=anchor, im_shape=im_shape[0:1]) onnx_rpn_rois_list.append(onnx_rpn_rois) onnx_rpn_prob_list.append(onnx_rpn_rois_prob) onnx_rpn_rois_num_list.append(onnx_rpn_rois_num) onnx_rpn_rois = paddle.concat(onnx_rpn_rois_list) onnx_rpn_prob = paddle.concat(onnx_rpn_prob_list).flatten() onnx_top_n = paddle.to_tensor(onnx_post_nms_top_n).cast('int32') onnx_num_rois = paddle.shape(onnx_rpn_prob)[0].cast('int32') k = paddle.minimum(onnx_top_n, onnx_num_rois) onnx_topk_prob, onnx_topk_inds = paddle.topk(onnx_rpn_prob, k) onnx_topk_rois = paddle.gather(onnx_rpn_rois, onnx_topk_inds) # TODO(wangguanzhong): Now bs_rois_collect in export_onnx is moved outside conditional branch # due to problems in dy2static of paddle. Will fix it when updating paddle framework. # bs_rois_collect = [onnx_topk_rois] # bs_rois_num_collect = paddle.shape(onnx_topk_rois)[0] else: bs_rois_collect = [] bs_rois_num_collect = [] batch_size = im_shape.shape[0] # Generate proposals for each level and each batch. # Discard batch-computing to avoid sorting bbox cross different batches. 
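            # For each image: generate proposals per feature level, then (when
            # there are multiple levels) concatenate them across levels and
            # keep only the post_nms_top_n highest-scoring proposals.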
for i in range(batch_size): rpn_rois_list = [] rpn_prob_list = [] rpn_rois_num_list = [] for rpn_score, rpn_delta, anchor in zip(scores, bbox_deltas, anchors): rpn_rois, rpn_rois_prob, rpn_rois_num, post_nms_top_n = prop_gen( scores=rpn_score[i:i + 1], bbox_deltas=rpn_delta[i:i + 1], anchors=anchor, im_shape=im_shape[i:i + 1]) rpn_rois_list.append(rpn_rois) rpn_prob_list.append(rpn_rois_prob) rpn_rois_num_list.append(rpn_rois_num) if len(scores) > 1: rpn_rois = paddle.concat(rpn_rois_list) rpn_prob = paddle.concat(rpn_prob_list).flatten() num_rois = rpn_prob.shape[0] num_rois = paddle.shape(rpn_prob)[0].cast('int32') if num_rois > post_nms_top_n: topk_prob, topk_inds = paddle.topk(rpn_prob, post_nms_top_n) topk_rois = paddle.gather(rpn_rois, topk_inds) else: topk_rois = rpn_rois topk_prob = rpn_prob topk_inds = paddle.zeros(shape=[post_nms_top_n], dtype="int64") else: topk_rois = rpn_rois_list[0] topk_prob = rpn_prob_list[0].flatten() bs_rois_collect.append(topk_rois) bs_rois_num_collect.append(paddle.shape(topk_rois)[0:1]) # TODO(PIR): remove this after pir bug fixed rpn_rois_list = None rpn_prob_list = None rpn_rois_num_list = None bs_rois_num_collect = paddle.concat(bs_rois_num_collect) if self.export_onnx: output_rois = [onnx_topk_rois] output_rois_num = paddle.shape(onnx_topk_rois)[0] else: output_rois = bs_rois_collect output_rois_num = bs_rois_num_collect return output_rois, output_rois_num def get_loss(self, pred_scores, pred_deltas, anchors, inputs): """ pred_scores (list[Tensor]): Multi-level scores prediction pred_deltas (list[Tensor]): Multi-level deltas prediction anchors (list[Tensor]): Multi-level anchors inputs (dict): ground truth info, including im, gt_bbox, gt_score """ anchors = [paddle.reshape(a, shape=(-1, 4)) for a in anchors] anchors = paddle.concat(anchors) scores = [ paddle.reshape( paddle.transpose( v, perm=[0, 2, 3, 1]), shape=(v.shape[0], -1, 1)) for v in pred_scores ] scores = paddle.concat(scores, axis=1) deltas = [ paddle.reshape( paddle.transpose( v, perm=[0, 2, 3, 1]), shape=(v.shape[0], -1, 4)) for v in pred_deltas ] deltas = paddle.concat(deltas, axis=1) score_tgt, bbox_tgt, loc_tgt, norm = self.rpn_target_assign(inputs, anchors) scores = paddle.reshape(x=scores, shape=(-1, )) deltas = paddle.reshape(x=deltas, shape=(-1, 4)) score_tgt = paddle.concat(score_tgt) score_tgt.stop_gradient = True pos_mask = score_tgt == 1 pos_ind = paddle.nonzero(pos_mask) valid_mask = score_tgt >= 0 valid_ind = paddle.nonzero(valid_mask) # cls loss if valid_ind.shape[0] == 0: loss_rpn_cls = paddle.zeros([1], dtype='float32') else: score_pred = paddle.gather(scores, valid_ind) score_label = paddle.gather(score_tgt, valid_ind).cast('float32') score_label.stop_gradient = True loss_rpn_cls = F.binary_cross_entropy_with_logits( logit=score_pred, label=score_label, reduction="sum") # reg loss if pos_ind.shape[0] == 0: loss_rpn_reg = paddle.zeros([1], dtype='float32') else: loc_pred = paddle.gather(deltas, pos_ind) loc_tgt = paddle.concat(loc_tgt) loc_tgt = paddle.gather(loc_tgt, pos_ind) loc_tgt.stop_gradient = True if self.loss_rpn_bbox is None: loss_rpn_reg = paddle.abs(loc_pred - loc_tgt).sum() else: loss_rpn_reg = self.loss_rpn_bbox(loc_pred, loc_tgt).sum() return { 'loss_rpn_cls': loss_rpn_cls / norm, 'loss_rpn_reg': loss_rpn_reg / norm } ================================================ FILE: ppdet/modeling/proposal_generator/target.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import paddle from ..bbox_utils import bbox2delta, bbox_overlaps def rpn_anchor_target(anchors, gt_boxes, rpn_batch_size_per_im, rpn_positive_overlap, rpn_negative_overlap, rpn_fg_fraction, use_random=True, batch_size=1, ignore_thresh=-1, is_crowd=None, weights=[1., 1., 1., 1.], assign_on_cpu=False): tgt_labels = [] tgt_bboxes = [] tgt_deltas = [] for i in range(batch_size): gt_bbox = gt_boxes[i] is_crowd_i = is_crowd[i] if is_crowd else None # Step1: match anchor and gt_bbox matches, match_labels = label_box( anchors, gt_bbox, rpn_positive_overlap, rpn_negative_overlap, True, ignore_thresh, is_crowd_i, assign_on_cpu) # Step2: sample anchor fg_inds, bg_inds = subsample_labels(match_labels, rpn_batch_size_per_im, rpn_fg_fraction, 0, use_random) # Fill with the ignore label (-1), then set positive and negative labels labels = paddle.full(match_labels.shape, -1, dtype='int32') if bg_inds.shape[0] > 0: labels = paddle.scatter(labels, bg_inds, paddle.zeros_like(bg_inds)) if fg_inds.shape[0] > 0: labels = paddle.scatter(labels, fg_inds, paddle.ones_like(fg_inds)) # Step3: make output if gt_bbox.shape[0] == 0: matched_gt_boxes = paddle.zeros([matches.shape[0], 4]) tgt_delta = paddle.zeros([matches.shape[0], 4]) else: matched_gt_boxes = paddle.gather(gt_bbox, matches) tgt_delta = bbox2delta(anchors, matched_gt_boxes, weights) matched_gt_boxes.stop_gradient = True tgt_delta.stop_gradient = True labels.stop_gradient = True tgt_labels.append(labels) tgt_bboxes.append(matched_gt_boxes) tgt_deltas.append(tgt_delta) return tgt_labels, tgt_bboxes, tgt_deltas def label_box(anchors, gt_boxes, positive_overlap, negative_overlap, allow_low_quality, ignore_thresh, is_crowd=None, assign_on_cpu=False): if assign_on_cpu: device = paddle.device.get_device() paddle.set_device("cpu") iou = bbox_overlaps(gt_boxes, anchors) paddle.set_device(device) else: iou = bbox_overlaps(gt_boxes, anchors) n_gt = gt_boxes.shape[0] if n_gt == 0 or is_crowd is None: n_gt_crowd = 0 else: n_gt_crowd = paddle.nonzero(is_crowd).shape[0] if iou.shape[0] == 0 or n_gt_crowd == n_gt: # No truth, assign everything to background default_matches = paddle.full((iou.shape[1], ), 0, dtype='int64') default_match_labels = paddle.full((iou.shape[1], ), 0, dtype='int32') return default_matches, default_match_labels # if ignore_thresh > 0, remove anchor if it is closed to # one of the crowded ground-truth if n_gt_crowd > 0: N_a = anchors.shape[0] ones = paddle.ones([N_a]) mask = is_crowd * ones if ignore_thresh > 0: crowd_iou = iou * mask valid = (paddle.sum((crowd_iou > ignore_thresh).cast('int32'), axis=0) > 0).cast('float32') iou = iou * (1 - valid) - valid # ignore the iou between anchor and crowded ground-truth iou = iou * (1 - mask) - mask matched_vals, matches = paddle.topk(iou, k=1, axis=0) match_labels = paddle.full(matches.shape, -1, dtype='int32') # set ignored anchor with iou = -1 neg_cond = paddle.logical_and(matched_vals > -1, matched_vals < negative_overlap) match_labels = 
paddle.where(neg_cond, paddle.zeros_like(match_labels), match_labels) match_labels = paddle.where(matched_vals >= positive_overlap, paddle.ones_like(match_labels), match_labels) if allow_low_quality: highest_quality_foreach_gt = iou.max(axis=1, keepdim=True) pred_inds_with_highest_quality = paddle.logical_and( iou > 0, iou == highest_quality_foreach_gt).cast('int32').sum( 0, keepdim=True) match_labels = paddle.where(pred_inds_with_highest_quality > 0, paddle.ones_like(match_labels), match_labels) matches = matches.flatten() match_labels = match_labels.flatten() return matches, match_labels def subsample_labels(labels, num_samples, fg_fraction, bg_label=0, use_random=True): positive = paddle.nonzero( paddle.logical_and(labels != -1, labels != bg_label)) negative = paddle.nonzero(labels == bg_label) fg_num = int(num_samples * fg_fraction) fg_num = min(positive.numel(), fg_num) bg_num = num_samples - fg_num bg_num = min(negative.numel(), bg_num) if fg_num == 0 and bg_num == 0: fg_inds = paddle.zeros([0], dtype='int32') bg_inds = paddle.zeros([0], dtype='int32') return fg_inds, bg_inds # randomly select positive and negative examples negative = negative.cast('int32').flatten() bg_perm = paddle.randperm(negative.numel(), dtype='int32') bg_perm = paddle.slice(bg_perm, axes=[0], starts=[0], ends=[bg_num]) if use_random: bg_inds = paddle.gather(negative, bg_perm) else: bg_inds = paddle.slice(negative, axes=[0], starts=[0], ends=[bg_num]) if fg_num == 0: fg_inds = paddle.zeros([0], dtype='int32') return fg_inds, bg_inds positive = positive.cast('int32').flatten() fg_perm = paddle.randperm(positive.numel(), dtype='int32') fg_perm = paddle.slice(fg_perm, axes=[0], starts=[0], ends=[fg_num]) if use_random: fg_inds = paddle.gather(positive, fg_perm) else: fg_inds = paddle.slice(positive, axes=[0], starts=[0], ends=[fg_num]) return fg_inds, bg_inds def generate_proposal_target(rpn_rois, gt_classes, gt_boxes, batch_size_per_im, fg_fraction, fg_thresh, bg_thresh, num_classes, ignore_thresh=-1., is_crowd=None, use_random=True, is_cascade=False, cascade_iou=0.5, assign_on_cpu=False, add_gt_as_proposals=True): rois_with_gt = [] tgt_labels = [] tgt_bboxes = [] tgt_gt_inds = [] new_rois_num = [] # In cascade rcnn, the threshold for foreground and background # is used from cascade_iou fg_thresh = cascade_iou if is_cascade else fg_thresh bg_thresh = cascade_iou if is_cascade else bg_thresh for i, rpn_roi in enumerate(rpn_rois): gt_bbox = gt_boxes[i] is_crowd_i = is_crowd[i] if is_crowd else None gt_class = paddle.squeeze(gt_classes[i], axis=-1) # Concat RoIs and gt boxes except cascade rcnn or none gt if add_gt_as_proposals and gt_bbox.shape[0] > 0: bbox = paddle.concat([rpn_roi, gt_bbox]) else: bbox = rpn_roi # Step1: label bbox matches, match_labels = label_box(bbox, gt_bbox, fg_thresh, bg_thresh, False, ignore_thresh, is_crowd_i, assign_on_cpu) # Step2: sample bbox sampled_inds, sampled_gt_classes = sample_bbox( matches, match_labels, gt_class, batch_size_per_im, fg_fraction, num_classes, use_random, is_cascade) # Step3: make output rois_per_image = bbox if is_cascade else paddle.gather(bbox, sampled_inds) sampled_gt_ind = matches if is_cascade else paddle.gather(matches, sampled_inds) if gt_bbox.shape[0] > 0: sampled_bbox = paddle.gather(gt_bbox, sampled_gt_ind) else: num = rois_per_image.shape[0] sampled_bbox = paddle.zeros([num, 4], dtype='float32') rois_per_image.stop_gradient = True sampled_gt_ind.stop_gradient = True sampled_bbox.stop_gradient = True tgt_labels.append(sampled_gt_classes) 
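        # Collect the remaining per-image targets below; new_rois_num records
        # how many RoIs were sampled for each image, so the flattened outputs
        # can later be split back per image.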
tgt_bboxes.append(sampled_bbox) rois_with_gt.append(rois_per_image) tgt_gt_inds.append(sampled_gt_ind) new_rois_num.append(paddle.shape(sampled_inds)[0:1]) new_rois_num = paddle.concat(new_rois_num) return rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num def sample_bbox(matches, match_labels, gt_classes, batch_size_per_im, fg_fraction, num_classes, use_random=True, is_cascade=False): n_gt = gt_classes.shape[0] if n_gt == 0: # No truth, assign everything to background gt_classes = paddle.ones(matches.shape, dtype='int32') * num_classes #return matches, match_labels + num_classes else: gt_classes = paddle.gather(gt_classes, matches) gt_classes = paddle.where(match_labels == 0, paddle.ones_like(gt_classes) * num_classes, gt_classes) gt_classes = paddle.where(match_labels == -1, paddle.ones_like(gt_classes) * -1, gt_classes) if is_cascade: index = paddle.arange(matches.shape[0]) return index, gt_classes rois_per_image = int(batch_size_per_im) fg_inds, bg_inds = subsample_labels(gt_classes, rois_per_image, fg_fraction, num_classes, use_random) if fg_inds.shape[0] == 0 and bg_inds.shape[0] == 0: # fake output labeled with -1 when all boxes are neither # foreground nor background sampled_inds = paddle.zeros([1], dtype='int32') else: sampled_inds = paddle.concat([fg_inds, bg_inds]) sampled_gt_classes = paddle.gather(gt_classes, sampled_inds) return sampled_inds, sampled_gt_classes def polygons_to_mask(polygons, height, width): """ Convert the polygons to mask format Args: polygons (list[ndarray]): each array has shape (Nx2,) height (int): mask height width (int): mask width Returns: ndarray: a bool mask of shape (height, width) """ import pycocotools.mask as mask_util assert len(polygons) > 0, "COCOAPI does not support empty polygons" rles = mask_util.frPyObjects(polygons, height, width) rle = mask_util.merge(rles) return mask_util.decode(rle).astype(np.bool_) def rasterize_polygons_within_box(poly, box, resolution): w, h = box[2] - box[0], box[3] - box[1] polygons = [np.asarray(p, dtype=np.float64) for p in poly] for p in polygons: p[0::2] = p[0::2] - box[0] p[1::2] = p[1::2] - box[1] ratio_h = resolution / max(h, 0.1) ratio_w = resolution / max(w, 0.1) if ratio_h == ratio_w: for p in polygons: p *= ratio_h else: for p in polygons: p[0::2] *= ratio_w p[1::2] *= ratio_h # 3. 
Rasterize the polygons with coco api mask = polygons_to_mask(polygons, resolution, resolution) mask = paddle.to_tensor(mask, dtype='int32') return mask def generate_mask_target(gt_segms, rois, labels_int32, sampled_gt_inds, num_classes, resolution): mask_rois = [] mask_rois_num = [] tgt_masks = [] tgt_classes = [] mask_index = [] tgt_weights = [] for k in range(len(rois)): labels_per_im = labels_int32[k] # select rois labeled with foreground fg_inds = paddle.nonzero( paddle.logical_and(labels_per_im != -1, labels_per_im != num_classes)) has_fg = True # generate fake roi if foreground is empty if fg_inds.numel() == 0: has_fg = False fg_inds = paddle.ones([1, 1], dtype='int64') inds_per_im = sampled_gt_inds[k] inds_per_im = paddle.gather(inds_per_im, fg_inds) rois_per_im = rois[k] fg_rois = paddle.gather(rois_per_im, fg_inds) # Copy the foreground roi to cpu # to generate mask target with ground-truth boxes = fg_rois.numpy() gt_segms_per_im = gt_segms[k] new_segm = [] inds_per_im = inds_per_im.numpy() if len(gt_segms_per_im) > 0: for i in inds_per_im: new_segm.append(gt_segms_per_im[i]) fg_inds_new = fg_inds.reshape([-1]).numpy() results = [] if len(gt_segms_per_im) > 0: for j in range(fg_inds_new.shape[0]): results.append( rasterize_polygons_within_box(new_segm[j], boxes[j], resolution)) else: results.append(paddle.ones([resolution, resolution], dtype='int32')) fg_classes = paddle.gather(labels_per_im, fg_inds) weight = paddle.ones([fg_rois.shape[0]], dtype='float32') if not has_fg: # now all sampled classes are background # which will cause error in loss calculation, # make fake classes with weight of 0. fg_classes = paddle.zeros([1], dtype='int32') weight = weight - 1 tgt_mask = paddle.stack(results) tgt_mask.stop_gradient = True fg_rois.stop_gradient = True mask_index.append(fg_inds) mask_rois.append(fg_rois) mask_rois_num.append(paddle.shape(fg_rois)[0:1]) tgt_classes.append(fg_classes) tgt_masks.append(tgt_mask) tgt_weights.append(weight) mask_index = paddle.concat(mask_index) mask_rois_num = paddle.concat(mask_rois_num) tgt_classes = paddle.concat(tgt_classes, axis=0) tgt_masks = paddle.concat(tgt_masks, axis=0) tgt_weights = paddle.concat(tgt_weights, axis=0) return mask_rois, mask_rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights def libra_sample_pos(max_overlaps, max_classes, pos_inds, num_expected): if len(pos_inds) <= num_expected: return pos_inds else: unique_gt_inds = np.unique(max_classes[pos_inds]) num_gts = len(unique_gt_inds) num_per_gt = int(round(num_expected / float(num_gts)) + 1) sampled_inds = [] for i in unique_gt_inds: inds = np.nonzero(max_classes == i)[0] before_len = len(inds) inds = list(set(inds) & set(pos_inds)) after_len = len(inds) if len(inds) > num_per_gt: inds = np.random.choice(inds, size=num_per_gt, replace=False) sampled_inds.extend(list(inds)) # combine as a new sampler if len(sampled_inds) < num_expected: num_extra = num_expected - len(sampled_inds) extra_inds = np.array(list(set(pos_inds) - set(sampled_inds))) assert len(sampled_inds) + len(extra_inds) == len(pos_inds), \ "sum of sampled_inds({}) and extra_inds({}) length must be equal with pos_inds({})!".format( len(sampled_inds), len(extra_inds), len(pos_inds)) if len(extra_inds) > num_extra: extra_inds = np.random.choice( extra_inds, size=num_extra, replace=False) sampled_inds.extend(extra_inds.tolist()) elif len(sampled_inds) > num_expected: sampled_inds = np.random.choice( sampled_inds, size=num_expected, replace=False) return paddle.to_tensor(sampled_inds) def 
libra_sample_via_interval(max_overlaps, full_set, num_expected, floor_thr, num_bins, bg_thresh): max_iou = max_overlaps.max() iou_interval = (max_iou - floor_thr) / num_bins per_num_expected = int(num_expected / num_bins) sampled_inds = [] for i in range(num_bins): start_iou = floor_thr + i * iou_interval end_iou = floor_thr + (i + 1) * iou_interval tmp_set = set( np.where( np.logical_and(max_overlaps >= start_iou, max_overlaps < end_iou))[0]) tmp_inds = list(tmp_set & full_set) if len(tmp_inds) > per_num_expected: tmp_sampled_set = np.random.choice( tmp_inds, size=per_num_expected, replace=False) else: tmp_sampled_set = np.array(tmp_inds, dtype=np.int32) sampled_inds.append(tmp_sampled_set) sampled_inds = np.concatenate(sampled_inds) if len(sampled_inds) < num_expected: num_extra = num_expected - len(sampled_inds) extra_inds = np.array(list(full_set - set(sampled_inds))) assert len(sampled_inds) + len(extra_inds) == len(full_set), \ "sum of sampled_inds({}) and extra_inds({}) length must be equal with full_set({})!".format( len(sampled_inds), len(extra_inds), len(full_set)) if len(extra_inds) > num_extra: extra_inds = np.random.choice(extra_inds, num_extra, replace=False) sampled_inds = np.concatenate([sampled_inds, extra_inds]) return sampled_inds def libra_sample_neg(max_overlaps, max_classes, neg_inds, num_expected, floor_thr=-1, floor_fraction=0, num_bins=3, bg_thresh=0.5): if len(neg_inds) <= num_expected: return neg_inds else: # balance sampling for negative samples neg_set = set(neg_inds.tolist()) if floor_thr > 0: floor_set = set( np.where( np.logical_and(max_overlaps >= 0, max_overlaps < floor_thr)) [0]) iou_sampling_set = set(np.where(max_overlaps >= floor_thr)[0]) elif floor_thr == 0: floor_set = set(np.where(max_overlaps == 0)[0]) iou_sampling_set = set(np.where(max_overlaps > floor_thr)[0]) else: floor_set = set() iou_sampling_set = set(np.where(max_overlaps > floor_thr)[0]) floor_thr = 0 floor_neg_inds = list(floor_set & neg_set) iou_sampling_neg_inds = list(iou_sampling_set & neg_set) num_expected_iou_sampling = int(num_expected * (1 - floor_fraction)) if len(iou_sampling_neg_inds) > num_expected_iou_sampling: if num_bins >= 2: iou_sampled_inds = libra_sample_via_interval( max_overlaps, set(iou_sampling_neg_inds), num_expected_iou_sampling, floor_thr, num_bins, bg_thresh) else: iou_sampled_inds = np.random.choice( iou_sampling_neg_inds, size=num_expected_iou_sampling, replace=False) else: iou_sampled_inds = np.array(iou_sampling_neg_inds, dtype=np.int32) num_expected_floor = num_expected - len(iou_sampled_inds) if len(floor_neg_inds) > num_expected_floor: sampled_floor_inds = np.random.choice( floor_neg_inds, size=num_expected_floor, replace=False) else: sampled_floor_inds = np.array(floor_neg_inds, dtype=np.int32) sampled_inds = np.concatenate((sampled_floor_inds, iou_sampled_inds)) if len(sampled_inds) < num_expected: num_extra = num_expected - len(sampled_inds) extra_inds = np.array(list(neg_set - set(sampled_inds))) if len(extra_inds) > num_extra: extra_inds = np.random.choice( extra_inds, size=num_extra, replace=False) sampled_inds = np.concatenate((sampled_inds, extra_inds)) return paddle.to_tensor(sampled_inds) def libra_label_box(anchors, gt_boxes, gt_classes, positive_overlap, negative_overlap, num_classes): # TODO: use paddle API to speed up gt_classes = gt_classes.numpy() gt_overlaps = np.zeros((anchors.shape[0], num_classes)) matches = np.zeros((anchors.shape[0]), dtype=np.int32) if len(gt_boxes) > 0: proposal_to_gt_overlaps = bbox_overlaps(anchors, 
gt_boxes).numpy() overlaps_argmax = proposal_to_gt_overlaps.argmax(axis=1) overlaps_max = proposal_to_gt_overlaps.max(axis=1) # Boxes which with non-zero overlap with gt boxes overlapped_boxes_ind = np.where(overlaps_max > 0)[0] overlapped_boxes_gt_classes = gt_classes[overlaps_argmax[ overlapped_boxes_ind]] for idx in range(len(overlapped_boxes_ind)): gt_overlaps[overlapped_boxes_ind[idx], overlapped_boxes_gt_classes[ idx]] = overlaps_max[overlapped_boxes_ind[idx]] matches[overlapped_boxes_ind[idx]] = overlaps_argmax[ overlapped_boxes_ind[idx]] gt_overlaps = paddle.to_tensor(gt_overlaps) matches = paddle.to_tensor(matches) matched_vals = paddle.max(gt_overlaps, axis=1) match_labels = paddle.full(matches.shape, -1, dtype='int32') match_labels = paddle.where(matched_vals < negative_overlap, paddle.zeros_like(match_labels), match_labels) match_labels = paddle.where(matched_vals >= positive_overlap, paddle.ones_like(match_labels), match_labels) return matches, match_labels, matched_vals def libra_sample_bbox(matches, match_labels, matched_vals, gt_classes, batch_size_per_im, num_classes, fg_fraction, fg_thresh, bg_thresh, num_bins, use_random=True, is_cascade_rcnn=False): rois_per_image = int(batch_size_per_im) fg_rois_per_im = int(np.round(fg_fraction * rois_per_image)) bg_rois_per_im = rois_per_image - fg_rois_per_im if is_cascade_rcnn: fg_inds = paddle.nonzero(matched_vals >= fg_thresh) bg_inds = paddle.nonzero(matched_vals < bg_thresh) else: matched_vals_np = matched_vals.numpy() match_labels_np = match_labels.numpy() # sample fg fg_inds = paddle.nonzero(matched_vals >= fg_thresh).flatten() fg_nums = int(np.minimum(fg_rois_per_im, fg_inds.shape[0])) if (fg_inds.shape[0] > fg_nums) and use_random: fg_inds = libra_sample_pos(matched_vals_np, match_labels_np, fg_inds.numpy(), fg_rois_per_im) fg_inds = fg_inds[:fg_nums] # sample bg bg_inds = paddle.nonzero(matched_vals < bg_thresh).flatten() bg_nums = int(np.minimum(rois_per_image - fg_nums, bg_inds.shape[0])) if (bg_inds.shape[0] > bg_nums) and use_random: bg_inds = libra_sample_neg( matched_vals_np, match_labels_np, bg_inds.numpy(), bg_rois_per_im, num_bins=num_bins, bg_thresh=bg_thresh) bg_inds = bg_inds[:bg_nums] sampled_inds = paddle.concat([fg_inds, bg_inds]) gt_classes = paddle.gather(gt_classes, matches) gt_classes = paddle.where(match_labels == 0, paddle.ones_like(gt_classes) * num_classes, gt_classes) gt_classes = paddle.where(match_labels == -1, paddle.ones_like(gt_classes) * -1, gt_classes) sampled_gt_classes = paddle.gather(gt_classes, sampled_inds) return sampled_inds, sampled_gt_classes def libra_generate_proposal_target(rpn_rois, gt_classes, gt_boxes, batch_size_per_im, fg_fraction, fg_thresh, bg_thresh, num_classes, use_random=True, is_cascade_rcnn=False, max_overlaps=None, num_bins=3): rois_with_gt = [] tgt_labels = [] tgt_bboxes = [] sampled_max_overlaps = [] tgt_gt_inds = [] new_rois_num = [] for i, rpn_roi in enumerate(rpn_rois): max_overlap = max_overlaps[i] if is_cascade_rcnn else None gt_bbox = gt_boxes[i] gt_class = paddle.squeeze(gt_classes[i], axis=-1) if is_cascade_rcnn: rpn_roi = filter_roi(rpn_roi, max_overlap) bbox = paddle.concat([rpn_roi, gt_bbox]) # Step1: label bbox matches, match_labels, matched_vals = libra_label_box( bbox, gt_bbox, gt_class, fg_thresh, bg_thresh, num_classes) # Step2: sample bbox sampled_inds, sampled_gt_classes = libra_sample_bbox( matches, match_labels, matched_vals, gt_class, batch_size_per_im, num_classes, fg_fraction, fg_thresh, bg_thresh, num_bins, use_random, is_cascade_rcnn) # 
Step3: make output
        rois_per_image = paddle.gather(bbox, sampled_inds)
        sampled_gt_ind = paddle.gather(matches, sampled_inds)
        sampled_bbox = paddle.gather(gt_bbox, sampled_gt_ind)
        sampled_overlap = paddle.gather(matched_vals, sampled_inds)

        rois_per_image.stop_gradient = True
        sampled_gt_ind.stop_gradient = True
        sampled_bbox.stop_gradient = True
        sampled_overlap.stop_gradient = True

        tgt_labels.append(sampled_gt_classes)
        tgt_bboxes.append(sampled_bbox)
        rois_with_gt.append(rois_per_image)
        sampled_max_overlaps.append(sampled_overlap)
        tgt_gt_inds.append(sampled_gt_ind)
        new_rois_num.append(paddle.shape(sampled_inds)[0:1])
    new_rois_num = paddle.concat(new_rois_num)
    # rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num
    return rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num


================================================ FILE: ppdet/modeling/proposal_generator/target_layer.py ================================================

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import paddle
from ppdet.core.workspace import register, serializable
from .target import rpn_anchor_target, generate_proposal_target, generate_mask_target, libra_generate_proposal_target
import numpy as np


@register
@serializable
class RPNTargetAssign(object):
    __shared__ = ['assign_on_cpu']
    """
    RPN targets assignment module

    The assignment consists of three steps:
        1. Match anchor and ground-truth box, label the anchor with
           foreground or background sample
        2. Sample anchors to keep a proper ratio between foreground
           and background
        3. Generate the targets for classification and regression branch

    Args:
        batch_size_per_im (int): Total number of RPN samples per image.
            default 256
        fg_fraction (float): Fraction of anchors that is labeled foreground,
            default 0.5
        positive_overlap (float): Minimum overlap required between an anchor
            and ground-truth box for the (anchor, gt box) pair to be
            a foreground sample. default 0.7
        negative_overlap (float): Maximum overlap allowed between an anchor
            and ground-truth box for the (anchor, gt box) pair to be
            a background sample. default 0.3
        ignore_thresh (float): Threshold for ignoring the is_crowd
            ground-truth if the value is larger than zero.
        use_random (bool): Use random sampling to choose foreground and
            background boxes, default true.
        assign_on_cpu (bool): In case the number of gt box is too large,
            compute IoU on CPU, default false.
    """

    def __init__(self,
                 batch_size_per_im=256,
                 fg_fraction=0.5,
                 positive_overlap=0.7,
                 negative_overlap=0.3,
                 ignore_thresh=-1.,
                 use_random=True,
                 assign_on_cpu=False):
        super(RPNTargetAssign, self).__init__()
        self.batch_size_per_im = batch_size_per_im
        self.fg_fraction = fg_fraction
        self.positive_overlap = positive_overlap
        self.negative_overlap = negative_overlap
        self.ignore_thresh = ignore_thresh
        self.use_random = use_random
        self.assign_on_cpu = assign_on_cpu

    def __call__(self, inputs, anchors):
        """
        inputs: ground-truth instances.
        anchor_box (Tensor): [num_anchors, 4], num_anchors are all anchors
            in all feature maps.
        """
        gt_boxes = inputs['gt_bbox']
        is_crowd = inputs.get('is_crowd', None)
        batch_size = len(gt_boxes)
        tgt_labels, tgt_bboxes, tgt_deltas = rpn_anchor_target(
            anchors,
            gt_boxes,
            self.batch_size_per_im,
            self.positive_overlap,
            self.negative_overlap,
            self.fg_fraction,
            self.use_random,
            batch_size,
            self.ignore_thresh,
            is_crowd,
            assign_on_cpu=self.assign_on_cpu)
        norm = self.batch_size_per_im * batch_size
        return tgt_labels, tgt_bboxes, tgt_deltas, norm


@register
class BBoxAssigner(object):
    __shared__ = ['num_classes', 'assign_on_cpu']
    """
    RCNN targets assignment module

    The assignment consists of three steps:
        1. Match RoIs and ground-truth box, label the RoIs with
           foreground or background sample
        2. Sample RoIs to keep a proper ratio between foreground
           and background
        3. Generate the targets for classification and regression branch

    Args:
        batch_size_per_im (int): Total number of RoIs per image.
            default 512
        fg_fraction (float): Fraction of RoIs that is labeled foreground,
            default 0.25
        fg_thresh (float): Minimum overlap required between a RoI and
            ground-truth box for the (roi, gt box) pair to be a foreground
            sample. default 0.5
        bg_thresh (float): Maximum overlap allowed between a RoI and
            ground-truth box for the (roi, gt box) pair to be a background
            sample. default 0.5
        ignore_thresh (float): Threshold for ignoring the is_crowd
            ground-truth if the value is larger than zero.
        use_random (bool): Use random sampling to choose foreground and
            background boxes, default true
        cascade_iou (list[float]): The list of overlap to select foreground
            and background of each stage, which is only used in Cascade RCNN.
        num_classes (int): The number of classes.
        assign_on_cpu (bool): In case the number of gt box is too large,
            compute IoU on CPU, default false.
    """

    def __init__(self,
                 batch_size_per_im=512,
                 fg_fraction=.25,
                 fg_thresh=.5,
                 bg_thresh=.5,
                 ignore_thresh=-1.,
                 use_random=True,
                 cascade_iou=[0.5, 0.6, 0.7],
                 num_classes=80,
                 assign_on_cpu=False):
        super(BBoxAssigner, self).__init__()
        self.batch_size_per_im = batch_size_per_im
        self.fg_fraction = fg_fraction
        self.fg_thresh = fg_thresh
        self.bg_thresh = bg_thresh
        self.ignore_thresh = ignore_thresh
        self.use_random = use_random
        self.cascade_iou = cascade_iou
        self.num_classes = num_classes
        self.assign_on_cpu = assign_on_cpu

    def __call__(self,
                 rpn_rois,
                 rpn_rois_num,
                 inputs,
                 stage=0,
                 is_cascade=False,
                 add_gt_as_proposals=True):
        gt_classes = inputs['gt_class']
        gt_boxes = inputs['gt_bbox']
        is_crowd = inputs.get('is_crowd', None)
        # rois, tgt_labels, tgt_bboxes, tgt_gt_inds
        # new_rois_num
        outs = generate_proposal_target(
            rpn_rois, gt_classes, gt_boxes, self.batch_size_per_im,
            self.fg_fraction, self.fg_thresh, self.bg_thresh, self.num_classes,
            self.ignore_thresh, is_crowd, self.use_random, is_cascade,
            self.cascade_iou[stage], self.assign_on_cpu, add_gt_as_proposals)
        rois = outs[0]
        rois_num = outs[-1]
        # tgt_labels, tgt_bboxes, tgt_gt_inds
        targets = outs[1:4]
        return rois, rois_num, targets


@register
class BBoxLibraAssigner(object):
    __shared__ = ['num_classes']
    """
    Libra-RCNN targets assignment module

    The assignment consists of three steps:
        1. Match RoIs and ground-truth box, label the RoIs with
           foreground or background sample
        2. Sample RoIs to keep a proper ratio between foreground
           and background
        3. Generate the targets for classification and regression branch

    Args:
        batch_size_per_im (int): Total number of RoIs per image.
            default 512
        fg_fraction (float): Fraction of RoIs that is labeled foreground,
            default 0.25
        fg_thresh (float): Minimum overlap required between a RoI and
            ground-truth box for the (roi, gt box) pair to be a foreground
            sample. default 0.5
        bg_thresh (float): Maximum overlap allowed between a RoI and
            ground-truth box for the (roi, gt box) pair to be a background
            sample. default 0.5
        use_random (bool): Use random sampling to choose foreground and
            background boxes, default true
        cascade_iou (list[float]): The list of overlap to select foreground
            and background of each stage, which is only used in Cascade RCNN.
        num_classes (int): The number of classes.
        num_bins (int): The number of IoU bins used by Libra negative
            sampling.
    """

    def __init__(self,
                 batch_size_per_im=512,
                 fg_fraction=.25,
                 fg_thresh=.5,
                 bg_thresh=.5,
                 use_random=True,
                 cascade_iou=[0.5, 0.6, 0.7],
                 num_classes=80,
                 num_bins=3):
        super(BBoxLibraAssigner, self).__init__()
        self.batch_size_per_im = batch_size_per_im
        self.fg_fraction = fg_fraction
        self.fg_thresh = fg_thresh
        self.bg_thresh = bg_thresh
        self.use_random = use_random
        self.cascade_iou = cascade_iou
        self.num_classes = num_classes
        self.num_bins = num_bins

    def __call__(self,
                 rpn_rois,
                 rpn_rois_num,
                 inputs,
                 stage=0,
                 is_cascade=False):
        gt_classes = inputs['gt_class']
        gt_boxes = inputs['gt_bbox']
        # rois, tgt_labels, tgt_bboxes, tgt_gt_inds
        outs = libra_generate_proposal_target(
            rpn_rois, gt_classes, gt_boxes, self.batch_size_per_im,
            self.fg_fraction, self.fg_thresh, self.bg_thresh, self.num_classes,
            self.use_random, is_cascade, self.cascade_iou[stage], self.num_bins)
        rois = outs[0]
        rois_num = outs[-1]
        # tgt_labels, tgt_bboxes, tgt_gt_inds
        targets = outs[1:4]
        return rois, rois_num, targets


@register
@serializable
class MaskAssigner(object):
    __shared__ = ['num_classes', 'mask_resolution']
    """
    Mask targets assignment module

    The assignment consists of three steps:
        1. Select the RoIs labeled as foreground.
        2.
Encode the RoIs and corresponding gt polygons to generate mask target Args: num_classes (int): The number of class mask_resolution (int): The resolution of mask target, default 14 """ def __init__(self, num_classes=80, mask_resolution=14): super(MaskAssigner, self).__init__() self.num_classes = num_classes self.mask_resolution = mask_resolution def __call__(self, rois, tgt_labels, tgt_gt_inds, inputs): gt_segms = inputs['gt_poly'] outs = generate_mask_target(gt_segms, rois, tgt_labels, tgt_gt_inds, self.num_classes, self.mask_resolution) # mask_rois, mask_rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights return outs @register class RBoxAssigner(object): """ assigner of rbox Args: pos_iou_thr (float): threshold of pos samples neg_iou_thr (float): threshold of neg samples min_iou_thr (float): the min threshold of samples ignore_iof_thr (int): the ignored threshold """ def __init__(self, pos_iou_thr=0.5, neg_iou_thr=0.4, min_iou_thr=0.0, ignore_iof_thr=-2): super(RBoxAssigner, self).__init__() self.pos_iou_thr = pos_iou_thr self.neg_iou_thr = neg_iou_thr self.min_iou_thr = min_iou_thr self.ignore_iof_thr = ignore_iof_thr def anchor_valid(self, anchors): """ Args: anchor: M x 4 Returns: """ if anchors.ndim == 3: anchors = anchors.reshape(-1, anchors.shape[-1]) assert anchors.ndim == 2 anchor_num = anchors.shape[0] anchor_valid = np.ones((anchor_num), np.int32) anchor_inds = np.arange(anchor_num) return anchor_inds def rbox2delta(self, proposals, gt, means=[0, 0, 0, 0, 0], stds=[1, 1, 1, 1, 1]): """ Args: proposals: tensor [N, 5] gt: gt [N, 5] means: means [5] stds: stds [5] Returns: """ proposals = proposals.astype(np.float64) PI = np.pi gt_widths = gt[..., 2] gt_heights = gt[..., 3] gt_angle = gt[..., 4] proposals_widths = proposals[..., 2] proposals_heights = proposals[..., 3] proposals_angle = proposals[..., 4] coord = gt[..., 0:2] - proposals[..., 0:2] dx = (np.cos(proposals[..., 4]) * coord[..., 0] + np.sin(proposals[..., 4]) * coord[..., 1]) / proposals_widths dy = (-np.sin(proposals[..., 4]) * coord[..., 0] + np.cos(proposals[..., 4]) * coord[..., 1]) / proposals_heights dw = np.log(gt_widths / proposals_widths) dh = np.log(gt_heights / proposals_heights) da = (gt_angle - proposals_angle) da = (da + PI / 4) % PI - PI / 4 da /= PI deltas = np.stack([dx, dy, dw, dh, da], axis=-1) means = np.array(means, dtype=deltas.dtype) stds = np.array(stds, dtype=deltas.dtype) deltas = (deltas - means) / stds deltas = deltas.astype(np.float32) return deltas def assign_anchor(self, anchors, gt_bboxes, gt_labels, pos_iou_thr, neg_iou_thr, min_iou_thr=0.0, ignore_iof_thr=-2): assert anchors.shape[1] == 4 or anchors.shape[1] == 5 assert gt_bboxes.shape[1] == 4 or gt_bboxes.shape[1] == 5 anchors_xc_yc = anchors gt_bboxes_xc_yc = gt_bboxes # calc rbox iou anchors_xc_yc = anchors_xc_yc.astype(np.float32) gt_bboxes_xc_yc = gt_bboxes_xc_yc.astype(np.float32) anchors_xc_yc = paddle.to_tensor(anchors_xc_yc) gt_bboxes_xc_yc = paddle.to_tensor(gt_bboxes_xc_yc) try: from ext_op import rbox_iou except Exception as e: print("import custom_ops error, try install ext_op " \ "following ppdet/ext_op/README.md", e) sys.stdout.flush() sys.exit(-1) iou = rbox_iou(gt_bboxes_xc_yc, anchors_xc_yc) iou = iou.numpy() iou = iou.T # every gt's anchor's index gt_bbox_anchor_inds = iou.argmax(axis=0) gt_bbox_anchor_iou = iou[gt_bbox_anchor_inds, np.arange(iou.shape[1])] gt_bbox_anchor_iou_inds = np.where(iou == gt_bbox_anchor_iou)[0] # every anchor's gt bbox's index anchor_gt_bbox_inds = iou.argmax(axis=1) anchor_gt_bbox_iou 
= iou[np.arange(iou.shape[0]), anchor_gt_bbox_inds] # (1) set labels=-2 as default labels = np.ones((iou.shape[0], ), dtype=np.int32) * ignore_iof_thr # (2) assign ignore labels[anchor_gt_bbox_iou < min_iou_thr] = ignore_iof_thr # (3) assign neg_ids -1 assign_neg_ids1 = anchor_gt_bbox_iou >= min_iou_thr assign_neg_ids2 = anchor_gt_bbox_iou < neg_iou_thr assign_neg_ids = np.logical_and(assign_neg_ids1, assign_neg_ids2) labels[assign_neg_ids] = -1 # anchor_gt_bbox_iou_inds # (4) assign max_iou as pos_ids >=0 anchor_gt_bbox_iou_inds = anchor_gt_bbox_inds[gt_bbox_anchor_iou_inds] # gt_bbox_anchor_iou_inds = np.logical_and(gt_bbox_anchor_iou_inds, anchor_gt_bbox_iou >= min_iou_thr) labels[gt_bbox_anchor_iou_inds] = gt_labels[anchor_gt_bbox_iou_inds] # (5) assign >= pos_iou_thr as pos_ids iou_pos_iou_thr_ids = anchor_gt_bbox_iou >= pos_iou_thr iou_pos_iou_thr_ids_box_inds = anchor_gt_bbox_inds[iou_pos_iou_thr_ids] labels[iou_pos_iou_thr_ids] = gt_labels[iou_pos_iou_thr_ids_box_inds] return anchor_gt_bbox_inds, anchor_gt_bbox_iou, labels def __call__(self, anchors, gt_bboxes, gt_labels, is_crowd): assert anchors.ndim == 2 assert anchors.shape[1] == 5 assert gt_bboxes.ndim == 2 assert gt_bboxes.shape[1] == 5 pos_iou_thr = self.pos_iou_thr neg_iou_thr = self.neg_iou_thr min_iou_thr = self.min_iou_thr ignore_iof_thr = self.ignore_iof_thr anchor_num = anchors.shape[0] gt_bboxes = gt_bboxes is_crowd_slice = is_crowd not_crowd_inds = np.where(is_crowd_slice == 0) # Step1: match anchor and gt_bbox anchor_gt_bbox_inds, anchor_gt_bbox_iou, labels = self.assign_anchor( anchors, gt_bboxes, gt_labels.reshape(-1), pos_iou_thr, neg_iou_thr, min_iou_thr, ignore_iof_thr) # Step2: sample anchor pos_inds = np.where(labels >= 0)[0] neg_inds = np.where(labels == -1)[0] # Step3: make output anchors_num = anchors.shape[0] bbox_targets = np.zeros_like(anchors) bbox_weights = np.zeros_like(anchors) bbox_gt_bboxes = np.zeros_like(anchors) pos_labels = np.zeros(anchors_num, dtype=np.int32) pos_labels_weights = np.zeros(anchors_num, dtype=np.float32) pos_sampled_anchors = anchors[pos_inds] pos_sampled_gt_boxes = gt_bboxes[anchor_gt_bbox_inds[pos_inds]] if len(pos_inds) > 0: pos_bbox_targets = self.rbox2delta(pos_sampled_anchors, pos_sampled_gt_boxes) bbox_targets[pos_inds, :] = pos_bbox_targets bbox_gt_bboxes[pos_inds, :] = pos_sampled_gt_boxes bbox_weights[pos_inds, :] = 1.0 pos_labels[pos_inds] = labels[pos_inds] pos_labels_weights[pos_inds] = 1.0 if len(neg_inds) > 0: pos_labels_weights[neg_inds] = 1.0 return (pos_labels, pos_labels_weights, bbox_targets, bbox_weights, bbox_gt_bboxes, pos_inds, neg_inds) ================================================ FILE: ppdet/modeling/rbox_utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
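# A minimal round-trip sketch (not from the original file) for the rbox
# helpers defined below, with illustrative values:
#
#     poly = [0., 0., 4., 0., 4., 2., 0., 2.]            # 4x2 axis-aligned box
#     rboxes = poly2rbox_np([poly], rbox_type='le135')   # [[cx, cy, w, h, angle]]
#     polys = rbox2poly_np(rboxes)                       # back to 8-point polygons
#
# For the box above, 'le135' yields approximately [2., 1., 4., 2., 0.].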
import math import paddle import numpy as np import cv2 def norm_angle(angle, range=[-np.pi / 4, np.pi]): return (angle - range[0]) % range[1] + range[0] # rbox function implemented using numpy def poly2rbox_le135_np(poly): """convert poly to rbox [-pi / 4, 3 * pi / 4] Args: poly: [x1, y1, x2, y2, x3, y3, x4, y4] Returns: rbox: [cx, cy, w, h, angle] """ poly = np.array(poly[:8], dtype=np.float32) pt1 = (poly[0], poly[1]) pt2 = (poly[2], poly[3]) pt3 = (poly[4], poly[5]) pt4 = (poly[6], poly[7]) edge1 = np.sqrt((pt1[0] - pt2[0]) * (pt1[0] - pt2[0]) + (pt1[1] - pt2[1]) * (pt1[1] - pt2[1])) edge2 = np.sqrt((pt2[0] - pt3[0]) * (pt2[0] - pt3[0]) + (pt2[1] - pt3[1]) * (pt2[1] - pt3[1])) width = max(edge1, edge2) height = min(edge1, edge2) rbox_angle = 0 if edge1 > edge2: rbox_angle = np.arctan2(float(pt2[1] - pt1[1]), float(pt2[0] - pt1[0])) elif edge2 >= edge1: rbox_angle = np.arctan2(float(pt4[1] - pt1[1]), float(pt4[0] - pt1[0])) rbox_angle = norm_angle(rbox_angle) x_ctr = float(pt1[0] + pt3[0]) / 2 y_ctr = float(pt1[1] + pt3[1]) / 2 return [x_ctr, y_ctr, width, height, rbox_angle] def poly2rbox_oc_np(poly): """convert poly to rbox (0, pi / 2] Args: poly: [x1, y1, x2, y2, x3, y3, x4, y4] Returns: rbox: [cx, cy, w, h, angle] """ points = np.array(poly, dtype=np.float32).reshape((-1, 2)) (cx, cy), (w, h), angle = cv2.minAreaRect(points) # using the new OpenCV Rotated BBox definition since 4.5.1 # if angle < 0, opencv is older than 4.5.1, angle is in [-90, 0) if angle < 0: angle += 90 w, h = h, w # convert angle to [0, 90) if angle == -0.0: angle = 0.0 if angle == 90.0: angle = 0.0 w, h = h, w angle = angle / 180 * np.pi return [cx, cy, w, h, angle] def poly2rbox_np(polys, rbox_type='oc'): """ polys: [x0,y0,x1,y1,x2,y2,x3,y3] to rboxes: [x_ctr,y_ctr,w,h,angle] """ assert rbox_type in ['oc', 'le135'], 'only oc or le135 is supported now' poly2rbox_fn = poly2rbox_oc_np if rbox_type == 'oc' else poly2rbox_le135_np rboxes = [] for poly in polys: x, y, w, h, angle = poly2rbox_fn(poly) rbox = np.array([x, y, w, h, angle], dtype=np.float32) rboxes.append(rbox) return np.array(rboxes) def cal_line_length(point1, point2): return math.sqrt( math.pow(point1[0] - point2[0], 2) + math.pow(point1[1] - point2[1], 2)) def get_best_begin_point_single(coordinate): x1, y1, x2, y2, x3, y3, x4, y4 = coordinate xmin = min(x1, x2, x3, x4) ymin = min(y1, y2, y3, y4) xmax = max(x1, x2, x3, x4) ymax = max(y1, y2, y3, y4) combinate = [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], [[x4, y4], [x1, y1], [x2, y2], [x3, y3]], [[x3, y3], [x4, y4], [x1, y1], [x2, y2]], [[x2, y2], [x3, y3], [x4, y4], [x1, y1]]] dst_coordinate = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]] force = 100000000.0 force_flag = 0 for i in range(4): temp_force = cal_line_length(combinate[i][0], dst_coordinate[0]) \ + cal_line_length(combinate[i][1], dst_coordinate[1]) \ + cal_line_length(combinate[i][2], dst_coordinate[2]) \ + cal_line_length(combinate[i][3], dst_coordinate[3]) if temp_force < force: force = temp_force force_flag = i if force_flag != 0: pass return np.array(combinate[force_flag]).reshape(8) def rbox2poly_np(rboxes): """ rboxes:[x_ctr,y_ctr,w,h,angle] to poly:[x0,y0,x1,y1,x2,y2,x3,y3] """ polys = [] for i in range(len(rboxes)): x_ctr, y_ctr, width, height, angle = rboxes[i][:5] tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) R = np.array([[np.cos(angle), -np.sin(angle)], [np.sin(angle), np.cos(angle)]]) poly = R.dot(rect) x0, x1, x2, x3 = 
poly[0, :4] + x_ctr y0, y1, y2, y3 = poly[1, :4] + y_ctr poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32) poly = get_best_begin_point_single(poly) polys.append(poly) polys = np.array(polys) return polys # rbox function implemented using paddle def box2corners(box): """convert box coordinate to corners Args: box (Tensor): (B, N, 5) with (x, y, w, h, alpha) angle is in [0, 90) Returns: corners (Tensor): (B, N, 4, 2) with (x1, y1, x2, y2, x3, y3, x4, y4) """ B = box.shape[0] x, y, w, h, alpha = paddle.split(box, 5, axis=-1) x4 = paddle.to_tensor( [0.5, 0.5, -0.5, -0.5], dtype=paddle.float32).reshape( (1, 1, 4)) # (1,1,4) x4 = x4 * w # (B, N, 4) y4 = paddle.to_tensor( [-0.5, 0.5, 0.5, -0.5], dtype=paddle.float32).reshape((1, 1, 4)) y4 = y4 * h # (B, N, 4) corners = paddle.stack([x4, y4], axis=-1) # (B, N, 4, 2) sin = paddle.sin(alpha) cos = paddle.cos(alpha) row1 = paddle.concat([cos, sin], axis=-1) row2 = paddle.concat([-sin, cos], axis=-1) # (B, N, 2) rot_T = paddle.stack([row1, row2], axis=-2) # (B, N, 2, 2) rotated = paddle.bmm(corners.reshape([-1, 4, 2]), rot_T.reshape([-1, 2, 2])) rotated = rotated.reshape([B, -1, 4, 2]) # (B*N, 4, 2) -> (B, N, 4, 2) rotated[..., 0] += x rotated[..., 1] += y return rotated def paddle_gather(x, dim, index): index_shape = index.shape index_flatten = index.flatten() if dim < 0: dim = len(x.shape) + dim nd_index = [] for k in range(len(x.shape)): if k == dim: nd_index.append(index_flatten) else: reshape_shape = [1] * len(x.shape) reshape_shape[k] = x.shape[k] x_arange = paddle.arange(x.shape[k], dtype=index.dtype) x_arange = x_arange.reshape(reshape_shape) dim_index = paddle.expand(x_arange, index_shape).flatten() nd_index.append(dim_index) ind2 = paddle.transpose(paddle.stack(nd_index), [1, 0]).astype("int64") paddle_out = paddle.gather_nd(x, ind2).reshape(index_shape) return paddle_out def check_points_in_polys(points, polys): """Check whether point is in rotated boxes Args: points (tensor): (1, L, 2) anchor points polys (tensor): [B, N, 4, 2] gt_polys eps (float): default 1e-9 Returns: is_in_polys (tensor): (B, N, L) """ # [1, L, 2] -> [1, 1, L, 2] points = points.unsqueeze(0) # [B, N, 4, 2] -> [B, N, 1, 2] a, b, c, d = polys.split(4, axis=2) ab = b - a ad = d - a # [B, N, L, 2] ap = points - a # [B, N, 1] norm_ab = paddle.sum(ab * ab, axis=-1) # [B, N, 1] norm_ad = paddle.sum(ad * ad, axis=-1) # [B, N, L] dot product ap_dot_ab = paddle.sum(ap * ab, axis=-1) # [B, N, L] dot product ap_dot_ad = paddle.sum(ap * ad, axis=-1) # [B, N, L] = |A|*|B|*cos(theta) is_in_polys = (ap_dot_ab >= 0) & (ap_dot_ab <= norm_ab) & ( ap_dot_ad >= 0) & (ap_dot_ad <= norm_ad) return is_in_polys def check_points_in_rotated_boxes(points, boxes): """Check whether point is in rotated boxes Args: points (tensor): (1, L, 2) anchor points boxes (tensor): [B, N, 5] gt_bboxes eps (float): default 1e-9 Returns: is_in_box (tensor): (B, N, L) """ # [B, N, 5] -> [B, N, 4, 2] corners = box2corners(boxes) # [1, L, 2] -> [1, 1, L, 2] points = points.unsqueeze(0) # [B, N, 4, 2] -> [B, N, 1, 2] a, b, c, d = corners.split(4, axis=2) ab = b - a ad = d - a # [B, N, L, 2] ap = points - a # [B, N, L] norm_ab = paddle.sum(ab * ab, axis=-1) # [B, N, L] norm_ad = paddle.sum(ad * ad, axis=-1) # [B, N, L] dot product ap_dot_ab = paddle.sum(ap * ab, axis=-1) # [B, N, L] dot product ap_dot_ad = paddle.sum(ap * ad, axis=-1) # [B, N, L] = |A|*|B|*cos(theta) is_in_box = (ap_dot_ab >= 0) & (ap_dot_ab <= norm_ab) & (ap_dot_ad >= 0) & ( ap_dot_ad <= norm_ad) return is_in_box def 
rotated_iou_similarity(box1, box2, eps=1e-9, func=''):
    """Calculate IoU between box1 and box2.

    Args:
        box1 (Tensor): box with the shape [N, M1, 5]
        box2 (Tensor): box with the shape [N, M2, 5]

    Returns:
        iou (Tensor): IoU between box1 and box2 with the shape [N, M1, M2]
    """
    from ext_op import rbox_iou

    rotated_ious = []
    for b1, b2 in zip(box1, box2):
        rotated_ious.append(rbox_iou(b1, b2))

    return paddle.stack(rotated_ious, axis=0)


================================================
FILE: ppdet/modeling/reid/__init__.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import jde_embedding_head
from . import fairmot_embedding_head
from . import resnet
from . import pyramidal_embedding
from . import pplcnet_embedding
from . import resnet_embedding

from .fairmot_embedding_head import *
from .jde_embedding_head import *
from .resnet import *
from .pyramidal_embedding import *
from .pplcnet_embedding import *
from .resnet_embedding import *

================================================
FILE: ppdet/modeling/reid/fairmot_embedding_head.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import KaimingUniform, Uniform
from ppdet.core.workspace import register
from ppdet.modeling.heads.centernet_head import ConvLayer

__all__ = ['FairMOTEmbeddingHead']


@register
class FairMOTEmbeddingHead(nn.Layer):
    __shared__ = ['num_classes']
    """
    Args:
        in_channels (int): the channel number of input to FairMOTEmbeddingHead.
        ch_head (int): the channel of features before being fed into the embedding, 256 by default.
        ch_emb (int): the channel of the embedding feature, 128 by default.
        num_identities_dict (dict): the number of identities of each category,
            supports single-class and multi-class, {0: 14455} by default.
""" def __init__(self, in_channels, ch_head=256, ch_emb=128, num_classes=1, num_identities_dict={0: 14455}): super(FairMOTEmbeddingHead, self).__init__() assert num_classes >= 1 self.num_classes = num_classes self.ch_emb = ch_emb self.num_identities_dict = num_identities_dict self.reid = nn.Sequential( ConvLayer( in_channels, ch_head, kernel_size=3, padding=1, bias=True), nn.ReLU(), ConvLayer( ch_head, ch_emb, kernel_size=1, stride=1, padding=0, bias=True)) param_attr = paddle.ParamAttr(initializer=KaimingUniform()) bound = 1 / math.sqrt(ch_emb) bias_attr = paddle.ParamAttr(initializer=Uniform(-bound, bound)) self.reid_loss = nn.CrossEntropyLoss(ignore_index=-1, reduction='sum') if num_classes == 1: nID = self.num_identities_dict[0] # single class self.classifier = nn.Linear( ch_emb, nID, weight_attr=param_attr, bias_attr=bias_attr) # When num_identities(nID) is 1, emb_scale is set as 1 self.emb_scale = math.sqrt(2) * math.log(nID - 1) if nID > 1 else 1 else: self.classifiers = dict() self.emb_scale_dict = dict() for cls_id, nID in self.num_identities_dict.items(): self.classifiers[str(cls_id)] = nn.Linear( ch_emb, nID, weight_attr=param_attr, bias_attr=bias_attr) # When num_identities(nID) is 1, emb_scale is set as 1 self.emb_scale_dict[str(cls_id)] = math.sqrt(2) * math.log( nID - 1) if nID > 1 else 1 @classmethod def from_config(cls, cfg, input_shape): if isinstance(input_shape, (list, tuple)): input_shape = input_shape[0] return {'in_channels': input_shape.channels} def process_by_class(self, bboxes, embedding, bbox_inds, topk_clses): pred_dets, pred_embs = [], [] for cls_id in range(self.num_classes): inds_masks = topk_clses == cls_id inds_masks = paddle.cast(inds_masks, 'float32') pos_num = inds_masks.sum().numpy() if pos_num == 0: continue cls_inds_mask = inds_masks > 0 bbox_mask = paddle.nonzero(cls_inds_mask) cls_bboxes = paddle.gather_nd(bboxes, bbox_mask) pred_dets.append(cls_bboxes) cls_inds = paddle.masked_select(bbox_inds, cls_inds_mask) cls_inds = cls_inds.unsqueeze(-1) cls_embedding = paddle.gather_nd(embedding, cls_inds) pred_embs.append(cls_embedding) return paddle.concat(pred_dets), paddle.concat(pred_embs) def forward(self, neck_feat, inputs, bboxes=None, bbox_inds=None, topk_clses=None): reid_feat = self.reid(neck_feat) if self.training: if self.num_classes == 1: loss = self.get_loss(reid_feat, inputs) else: loss = self.get_mc_loss(reid_feat, inputs) return loss else: assert bboxes is not None and bbox_inds is not None reid_feat = F.normalize(reid_feat) embedding = paddle.transpose(reid_feat, [0, 2, 3, 1]) embedding = paddle.reshape(embedding, [-1, self.ch_emb]) # embedding shape: [bs * h * w, ch_emb] if self.num_classes == 1: pred_dets = bboxes pred_embs = paddle.gather(embedding, bbox_inds) else: pred_dets, pred_embs = self.process_by_class( bboxes, embedding, bbox_inds, topk_clses) return pred_dets, pred_embs def get_loss(self, feat, inputs): index = inputs['index'] mask = inputs['index_mask'] target = inputs['reid'] target = paddle.masked_select(target, mask > 0) target = paddle.unsqueeze(target, 1) feat = paddle.transpose(feat, perm=[0, 2, 3, 1]) feat_n, feat_h, feat_w, feat_c = feat.shape feat = paddle.reshape(feat, shape=[feat_n, -1, feat_c]) index = paddle.unsqueeze(index, 2) batch_inds = list() for i in range(feat_n): batch_ind = paddle.full( shape=[1, index.shape[1], 1], fill_value=i, dtype='int64') batch_inds.append(batch_ind) batch_inds = paddle.concat(batch_inds, axis=0) index = paddle.concat(x=[batch_inds, index], axis=2) feat = paddle.gather_nd(feat, 
index=index) mask = paddle.unsqueeze(mask, axis=2) mask = paddle.expand_as(mask, feat) mask.stop_gradient = True feat = paddle.masked_select(feat, mask > 0) feat = paddle.reshape(feat, shape=[-1, feat_c]) feat = F.normalize(feat) feat = self.emb_scale * feat logit = self.classifier(feat) target.stop_gradient = True loss = self.reid_loss(logit, target) valid = (target != self.reid_loss.ignore_index) valid.stop_gradient = True count = paddle.sum((paddle.cast(valid, dtype=np.int32))) count.stop_gradient = True if count > 0: loss = loss / count return loss def get_mc_loss(self, feat, inputs): # feat.shape = [bs, ch_emb, h, w] assert 'cls_id_map' in inputs and 'cls_tr_ids' in inputs index = inputs['index'] mask = inputs['index_mask'] cls_id_map = inputs['cls_id_map'] # [bs, h, w] cls_tr_ids = inputs['cls_tr_ids'] # [bs, num_classes, h, w] feat = paddle.transpose(feat, perm=[0, 2, 3, 1]) feat_n, feat_h, feat_w, feat_c = feat.shape feat = paddle.reshape(feat, shape=[feat_n, -1, feat_c]) index = paddle.unsqueeze(index, 2) batch_inds = list() for i in range(feat_n): batch_ind = paddle.full( shape=[1, index.shape[1], 1], fill_value=i, dtype='int64') batch_inds.append(batch_ind) batch_inds = paddle.concat(batch_inds, axis=0) index = paddle.concat(x=[batch_inds, index], axis=2) feat = paddle.gather_nd(feat, index=index) mask = paddle.unsqueeze(mask, axis=2) mask = paddle.expand_as(mask, feat) mask.stop_gradient = True feat = paddle.masked_select(feat, mask > 0) feat = paddle.reshape(feat, shape=[-1, feat_c]) reid_losses = 0 for cls_id, id_num in self.num_identities_dict.items(): # target cur_cls_tr_ids = paddle.reshape( cls_tr_ids[:, cls_id, :, :], shape=[feat_n, -1]) # [bs, h*w] cls_id_target = paddle.gather_nd(cur_cls_tr_ids, index=index) mask = inputs['index_mask'] cls_id_target = paddle.masked_select(cls_id_target, mask > 0) cls_id_target.stop_gradient = True # feat cls_id_feat = self.emb_scale_dict[str(cls_id)] * F.normalize(feat) cls_id_pred = self.classifiers[str(cls_id)](cls_id_feat) loss = self.reid_loss(cls_id_pred, cls_id_target) valid = (cls_id_target != self.reid_loss.ignore_index) valid.stop_gradient = True count = paddle.sum((paddle.cast(valid, dtype=np.int32))) count.stop_gradient = True if count > 0: loss = loss / count reid_losses += loss return reid_losses ================================================ FILE: ppdet/modeling/reid/jde_embedding_head.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
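# A small sketch (toy shapes, illustrative only) of the index trick used by
# FairMOTEmbeddingHead.get_loss/get_mc_loss above: paddle.gather_nd wants
# (batch_id, position) pairs, so a column of batch ids is prepended to the
# per-image flat indices before the per-pixel embeddings are gathered:
#
#     bs, hw, c, k = 2, 6, 4, 3
#     feat = paddle.randn([bs, hw, c])                  # flattened H*W features
#     index = paddle.to_tensor([[0, 2, 5], [1, 3, 4]])  # [bs, k] flat positions
#     index = index.unsqueeze(2)                        # [bs, k, 1]
#     batch_inds = paddle.arange(bs).reshape([bs, 1, 1]).tile([1, k, 1])
#     nd_index = paddle.concat([batch_inds, index], axis=2)  # [bs, k, 2]
#     picked = paddle.gather_nd(feat, nd_index)              # [bs, k, c]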
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register
from paddle.nn.initializer import Normal, Constant

__all__ = ['JDEEmbeddingHead']


class LossParam(nn.Layer):
    def __init__(self, init_value=0., use_uncertainty=True):
        # `use_uncertainty` is reserved for API compatibility; not used below.
        super(LossParam, self).__init__()
        self.loss_param = self.create_parameter(
            shape=[1],
            attr=ParamAttr(initializer=Constant(value=init_value)),
            dtype="float32")

    def forward(self, inputs):
        out = paddle.exp(-self.loss_param) * inputs + self.loss_param
        return out * 0.5


@register
class JDEEmbeddingHead(nn.Layer):
    __shared__ = ['num_classes']
    __inject__ = ['emb_loss', 'jde_loss']
    """
    JDEEmbeddingHead
    Args:
        num_classes(int): Number of classes. Only supports single-class tracking.
        num_identities(int): Number of identities.
        anchor_levels(int): Number of anchor levels, same as FPN levels.
        anchor_scales(int): Number of anchor scales on each FPN level.
        embedding_dim(int): Embedding dimension. Default: 512.
        emb_loss(object): Instance of 'JDEEmbeddingLoss'
        jde_loss(object): Instance of 'JDELoss'
    """

    def __init__(
            self,
            num_classes=1,
            num_identities=14455,  # dataset.num_identities_dict[0]
            anchor_levels=3,
            anchor_scales=4,
            embedding_dim=512,
            emb_loss='JDEEmbeddingLoss',
            jde_loss='JDELoss'):
        super(JDEEmbeddingHead, self).__init__()
        self.num_classes = num_classes
        self.num_identities = num_identities
        self.anchor_levels = anchor_levels
        self.anchor_scales = anchor_scales
        self.embedding_dim = embedding_dim
        self.emb_loss = emb_loss
        self.jde_loss = jde_loss

        self.emb_scale = math.sqrt(2) * math.log(
            self.num_identities - 1) if self.num_identities > 1 else 1

        self.identify_outputs = []
        self.loss_params_cls = []
        self.loss_params_reg = []
        self.loss_params_ide = []
        for i in range(self.anchor_levels):
            name = 'identify_output.{}'.format(i)
            identify_output = self.add_sublayer(
                name,
                nn.Conv2D(
                    in_channels=64 * (2**self.anchor_levels) // (2**i),
                    out_channels=self.embedding_dim,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    bias_attr=ParamAttr(regularizer=L2Decay(0.))))
            self.identify_outputs.append(identify_output)

            loss_p_cls = self.add_sublayer('cls.{}'.format(i), LossParam(-4.15))
            self.loss_params_cls.append(loss_p_cls)
            loss_p_reg = self.add_sublayer('reg.{}'.format(i), LossParam(-4.85))
            self.loss_params_reg.append(loss_p_reg)
            loss_p_ide = self.add_sublayer('ide.{}'.format(i), LossParam(-2.3))
            self.loss_params_ide.append(loss_p_ide)

        self.classifier = self.add_sublayer(
            'classifier',
            nn.Linear(
                self.embedding_dim,
                self.num_identities,
                weight_attr=ParamAttr(
                    learning_rate=1., initializer=Normal(
                        mean=0.0, std=0.01)),
                bias_attr=ParamAttr(
                    learning_rate=2., regularizer=L2Decay(0.))))

    def forward(self,
                identify_feats,
                targets,
                loss_confs=None,
                loss_boxes=None,
                bboxes=None,
                boxes_idx=None,
                nms_keep_idx=None):
        assert self.num_classes == 1, 'JDE only supports single-class MOT.'
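        # Training: per-level embedding maps feed emb_loss, then jde_loss
        # combines the cls/reg/ide terms with the learnable LossParam
        # uncertainty weights. Eval: embeddings are gathered for the boxes
        # kept by NMS and (pred_dets, pred_embs) are returned for the tracker.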
assert len(identify_feats) == self.anchor_levels ide_outs = [] for feat, ide_head in zip(identify_feats, self.identify_outputs): ide_outs.append(ide_head(feat)) if self.training: assert len(loss_confs) == len(loss_boxes) == self.anchor_levels loss_ides = self.emb_loss(ide_outs, targets, self.emb_scale, self.classifier) jde_losses = self.jde_loss( loss_confs, loss_boxes, loss_ides, self.loss_params_cls, self.loss_params_reg, self.loss_params_ide, targets) return jde_losses else: assert bboxes is not None assert boxes_idx is not None assert nms_keep_idx is not None emb_outs = self.get_emb_outs(ide_outs) emb_valid = paddle.gather_nd(emb_outs, boxes_idx) pred_embs = paddle.gather_nd(emb_valid, nms_keep_idx) input_shape = targets['image'].shape[2:] # input_shape: [h, w], before data transforms, set in model config im_shape = targets['im_shape'][0].numpy() # im_shape: [new_h, new_w], after data transforms scale_factor = targets['scale_factor'][0].numpy() bboxes[:, 2:] = self.scale_coords(bboxes[:, 2:], input_shape, im_shape, scale_factor) # cls_ids, scores, tlwhs pred_dets = bboxes return pred_dets, pred_embs def scale_coords(self, coords, input_shape, im_shape, scale_factor): ratio = scale_factor[0] pad_w = (input_shape[1] - int(im_shape[1])) / 2 pad_h = (input_shape[0] - int(im_shape[0])) / 2 coords = paddle.cast(coords, 'float32') coords[:, 0::2] -= pad_w coords[:, 1::2] -= pad_h coords[:, 0:4] /= ratio coords[:, :4] = paddle.clip( coords[:, :4], min=0, max=coords[:, :4].max()) return coords.round() def get_emb_and_gt_outs(self, ide_outs, targets): emb_and_gts = [] for i, p_ide in enumerate(ide_outs): t_conf = targets['tconf{}'.format(i)] t_ide = targets['tide{}'.format(i)] p_ide = p_ide.transpose((0, 2, 3, 1)) p_ide_flatten = paddle.reshape(p_ide, [-1, self.embedding_dim]) mask = t_conf > 0 mask = paddle.cast(mask, dtype="int64") emb_mask = mask.max(1).flatten() emb_mask_inds = paddle.nonzero(emb_mask > 0).flatten() if len(emb_mask_inds) > 0: t_ide_flatten = paddle.reshape(t_ide.max(1), [-1, 1]) tids = paddle.gather(t_ide_flatten, emb_mask_inds) embedding = paddle.gather(p_ide_flatten, emb_mask_inds) embedding = self.emb_scale * F.normalize(embedding) emb_and_gt = paddle.concat([embedding, tids], axis=1) emb_and_gts.append(emb_and_gt) if len(emb_and_gts) > 0: return paddle.concat(emb_and_gts, axis=0) else: return paddle.zeros((1, self.embedding_dim + 1)) def get_emb_outs(self, ide_outs): emb_outs = [] for i, p_ide in enumerate(ide_outs): p_ide = p_ide.transpose((0, 2, 3, 1)) p_ide_repeat = paddle.tile(p_ide, [self.anchor_scales, 1, 1, 1]) embedding = F.normalize(p_ide_repeat, axis=-1) emb = paddle.reshape(embedding, [-1, self.embedding_dim]) emb_outs.append(emb) if len(emb_outs) > 0: return paddle.concat(emb_outs, axis=0) else: return paddle.zeros((1, self.embedding_dim)) ================================================ FILE: ppdet/modeling/reid/pplcnet_embedding.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal, Constant from paddle import ParamAttr from paddle.nn import AdaptiveAvgPool2D, BatchNorm2D, Conv2D, Linear from paddle.regularizer import L2Decay from paddle.nn.initializer import KaimingNormal, XavierNormal from ppdet.core.workspace import register __all__ = ['PPLCNetEmbedding'] # Each element(list) represents a depthwise block, which is composed of k, in_c, out_c, s, use_se. # k: kernel_size # in_c: input channel number in depthwise block # out_c: output channel number in depthwise block # s: stride in depthwise block # use_se: whether to use SE block NET_CONFIG = { "blocks2": #k, in_c, out_c, s, use_se [[3, 16, 32, 1, False]], "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]], "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]], "blocks5": [[3, 128, 256, 2, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False]], "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]] } def make_divisible(v, divisor=8, min_value=None): if min_value is None: min_value = divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) if new_v < 0.9 * v: new_v += divisor return new_v class ConvBNLayer(nn.Layer): def __init__(self, num_channels, filter_size, num_filters, stride, num_groups=1): super().__init__() self.conv = Conv2D( in_channels=num_channels, out_channels=num_filters, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=num_groups, weight_attr=ParamAttr(initializer=KaimingNormal()), bias_attr=False) self.bn = BatchNorm2D( num_filters, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self.hardswish = nn.Hardswish() def forward(self, x): x = self.conv(x) x = self.bn(x) x = self.hardswish(x) return x class DepthwiseSeparable(nn.Layer): def __init__(self, num_channels, num_filters, stride, dw_size=3, use_se=False): super().__init__() self.use_se = use_se self.dw_conv = ConvBNLayer( num_channels=num_channels, num_filters=num_channels, filter_size=dw_size, stride=stride, num_groups=num_channels) if use_se: self.se = SEModule(num_channels) self.pw_conv = ConvBNLayer( num_channels=num_channels, filter_size=1, num_filters=num_filters, stride=1) def forward(self, x): x = self.dw_conv(x) if self.use_se: x = self.se(x) x = self.pw_conv(x) return x class SEModule(nn.Layer): def __init__(self, channel, reduction=4): super().__init__() self.avg_pool = AdaptiveAvgPool2D(1) self.conv1 = Conv2D( in_channels=channel, out_channels=channel // reduction, kernel_size=1, stride=1, padding=0) self.relu = nn.ReLU() self.conv2 = Conv2D( in_channels=channel // reduction, out_channels=channel, kernel_size=1, stride=1, padding=0) self.hardsigmoid = nn.Hardsigmoid() def forward(self, x): identity = x x = self.avg_pool(x) x = self.conv1(x) x = self.relu(x) x = self.conv2(x) x = self.hardsigmoid(x) x = paddle.multiply(x=identity, y=x) return x class PPLCNet(nn.Layer): """ PP-LCNet, see https://arxiv.org/abs/2109.15099. This code is different from PPLCNet in ppdet/modeling/backbones/lcnet.py or in PaddleClas, because the output is the flatten feature of last_conv. Args: scale (float): Scale ratio of channels. 
class_expand (int): Number of channels of conv feature. """ def __init__(self, scale=1.0, class_expand=1280): super(PPLCNet, self).__init__() self.scale = scale self.class_expand = class_expand self.conv1 = ConvBNLayer( num_channels=3, filter_size=3, num_filters=make_divisible(16 * scale), stride=2) self.blocks2 = nn.Sequential(*[ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"]) ]) self.blocks3 = nn.Sequential(*[ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks3"]) ]) self.blocks4 = nn.Sequential(*[ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks4"]) ]) self.blocks5 = nn.Sequential(*[ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks5"]) ]) self.blocks6 = nn.Sequential(*[ DepthwiseSeparable( num_channels=make_divisible(in_c * scale), num_filters=make_divisible(out_c * scale), dw_size=k, stride=s, use_se=se) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks6"]) ]) self.avg_pool = AdaptiveAvgPool2D(1) self.last_conv = Conv2D( in_channels=make_divisible(NET_CONFIG["blocks6"][-1][2] * scale), out_channels=self.class_expand, kernel_size=1, stride=1, padding=0, bias_attr=False) self.hardswish = nn.Hardswish() self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) def forward(self, x): x = self.conv1(x) x = self.blocks2(x) x = self.blocks3(x) x = self.blocks4(x) x = self.blocks5(x) x = self.blocks6(x) x = self.avg_pool(x) x = self.last_conv(x) x = self.hardswish(x) x = self.flatten(x) return x class FC(nn.Layer): def __init__(self, input_ch, output_ch): super(FC, self).__init__() weight_attr = ParamAttr(initializer=XavierNormal()) self.fc = paddle.nn.Linear(input_ch, output_ch, weight_attr=weight_attr) def forward(self, x): out = self.fc(x) return out @register class PPLCNetEmbedding(nn.Layer): """ PPLCNet Embedding Args: input_ch (int): Number of channels of input conv feature. output_ch (int): Number of channels of output conv feature. """ def __init__(self, scale=2.5, input_ch=1280, output_ch=512): super(PPLCNetEmbedding, self).__init__() self.backbone = PPLCNet(scale=scale) self.neck = FC(input_ch, output_ch) def forward(self, x): feat = self.backbone(x) feat_out = self.neck(feat) return feat_out ================================================ FILE: ppdet/modeling/reid/pyramidal_embedding.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
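# A shape sketch for PPLCNetEmbedding above (the crop size is an assumption;
# ReID models typically take small person crops):
#
#     model = PPLCNetEmbedding(scale=2.5, input_ch=1280, output_ch=512)
#     x = paddle.randn([4, 3, 128, 64])  # NCHW person crops
#     emb = model(x)                     # PPLCNet -> flatten -> FC neck
#     assert emb.shape == [4, 512]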
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal, Constant from paddle import ParamAttr from .resnet import ResNet50, ResNet101 from ppdet.core.workspace import register __all__ = ['PCBPyramid'] @register class PCBPyramid(nn.Layer): """ PCB (Part-based Convolutional Baseline), see https://arxiv.org/abs/1711.09349, Pyramidal Person Re-IDentification, see https://arxiv.org/abs/1810.12193 Args: input_ch (int): Number of channels of the input feature. num_stripes (int): Number of sub-parts. used_levels (tuple): Whether the level is used, 1 means used. num_classes (int): Number of classes for identities, default 751 in Market-1501 dataset. last_conv_stride (int): Stride of the last conv. last_conv_dilation (int): Dilation of the last conv. num_conv_out_channels (int): Number of channels of conv feature. """ def __init__(self, input_ch=2048, model_name='ResNet101', num_stripes=6, used_levels=(1, 1, 1, 1, 1, 1), num_classes=751, last_conv_stride=1, last_conv_dilation=1, num_conv_out_channels=128): super(PCBPyramid, self).__init__() self.num_stripes = num_stripes self.used_levels = used_levels self.num_classes = num_classes self.num_in_each_level = [i for i in range(self.num_stripes, 0, -1)] self.num_branches = sum(self.num_in_each_level) assert model_name in ['ResNet50', 'ResNet101'], "Unsupported ReID arch: {}".format(model_name) self.base = eval(model_name)( lr_mult=0.1, last_conv_stride=last_conv_stride, last_conv_dilation=last_conv_dilation) self.dropout_layer = nn.Dropout(p=0.2) self.pyramid_conv_list0, self.pyramid_fc_list0 = self.basic_branch( num_conv_out_channels, input_ch) def basic_branch(self, num_conv_out_channels, input_ch): # the level indexes are defined from fine to coarse, # the branch will contain one more part than that of its previous level # the sliding step is set to 1 pyramid_conv_list = nn.LayerList() pyramid_fc_list = nn.LayerList() idx_levels = 0 for idx_branches in range(self.num_branches): if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]): idx_levels += 1 pyramid_conv_list.append( nn.Sequential( nn.Conv2D(input_ch, num_conv_out_channels, 1), nn.BatchNorm2D(num_conv_out_channels), nn.ReLU())) idx_levels = 0 for idx_branches in range(self.num_branches): if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]): idx_levels += 1 fc = nn.Linear( in_features=num_conv_out_channels, out_features=self.num_classes, weight_attr=ParamAttr(initializer=Normal( mean=0., std=0.001)), bias_attr=ParamAttr(initializer=Constant(value=0.))) pyramid_fc_list.append(fc) return pyramid_conv_list, pyramid_fc_list def pyramid_forward(self, feat): each_stripe_size = int(feat.shape[2] / self.num_stripes) feat_list, logits_list = [], [] idx_levels = 0 used_branches = 0 for idx_branches in range(self.num_branches): if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]): idx_levels += 1 idx_in_each_level = idx_branches - sum(self.num_in_each_level[ 0:idx_levels]) stripe_size_in_each_level = each_stripe_size * (idx_levels + 1) start = idx_in_each_level * each_stripe_size end = start + stripe_size_in_each_level k = feat.shape[-1] local_feat_avgpool = F.avg_pool2d( feat[:, :, start:end, :], kernel_size=(stripe_size_in_each_level, k)) local_feat_maxpool = F.max_pool2d( feat[:, :, start:end, :], kernel_size=(stripe_size_in_each_level, k)) local_feat = local_feat_avgpool + local_feat_maxpool local_feat = 
self.pyramid_conv_list0[used_branches](local_feat) local_feat = paddle.reshape( local_feat, shape=[local_feat.shape[0], -1]) feat_list.append(local_feat) local_logits = self.pyramid_fc_list0[used_branches]( self.dropout_layer(local_feat)) logits_list.append(local_logits) used_branches += 1 return feat_list, logits_list def forward(self, x): feat = self.base(x) assert feat.shape[2] % self.num_stripes == 0 feat_list, logits_list = self.pyramid_forward(feat) feat_out = paddle.concat(feat_list, axis=-1) return feat_out ================================================ FILE: ppdet/modeling/reid/resnet.py ================================================ # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import math import paddle from paddle import ParamAttr import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Normal __all__ = ["ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"] class ConvBNLayer(nn.Layer): def __init__(self, num_channels, num_filters, filter_size, stride=1, dilation=1, groups=1, act=None, lr_mult=1.0, name=None, data_format="NCHW"): super(ConvBNLayer, self).__init__() conv_stdv = filter_size * filter_size * num_filters self._conv = nn.Conv2D( in_channels=num_channels, out_channels=num_filters, kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, dilation=dilation, groups=groups, weight_attr=ParamAttr( learning_rate=lr_mult, initializer=Normal(0, math.sqrt(2. 
/ conv_stdv))), bias_attr=False, data_format=data_format) self._batch_norm = nn.BatchNorm2D(num_filters) self.act = act def forward(self, inputs): y = self._conv(inputs) y = self._batch_norm(y) if self.act: y = getattr(F, self.act)(y) return y class BottleneckBlock(nn.Layer): def __init__(self, num_channels, num_filters, stride, shortcut=True, name=None, lr_mult=1.0, dilation=1, data_format="NCHW"): super(BottleneckBlock, self).__init__() self.conv0 = ConvBNLayer( num_channels=num_channels, num_filters=num_filters, filter_size=1, dilation=dilation, act="relu", lr_mult=lr_mult, name=name + "_branch2a", data_format=data_format) self.conv1 = ConvBNLayer( num_channels=num_filters, num_filters=num_filters, filter_size=3, dilation=dilation, stride=stride, act="relu", lr_mult=lr_mult, name=name + "_branch2b", data_format=data_format) self.conv2 = ConvBNLayer( num_channels=num_filters, num_filters=num_filters * 4, filter_size=1, dilation=dilation, act=None, lr_mult=lr_mult, name=name + "_branch2c", data_format=data_format) if not shortcut: self.short = ConvBNLayer( num_channels=num_channels, num_filters=num_filters * 4, filter_size=1, dilation=dilation, stride=stride, lr_mult=lr_mult, name=name + "_branch1", data_format=data_format) self.shortcut = shortcut self._num_channels_out = num_filters * 4 def forward(self, inputs): y = self.conv0(inputs) conv1 = self.conv1(y) conv2 = self.conv2(conv1) if self.shortcut: short = inputs else: short = self.short(inputs) y = paddle.add(x=short, y=conv2) y = F.relu(y) return y class BasicBlock(nn.Layer): def __init__(self, num_channels, num_filters, stride, shortcut=True, name=None, data_format="NCHW"): super(BasicBlock, self).__init__() self.stride = stride self.conv0 = ConvBNLayer( num_channels=num_channels, num_filters=num_filters, filter_size=3, stride=stride, act="relu", name=name + "_branch2a", data_format=data_format) self.conv1 = ConvBNLayer( num_channels=num_filters, num_filters=num_filters, filter_size=3, act=None, name=name + "_branch2b", data_format=data_format) if not shortcut: self.short = ConvBNLayer( num_channels=num_channels, num_filters=num_filters, filter_size=1, stride=stride, name=name + "_branch1", data_format=data_format) self.shortcut = shortcut def forward(self, inputs): y = self.conv0(inputs) conv1 = self.conv1(y) if self.shortcut: short = inputs else: short = self.short(inputs) y = paddle.add(x=short, y=conv1) y = F.relu(y) return y class ResNet(nn.Layer): def __init__(self, layers=50, lr_mult=1.0, last_conv_stride=2, last_conv_dilation=1): super(ResNet, self).__init__() self.layers = layers self.data_format = "NCHW" self.input_image_channel = 3 supported_layers = [18, 34, 50, 101, 152] assert layers in supported_layers, \ "supported layers are {} but input layer is {}".format( supported_layers, layers) if layers == 18: depth = [2, 2, 2, 2] elif layers == 34 or layers == 50: depth = [3, 4, 6, 3] elif layers == 101: depth = [3, 4, 23, 3] elif layers == 152: depth = [3, 8, 36, 3] num_channels = [64, 256, 512, 1024] if layers >= 50 else [64, 64, 128, 256] num_filters = [64, 128, 256, 512] self.conv = ConvBNLayer( num_channels=self.input_image_channel, num_filters=64, filter_size=7, stride=2, act="relu", lr_mult=lr_mult, name="conv1", data_format=self.data_format) self.pool2d_max = nn.MaxPool2D( kernel_size=3, stride=2, padding=1, data_format=self.data_format) self.block_list = [] if layers >= 50: for block in range(len(depth)): shortcut = False for i in range(depth[block]): if layers in [101, 152] and block == 2: if i == 0: conv_name = 
"res" + str(block + 2) + "a" else: conv_name = "res" + str(block + 2) + "b" + str(i) else: conv_name = "res" + str(block + 2) + chr(97 + i) if i != 0 or block == 0: stride = 1 elif block == len(depth) - 1: stride = last_conv_stride else: stride = 2 bottleneck_block = self.add_sublayer( conv_name, BottleneckBlock( num_channels=num_channels[block] if i == 0 else num_filters[block] * 4, num_filters=num_filters[block], stride=stride, shortcut=shortcut, name=conv_name, lr_mult=lr_mult, dilation=last_conv_dilation if block == len(depth) - 1 else 1, data_format=self.data_format)) self.block_list.append(bottleneck_block) shortcut = True else: for block in range(len(depth)): shortcut = False for i in range(depth[block]): conv_name = "res" + str(block + 2) + chr(97 + i) basic_block = self.add_sublayer( conv_name, BasicBlock( num_channels=num_channels[block] if i == 0 else num_filters[block], num_filters=num_filters[block], stride=2 if i == 0 and block != 0 else 1, shortcut=shortcut, name=conv_name, data_format=self.data_format)) self.block_list.append(basic_block) shortcut = True def forward(self, inputs): y = self.conv(inputs) y = self.pool2d_max(y) for block in self.block_list: y = block(y) return y def ResNet18(**args): model = ResNet(layers=18, **args) return model def ResNet34(**args): model = ResNet(layers=34, **args) return model def ResNet50(pretrained=None, **args): model = ResNet(layers=50, **args) if pretrained is not None: if not (os.path.isdir(pretrained) or os.path.exists(pretrained + '.pdparams')): raise ValueError("Model pretrain path {} does not " "exists.".format(pretrained)) param_state_dict = paddle.load(pretrained + '.pdparams') model.set_dict(param_state_dict) return model def ResNet101(pretrained=None, **args): model = ResNet(layers=101, **args) if pretrained is not None: if not (os.path.isdir(pretrained) or os.path.exists(pretrained + '.pdparams')): raise ValueError("Model pretrain path {} does not " "exists.".format(pretrained)) param_state_dict = paddle.load(pretrained + '.pdparams') model.set_dict(param_state_dict) return model def ResNet152(**args): model = ResNet(layers=152, **args) return model ================================================ FILE: ppdet/modeling/reid/resnet_embedding.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import os import paddle import paddle.nn.functional as F from paddle import nn from .resnet import ResNet50, ResNet101 from ppdet.core.workspace import register __all__ = ['ResNetEmbedding'] @register class ResNetEmbedding(nn.Layer): in_planes = 2048 def __init__(self, model_name='ResNet50', last_stride=1): super(ResNetEmbedding, self).__init__() assert model_name in ['ResNet50', 'ResNet101'], "Unsupported ReID arch: {}".format(model_name) self.base = eval(model_name)(last_conv_stride=last_stride) self.gap = nn.AdaptiveAvgPool2D(output_size=1) self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) self.bn = nn.BatchNorm1D(self.in_planes, bias_attr=False) def forward(self, x): base_out = self.base(x) global_feat = self.gap(base_out) global_feat = self.flatten(global_feat) global_feat = self.bn(global_feat) return global_feat ================================================ FILE: ppdet/modeling/shape_spec.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The code is based on: # https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/shape_spec.py from collections import namedtuple class ShapeSpec( namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])): def __new__(cls, channels=None, height=None, width=None, stride=None): return super(ShapeSpec, cls).__new__(cls, channels, height, width, stride) ================================================ FILE: ppdet/modeling/ssod/__init__.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import utils from . import losses from .utils import * from .losses import * ================================================ FILE: ppdet/modeling/ssod/losses.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
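# A usage sketch for ResNetEmbedding above (batch and crop sizes are
# assumptions for illustration):
#
#     m = ResNetEmbedding(model_name='ResNet50', last_stride=1)
#     m.eval()
#     feat = m(paddle.randn([2, 3, 256, 128]))  # base -> gap -> flatten -> bn
#     assert feat.shape == [2, 2048]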
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ppdet.modeling.losses.iou_loss import GIoULoss from .utils import QFLv2 from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'SSODFCOSLoss', 'SSODPPYOLOELoss', ] @register class SSODFCOSLoss(nn.Layer): def __init__(self, loss_weight=1.0): super(SSODFCOSLoss, self).__init__() self.loss_weight = loss_weight def forward(self, student_head_outs, teacher_head_outs, train_cfg): # for semi-det distill student_logits, student_deltas, student_quality = student_head_outs teacher_logits, teacher_deltas, teacher_quality = teacher_head_outs nc = student_logits[0].shape[1] student_logits = paddle.concat( [ _.transpose([0, 2, 3, 1]).reshape([-1, nc]) for _ in student_logits ], axis=0) teacher_logits = paddle.concat( [ _.transpose([0, 2, 3, 1]).reshape([-1, nc]) for _ in teacher_logits ], axis=0) student_deltas = paddle.concat( [ _.transpose([0, 2, 3, 1]).reshape([-1, 4]) for _ in student_deltas ], axis=0) teacher_deltas = paddle.concat( [ _.transpose([0, 2, 3, 1]).reshape([-1, 4]) for _ in teacher_deltas ], axis=0) student_quality = paddle.concat( [ _.transpose([0, 2, 3, 1]).reshape([-1, 1]) for _ in student_quality ], axis=0) teacher_quality = paddle.concat( [ _.transpose([0, 2, 3, 1]).reshape([-1, 1]) for _ in teacher_quality ], axis=0) ratio = train_cfg.get('ratio', 0.01) with paddle.no_grad(): # Region Selection count_num = int(teacher_logits.shape[0] * ratio) teacher_probs = F.sigmoid(teacher_logits) max_vals = paddle.max(teacher_probs, 1) sorted_vals, sorted_inds = paddle.topk(max_vals, teacher_logits.shape[0]) mask = paddle.zeros_like(max_vals) mask[sorted_inds[:count_num]] = 1. 
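            # Region selection: anchors are ranked by the teacher's highest
            # class probability and only the top `ratio` fraction (count_num)
            # get mask 1; fg_num below is the summed teacher confidence of the
            # selected positions and normalises the distillation cls loss.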
fg_num = sorted_vals[:count_num].sum() b_mask = mask > 0 # distill_loss_cls loss_logits = QFLv2( F.sigmoid(student_logits), teacher_probs, weight=mask, reduction="sum") / fg_num # distill_loss_box inputs = paddle.concat( (-student_deltas[b_mask][..., :2], student_deltas[b_mask][..., 2:]), axis=-1) targets = paddle.concat( (-teacher_deltas[b_mask][..., :2], teacher_deltas[b_mask][..., 2:]), axis=-1) iou_loss = GIoULoss(reduction='mean') loss_deltas = iou_loss(inputs, targets) # distill_loss_quality loss_quality = F.binary_cross_entropy( F.sigmoid(student_quality[b_mask]), F.sigmoid(teacher_quality[b_mask]), reduction='mean') return { "distill_loss_cls": loss_logits, "distill_loss_box": loss_deltas, "distill_loss_quality": loss_quality, "fg_sum": fg_num, } @register class SSODPPYOLOELoss(nn.Layer): def __init__(self, loss_weight=1.0): super(SSODPPYOLOELoss, self).__init__() self.loss_weight = loss_weight def forward(self, student_head_outs, teacher_head_outs, train_cfg): # for semi-det distill # student_probs: already sigmoid student_probs, student_deltas, student_dfl = student_head_outs teacher_probs, teacher_deltas, teacher_dfl = teacher_head_outs bs, l, nc = student_probs.shape[:] # bs, l, num_classes bs, l, _, reg_ch = student_dfl.shape[:] # bs, l, 4, reg_ch student_probs = student_probs.reshape([-1, nc]) teacher_probs = teacher_probs.reshape([-1, nc]) student_deltas = student_deltas.reshape([-1, 4]) teacher_deltas = teacher_deltas.reshape([-1, 4]) student_dfl = student_dfl.reshape([-1, 4, reg_ch]) teacher_dfl = teacher_dfl.reshape([-1, 4, reg_ch]) ratio = train_cfg.get('ratio', 0.01) # for contrast loss curr_iter = train_cfg['curr_iter'] st_iter = train_cfg['st_iter'] if curr_iter == st_iter + 1: # start semi-det training self.queue_ptr = 0 self.queue_size = int(bs * l * ratio) self.queue_feats = paddle.zeros([self.queue_size, nc]) self.queue_probs = paddle.zeros([self.queue_size, nc]) contrast_loss_cfg = train_cfg['contrast_loss'] temperature = contrast_loss_cfg.get('temperature', 0.2) alpha = contrast_loss_cfg.get('alpha', 0.9) smooth_iter = contrast_loss_cfg.get('smooth_iter', 100) + st_iter with paddle.no_grad(): # Region Selection count_num = int(teacher_probs.shape[0] * ratio) max_vals = paddle.max(teacher_probs, 1) sorted_vals, sorted_inds = paddle.topk(max_vals, teacher_probs.shape[0]) mask = paddle.zeros_like(max_vals) mask[sorted_inds[:count_num]] = 1. fg_num = sorted_vals[:count_num].sum() b_mask = mask > 0. 
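            # The block below adds a memory bank: queue_feats/queue_probs hold
            # the most recent selected teacher probabilities (FIFO via
            # queue_ptr), and after `smooth_iter` iterations each pseudo-label
            # is blended with bank entries weighted by similarity (the
            # "memory-smoothing" matrix A) before the pseudo-label graph Q is
            # built for the contrastive term.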
# for contrast loss probs = teacher_probs[b_mask].detach() if curr_iter > smooth_iter: # memory-smoothing A = paddle.exp( paddle.mm(teacher_probs[b_mask], self.queue_probs.t()) / temperature) A = A / A.sum(1, keepdim=True) probs = alpha * probs + (1 - alpha) * paddle.mm( A, self.queue_probs) n = student_probs[b_mask].shape[0] # update memory bank self.queue_feats[self.queue_ptr:self.queue_ptr + n, :] = teacher_probs[b_mask].detach() self.queue_probs[self.queue_ptr:self.queue_ptr + n, :] = teacher_probs[b_mask].detach() self.queue_ptr = (self.queue_ptr + n) % self.queue_size # embedding similarity sim = paddle.exp( paddle.mm(student_probs[b_mask], teacher_probs[b_mask].t()) / 0.2) sim_probs = sim / sim.sum(1, keepdim=True) # pseudo-label graph with self-loop Q = paddle.mm(probs, probs.t()) Q.fill_diagonal_(1) pos_mask = (Q >= 0.5).astype('float32') Q = Q * pos_mask Q = Q / Q.sum(1, keepdim=True) # contrastive loss loss_contrast = -(paddle.log(sim_probs + 1e-7) * Q).sum(1) loss_contrast = loss_contrast.mean() # distill_loss_cls loss_cls = QFLv2( student_probs, teacher_probs, weight=mask, reduction="sum") / fg_num # distill_loss_iou inputs = paddle.concat( (-student_deltas[b_mask][..., :2], student_deltas[b_mask][..., 2:]), -1) targets = paddle.concat( (-teacher_deltas[b_mask][..., :2], teacher_deltas[b_mask][..., 2:]), -1) iou_loss = GIoULoss(reduction='mean') loss_iou = iou_loss(inputs, targets) # distill_loss_dfl loss_dfl = F.cross_entropy( student_dfl[b_mask].reshape([-1, reg_ch]), teacher_dfl[b_mask].reshape([-1, reg_ch]), soft_label=True, reduction='mean') return { "distill_loss_cls": loss_cls, "distill_loss_iou": loss_iou, "distill_loss_dfl": loss_dfl, "distill_loss_contrast": loss_contrast, "fg_sum": fg_num, } ================================================ FILE: ppdet/modeling/ssod/utils.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
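# A toy sketch of QFLv2 (defined below): positions whose weight is > 0 are
# pulled toward the teacher's sigmoid score, all remaining valid positions
# toward zero, with both terms modulated by |target - pred| ** beta as in
# Quality Focal Loss. The tensors here are illustrative values only.
def _demo_qflv2():
    import paddle
    pred = paddle.to_tensor([[0.90, 0.20], [0.30, 0.10]])
    teacher = paddle.to_tensor([[0.80, 0.10], [0.40, 0.20]])
    weight = paddle.to_tensor([1.0, 0.0])  # only the first row is "foreground"
    return QFLv2(pred, teacher, weight=weight, reduction='mean')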
import paddle
import paddle.nn.functional as F


def align_weak_strong_shape(data_weak, data_strong):
    max_shape_x = max(data_strong['image'].shape[2],
                      data_weak['image'].shape[2])
    max_shape_y = max(data_strong['image'].shape[3],
                      data_weak['image'].shape[3])

    scale_x_s = max_shape_x / data_strong['image'].shape[2]
    scale_y_s = max_shape_y / data_strong['image'].shape[3]
    scale_x_w = max_shape_x / data_weak['image'].shape[2]
    scale_y_w = max_shape_y / data_weak['image'].shape[3]
    target_size = [max_shape_x, max_shape_y]

    if scale_x_s != 1 or scale_y_s != 1:
        data_strong['image'] = F.interpolate(
            data_strong['image'],
            size=target_size,
            mode='bilinear',
            align_corners=False)
        if 'gt_bbox' in data_strong:
            gt_bboxes = data_strong['gt_bbox'].numpy()
            for i in range(len(gt_bboxes)):
                if len(gt_bboxes[i]) > 0:
                    gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x_s
                    gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y_s
            data_strong['gt_bbox'] = paddle.to_tensor(gt_bboxes)

    if scale_x_w != 1 or scale_y_w != 1:
        data_weak['image'] = F.interpolate(
            data_weak['image'],
            size=target_size,
            mode='bilinear',
            align_corners=False)
        if 'gt_bbox' in data_weak:
            gt_bboxes = data_weak['gt_bbox'].numpy()
            for i in range(len(gt_bboxes)):
                if len(gt_bboxes[i]) > 0:
                    gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x_w
                    gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y_w
            data_weak['gt_bbox'] = paddle.to_tensor(gt_bboxes)
    return data_weak, data_strong


def QFLv2(pred_sigmoid, teacher_sigmoid, weight=None, beta=2.0,
          reduction='mean'):
    pt = pred_sigmoid
    zerolabel = paddle.zeros_like(pt)
    loss = F.binary_cross_entropy(
        pred_sigmoid, zerolabel, reduction='none') * pt.pow(beta)
    pos = weight > 0
    pt = teacher_sigmoid[pos] - pred_sigmoid[pos]
    loss[pos] = F.binary_cross_entropy(
        pred_sigmoid[pos], teacher_sigmoid[pos],
        reduction='none') * pt.pow(beta)
    valid = weight >= 0
    if reduction == "mean":
        loss = loss[valid].mean()
    elif reduction == "sum":
        loss = loss[valid].sum()
    return loss


def filter_invalid(bbox, label=None, score=None, thr=0.0, min_size=0):
    # Filter pseudo boxes by score threshold, then by minimum box size.
    # Guard against the default score=None before touching the tensor.
    if score is not None and score.numel() > 0:
        soft_score = score.max(-1)
        valid = soft_score >= thr
        bbox = bbox[valid]
        if label is not None:
            label = label[valid]
        score = score[valid]
    if min_size is not None and bbox.shape[0] > 0:
        bw = bbox[:, 2]
        bh = bbox[:, 3]
        valid = (bw > min_size) & (bh > min_size)
        bbox = bbox[valid]
        if label is not None:
            label = label[valid]
        if score is not None:
            score = score[valid]
    return bbox, label, score


================================================
FILE: ppdet/modeling/tests/__init__.py
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

================================================
FILE: ppdet/modeling/tests/test_architectures.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import unittest import ppdet class TestFasterRCNN(unittest.TestCase): def setUp(self): self.set_config() def set_config(self): self.cfg_file = 'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml' def test_trainer(self): # Trainer __init__ will build model and DataLoader # 'train' and 'eval' mode include dataset loading # use 'test' mode to simplify tests cfg = ppdet.core.workspace.load_config(self.cfg_file) trainer = ppdet.engine.Trainer(cfg, mode='test') class TestMaskRCNN(TestFasterRCNN): def set_config(self): self.cfg_file = 'configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.yml' class TestCascadeRCNN(TestFasterRCNN): def set_config(self): self.cfg_file = 'configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.yml' class TestYolov3(TestFasterRCNN): def set_config(self): self.cfg_file = 'configs/yolov3/yolov3_darknet53_270e_coco.yml' class TestSSD(TestFasterRCNN): def set_config(self): self.cfg_file = 'configs/ssd/ssd_vgg16_300_240e_voc.yml' class TestGFL(TestFasterRCNN): def set_config(self): self.cfg_file = 'configs/gfl/gfl_r50_fpn_1x_coco.yml' class TestPicoDet(TestFasterRCNN): def set_config(self): self.cfg_file = 'configs/picodet/picodet_s_320_coco_lcnet.yml' if __name__ == '__main__': unittest.main() ================================================ FILE: ppdet/modeling/tests/test_base.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
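# Usage note (assuming a PaddleDetection-style checkout with the configs/
# directory available): the smoke tests in this package can be run from the
# repo root, e.g.
#     python -m unittest ppdet.modeling.tests.test_architectures -v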
from __future__ import print_function import unittest import contextlib import paddle from paddle.static import Program class LayerTest(unittest.TestCase): @classmethod def setUpClass(cls): cls.seed = 111 @classmethod def tearDownClass(cls): pass def _get_place(self, force_to_use_cpu=False): # this option for ops that only have cpu kernel if force_to_use_cpu: return 'cpu' else: return paddle.device.get_device() @contextlib.contextmanager def static_graph(self): paddle.enable_static() scope = paddle.static.Scope() program = Program() with paddle.static.scope_guard(scope): with paddle.static.program_guard(program): paddle.seed(self.seed) paddle.framework.random._manual_program_seed(self.seed) yield def get_static_graph_result(self, feed, fetch_list, with_lod=False, force_to_use_cpu=False): exe = paddle.static.Executor(self._get_place(force_to_use_cpu)) exe.run(paddle.static.default_startup_program()) return exe.run(paddle.static.default_main_program(), feed=feed, fetch_list=fetch_list, return_numpy=(not with_lod)) @contextlib.contextmanager def dynamic_graph(self, force_to_use_cpu=False): paddle.disable_static() place = self._get_place(force_to_use_cpu=force_to_use_cpu) paddle.device.set_device(place) paddle.seed(self.seed) paddle.framework.random._manual_program_seed(self.seed) yield ================================================ FILE: ppdet/modeling/tests/test_mstest.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import unittest from ppdet.core.workspace import load_config from ppdet.engine import Trainer class TestMultiScaleInference(unittest.TestCase): def setUp(self): self.set_config() def set_config(self): self.mstest_cfg_file = 'configs/faster_rcnn/faster_rcnn_r34_fpn_multiscaletest_1x_coco.yml' # test evaluation with multi scale test def test_eval_mstest(self): cfg = load_config(self.mstest_cfg_file) trainer = Trainer(cfg, mode='eval') cfg.weights = 'https://paddledet.bj.bcebos.com/models/faster_rcnn_r34_fpn_1x_coco.pdparams' trainer.load_weights(cfg.weights) trainer.evaluate() # test inference with multi scale test def test_infer_mstest(self): cfg = load_config(self.mstest_cfg_file) trainer = Trainer(cfg, mode='test') cfg.weights = 'https://paddledet.bj.bcebos.com/models/faster_rcnn_r34_fpn_1x_coco.pdparams' trainer.load_weights(cfg.weights) tests_img_root = os.path.join(os.path.dirname(__file__), 'imgs') # input images to predict imgs = [ 'coco2017_val2017_000000000139.jpg', 'coco2017_val2017_000000000724.jpg' ] imgs = [os.path.join(tests_img_root, img) for img in imgs] trainer.predict( imgs, draw_threshold=0.5, output_dir='output', save_results=False) if __name__ == '__main__': unittest.main() ================================================ FILE: ppdet/modeling/tests/test_ops.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import print_function import os, sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 4))) if parent_path not in sys.path: sys.path.append(parent_path) import unittest import numpy as np import paddle import ppdet.modeling.ops as ops from ppdet.modeling.tests.test_base import LayerTest def make_rois(h, w, rois_num, output_size): rois = np.zeros((0, 4)).astype('float32') for roi_num in rois_num: roi = np.zeros((roi_num, 4)).astype('float32') roi[:, 0] = np.random.randint(0, h - output_size[0], size=roi_num) roi[:, 1] = np.random.randint(0, w - output_size[1], size=roi_num) roi[:, 2] = np.random.randint(roi[:, 0] + output_size[0], h) roi[:, 3] = np.random.randint(roi[:, 1] + output_size[1], w) rois = np.vstack((rois, roi)) return rois def softmax(x): # clip to shiftx, otherwise, when calc loss with # log(exp(shiftx)), may get log(0)=INF shiftx = (x - np.max(x)).clip(-64.) 
exps = np.exp(shiftx) return exps / np.sum(exps) class TestROIAlign(LayerTest): def test_roi_align(self): b, c, h, w = 2, 12, 20, 20 inputs_np = np.random.rand(b, c, h, w).astype('float32') rois_num = [4, 6] output_size = (7, 7) rois_np = make_rois(h, w, rois_num, output_size) rois_num_np = np.array(rois_num).astype('int32') with self.static_graph(): inputs = paddle.static.data( name='inputs', shape=[b, c, h, w], dtype='float32') rois = paddle.static.data( name='rois', shape=[10, 4], dtype='float32') rois_num = paddle.static.data( name='rois_num', shape=[None], dtype='int32') output = paddle.vision.ops.roi_align( x=inputs, boxes=rois, boxes_num=rois_num, output_size=output_size) output_np, = self.get_static_graph_result( feed={ 'inputs': inputs_np, 'rois': rois_np, 'rois_num': rois_num_np }, fetch_list=output, with_lod=False) with self.dynamic_graph(): inputs_dy = paddle.to_tensor(inputs_np) rois_dy = paddle.to_tensor(rois_np) rois_num_dy = paddle.to_tensor(rois_num_np) output_dy = paddle.vision.ops.roi_align( x=inputs_dy, boxes=rois_dy, boxes_num=rois_num_dy, output_size=output_size) output_dy_np = output_dy.numpy() self.assertTrue(np.array_equal(output_np, output_dy_np)) def test_roi_align_error(self): with self.static_graph(): inputs = paddle.static.data( name='inputs', shape=[2, 12, 20, 20], dtype='float32') rois = paddle.static.data( name='data_error', shape=[10, 4], dtype='int32', lod_level=1) self.assertRaises( TypeError, paddle.vision.ops.roi_align, input=inputs, rois=rois, output_size=(7, 7)) paddle.disable_static() class TestROIPool(LayerTest): def test_roi_pool(self): b, c, h, w = 2, 12, 20, 20 inputs_np = np.random.rand(b, c, h, w).astype('float32') rois_num = [4, 6] output_size = (7, 7) rois_np = make_rois(h, w, rois_num, output_size) rois_num_np = np.array(rois_num).astype('int32') with self.static_graph(): inputs = paddle.static.data( name='inputs', shape=[b, c, h, w], dtype='float32') rois = paddle.static.data( name='rois', shape=[10, 4], dtype='float32') rois_num = paddle.static.data( name='rois_num', shape=[None], dtype='int32') output = paddle.vision.ops.roi_pool( x=inputs, boxes=rois, boxes_num=rois_num, output_size=output_size) output_np, = self.get_static_graph_result( feed={ 'inputs': inputs_np, 'rois': rois_np, 'rois_num': rois_num_np }, fetch_list=[output], with_lod=False) with self.dynamic_graph(): inputs_dy = paddle.to_tensor(inputs_np) rois_dy = paddle.to_tensor(rois_np) rois_num_dy = paddle.to_tensor(rois_num_np) output_dy = paddle.vision.ops.roi_pool( x=inputs_dy, boxes=rois_dy, boxes_num=rois_num_dy, output_size=output_size) output_dy_np = output_dy.numpy() self.assertTrue(np.array_equal(output_np, output_dy_np)) def test_roi_pool_error(self): with self.static_graph(): inputs = paddle.static.data( name='inputs', shape=[2, 12, 20, 20], dtype='float32') rois = paddle.static.data( name='data_error', shape=[10, 4], dtype='int32', lod_level=1) self.assertRaises( TypeError, paddle.vision.ops.roi_pool, input=inputs, rois=rois, output_size=(7, 7)) paddle.disable_static() class TestPriorBox(LayerTest): def test_prior_box(self): input_np = np.random.rand(2, 10, 32, 32).astype('float32') image_np = np.random.rand(2, 10, 40, 40).astype('float32') min_sizes = [2, 4] with self.static_graph(): input = paddle.static.data( name='input', shape=[2, 10, 32, 32], dtype='float32') image = paddle.static.data( name='image', shape=[2, 10, 40, 40], dtype='float32') box, var = ops.prior_box( input=input, image=image, min_sizes=min_sizes, clip=True, flip=True) box_np, var_np = 
self.get_static_graph_result( feed={ 'input': input_np, 'image': image_np, }, fetch_list=[box, var], with_lod=False) with self.dynamic_graph(): inputs_dy = paddle.to_tensor(input_np) image_dy = paddle.to_tensor(image_np) box_dy, var_dy = ops.prior_box( input=inputs_dy, image=image_dy, min_sizes=min_sizes, clip=True, flip=True) box_dy_np = box_dy.numpy() var_dy_np = var_dy.numpy() self.assertTrue(np.array_equal(box_np, box_dy_np)) self.assertTrue(np.array_equal(var_np, var_dy_np)) def test_prior_box_error(self): with self.static_graph(): input = paddle.static.data( name='input', shape=[2, 10, 32, 32], dtype='int32') image = paddle.static.data( name='image', shape=[2, 10, 40, 40], dtype='int32') self.assertRaises( TypeError, ops.prior_box, input=input, image=image, min_sizes=[2, 4], clip=True, flip=True) paddle.disable_static() class TestMulticlassNms(LayerTest): def test_multiclass_nms(self): boxes_np = np.random.rand(10, 81, 4).astype('float32') scores_np = np.random.rand(10, 81).astype('float32') rois_num_np = np.array([2, 8]).astype('int32') with self.static_graph(): boxes = paddle.static.data( name='bboxes', shape=[None, 81, 4], dtype='float32', lod_level=1) scores = paddle.static.data( name='scores', shape=[None, 81], dtype='float32', lod_level=1) rois_num = paddle.static.data( name='rois_num', shape=[None], dtype='int32') output = ops.multiclass_nms( bboxes=boxes, scores=scores, background_label=0, score_threshold=0.5, nms_top_k=400, nms_threshold=0.3, keep_top_k=200, normalized=False, return_index=True, rois_num=rois_num) out_np, index_np, nms_rois_num_np = self.get_static_graph_result( feed={ 'bboxes': boxes_np, 'scores': scores_np, 'rois_num': rois_num_np }, fetch_list=output, with_lod=True) out_np = np.array(out_np) index_np = np.array(index_np) nms_rois_num_np = np.array(nms_rois_num_np) with self.dynamic_graph(): boxes_dy = paddle.to_tensor(boxes_np) scores_dy = paddle.to_tensor(scores_np) rois_num_dy = paddle.to_tensor(rois_num_np) out_dy, index_dy, nms_rois_num_dy = ops.multiclass_nms( bboxes=boxes_dy, scores=scores_dy, background_label=0, score_threshold=0.5, nms_top_k=400, nms_threshold=0.3, keep_top_k=200, normalized=False, return_index=True, rois_num=rois_num_dy) out_dy_np = out_dy.numpy() index_dy_np = index_dy.numpy() nms_rois_num_dy_np = nms_rois_num_dy.numpy() self.assertTrue(np.array_equal(out_np, out_dy_np)) self.assertTrue(np.array_equal(index_np, index_dy_np)) self.assertTrue(np.array_equal(nms_rois_num_np, nms_rois_num_dy_np)) def test_multiclass_nms_error(self): with self.static_graph(): boxes = paddle.static.data( name='bboxes', shape=[81, 4], dtype='float32', lod_level=1) scores = paddle.static.data( name='scores', shape=[81], dtype='float32', lod_level=1) rois_num = paddle.static.data( name='rois_num', shape=[40, 41], dtype='int32') self.assertRaises( TypeError, ops.multiclass_nms, boxes=boxes, scores=scores, background_label=0, score_threshold=0.5, nms_top_k=400, nms_threshold=0.3, keep_top_k=200, normalized=False, return_index=True, rois_num=rois_num) class TestMatrixNMS(LayerTest): def test_matrix_nms(self): N, M, C = 7, 1200, 21 BOX_SIZE = 4 nms_top_k = 400 keep_top_k = 200 score_threshold = 0.01 post_threshold = 0. 
scores_np = np.random.random((N * M, C)).astype('float32') scores_np = np.apply_along_axis(softmax, 1, scores_np) scores_np = np.reshape(scores_np, (N, M, C)) scores_np = np.transpose(scores_np, (0, 2, 1)) boxes_np = np.random.random((N, M, BOX_SIZE)).astype('float32') boxes_np[:, :, 0:2] = boxes_np[:, :, 0:2] * 0.5 boxes_np[:, :, 2:4] = boxes_np[:, :, 2:4] * 0.5 + 0.5 with self.static_graph(): boxes = paddle.static.data( name='boxes', shape=[N, M, BOX_SIZE], dtype='float32') scores = paddle.static.data( name='scores', shape=[N, C, M], dtype='float32') out, index, _ = ops.matrix_nms( bboxes=boxes, scores=scores, score_threshold=score_threshold, post_threshold=post_threshold, nms_top_k=nms_top_k, keep_top_k=keep_top_k, return_index=True) out_np, index_np = self.get_static_graph_result( feed={'boxes': boxes_np, 'scores': scores_np}, fetch_list=[out, index], with_lod=True) with self.dynamic_graph(): boxes_dy = paddle.to_tensor(boxes_np) scores_dy = paddle.to_tensor(scores_np) out_dy, index_dy, _ = ops.matrix_nms( bboxes=boxes_dy, scores=scores_dy, score_threshold=score_threshold, post_threshold=post_threshold, nms_top_k=nms_top_k, keep_top_k=keep_top_k, return_index=True) out_dy_np = out_dy.numpy() index_dy_np = index_dy.numpy() self.assertTrue(np.array_equal(out_np, out_dy_np)) self.assertTrue(np.array_equal(index_np, index_dy_np)) def test_matrix_nms_error(self): with self.static_graph(): bboxes = paddle.static.data( name='bboxes', shape=[7, 1200, 4], dtype='float32') scores = paddle.static.data( name='data_error', shape=[7, 21, 1200], dtype='int32') self.assertRaises( TypeError, ops.matrix_nms, bboxes=bboxes, scores=scores, score_threshold=0.01, post_threshold=0., nms_top_k=400, keep_top_k=200, return_index=True) paddle.disable_static() class TestBoxCoder(LayerTest): def test_box_coder(self): prior_box_np = np.random.random((81, 4)).astype('float32') prior_box_var_np = np.random.random((81, 4)).astype('float32') target_box_np = np.random.random((20, 81, 4)).astype('float32') # static with self.static_graph(): prior_box = paddle.static.data( name='prior_box', shape=[81, 4], dtype='float32') prior_box_var = paddle.static.data( name='prior_box_var', shape=[81, 4], dtype='float32') target_box = paddle.static.data( name='target_box', shape=[20, 81, 4], dtype='float32') boxes = ops.box_coder( prior_box=prior_box, prior_box_var=prior_box_var, target_box=target_box, code_type="decode_center_size", box_normalized=False) boxes_np, = self.get_static_graph_result( feed={ 'prior_box': prior_box_np, 'prior_box_var': prior_box_var_np, 'target_box': target_box_np, }, fetch_list=[boxes], with_lod=False) # dygraph with self.dynamic_graph(): prior_box_dy = paddle.to_tensor(prior_box_np) prior_box_var_dy = paddle.to_tensor(prior_box_var_np) target_box_dy = paddle.to_tensor(target_box_np) boxes_dy = ops.box_coder( prior_box=prior_box_dy, prior_box_var=prior_box_var_dy, target_box=target_box_dy, code_type="decode_center_size", box_normalized=False) boxes_dy_np = boxes_dy.numpy() self.assertTrue(np.array_equal(boxes_np, boxes_dy_np)) def test_box_coder_error(self): with self.static_graph(): prior_box = paddle.static.data( name='prior_box', shape=[81, 4], dtype='int32') prior_box_var = paddle.static.data( name='prior_box_var', shape=[81, 4], dtype='float32') target_box = paddle.static.data( name='target_box', shape=[20, 81, 4], dtype='float32') self.assertRaises(TypeError, ops.box_coder, prior_box, prior_box_var, target_box) paddle.disable_static() if __name__ == '__main__': unittest.main() 
================================================ FILE: ppdet/modeling/tests/test_yolov3_loss.py ================================================ # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import division import unittest import paddle import paddle.nn.functional as F # add python path of PaddleDetection to sys.path import os import sys parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 4))) if parent_path not in sys.path: sys.path.append(parent_path) from ppdet.modeling.losses import YOLOv3Loss from ppdet.data.transform.op_helper import jaccard_overlap from ppdet.modeling.bbox_utils import iou_similarity import numpy as np np.random.seed(0) def _split_output(output, an_num, num_classes): """ Split output feature map to x, y, w, h, objectness, classification along channel dimension """ x = paddle.strided_slice( output, axes=[1], starts=[0], ends=[output.shape[1]], strides=[5 + num_classes]) y = paddle.strided_slice( output, axes=[1], starts=[1], ends=[output.shape[1]], strides=[5 + num_classes]) w = paddle.strided_slice( output, axes=[1], starts=[2], ends=[output.shape[1]], strides=[5 + num_classes]) h = paddle.strided_slice( output, axes=[1], starts=[3], ends=[output.shape[1]], strides=[5 + num_classes]) obj = paddle.strided_slice( output, axes=[1], starts=[4], ends=[output.shape[1]], strides=[5 + num_classes]) clss = [] stride = output.shape[1] // an_num for m in range(an_num): clss.append( paddle.slice( output, axes=[1], starts=[stride * m + 5], ends=[stride * m + 5 + num_classes])) cls = paddle.transpose(paddle.stack(clss, axis=1), perm=[0, 1, 3, 4, 2]) return (x, y, w, h, obj, cls) def _split_target(target): """ split target to x, y, w, h, objectness, classification along dimension 2 target is in shape [N, an_num, 6 + class_num, H, W] """ tx = target[:, :, 0, :, :] ty = target[:, :, 1, :, :] tw = target[:, :, 2, :, :] th = target[:, :, 3, :, :] tscale = target[:, :, 4, :, :] tobj = target[:, :, 5, :, :] tcls = paddle.transpose(target[:, :, 6:, :, :], perm=[0, 1, 3, 4, 2]) tcls.stop_gradient = True return (tx, ty, tw, th, tscale, tobj, tcls) def _calc_obj_loss(output, obj, tobj, gt_box, batch_size, anchors, num_classes, downsample, ignore_thresh, scale_x_y): # A prediction bbox overlap any gt_bbox over ignore_thresh, # objectness loss will be ignored, process as follows: # 1. get pred bbox, which is same with YOLOv3 infer mode, use yolo_box here # NOTE: img_size is set as 1.0 to get noramlized pred bbox bbox, prob = paddle.vision.ops.yolo_box( x=output, img_size=paddle.ones( shape=[batch_size, 2], dtype="int32"), anchors=anchors, class_num=num_classes, conf_thresh=0., downsample_ratio=downsample, clip_bbox=False, scale_x_y=scale_x_y) # 2. 
split pred bbox and gt bbox by sample, calculate IoU between pred bbox # and gt bbox in each sample if batch_size > 1: preds = paddle.split(bbox, batch_size, axis=0) gts = paddle.split(gt_box, batch_size, axis=0) else: preds = [bbox] gts = [gt_box] probs = [prob] ious = [] for pred, gt in zip(preds, gts): def box_xywh2xyxy(box): x = box[:, 0] y = box[:, 1] w = box[:, 2] h = box[:, 3] return paddle.stack( [ x - w / 2., y - h / 2., x + w / 2., y + h / 2., ], axis=1) pred = paddle.squeeze(pred, axis=[0]) gt = box_xywh2xyxy(paddle.squeeze(gt, axis=[0])) ious.append(iou_similarity(pred, gt)) iou = paddle.stack(ious, axis=0) # 3. Get iou_mask by IoU between gt bbox and prediction bbox, # Get obj_mask by tobj(holds gt_score), calculate objectness loss max_iou = paddle.max(iou, axis=-1) iou_mask = paddle.cast(max_iou <= ignore_thresh, dtype="float32") output_shape = output.shape an_num = len(anchors) // 2 iou_mask = paddle.reshape(iou_mask, (-1, an_num, output_shape[2], output_shape[3])) iou_mask.stop_gradient = True # NOTE: tobj holds gt_score, obj_mask holds object existence mask obj_mask = paddle.cast(tobj > 0., dtype="float32") obj_mask.stop_gradient = True # For positive objectness grids, objectness loss should be calculated # For negative objectness grids, objectness loss is calculated only iou_mask == 1.0 obj_sigmoid = F.sigmoid(obj) loss_obj = F.binary_cross_entropy(obj_sigmoid, obj_mask, reduction='none') loss_obj_pos = paddle.sum(loss_obj * tobj, axis=[1, 2, 3]) loss_obj_neg = paddle.sum(loss_obj * (1.0 - obj_mask) * iou_mask, axis=[1, 2, 3]) return loss_obj_pos, loss_obj_neg def fine_grained_loss(output, target, gt_box, batch_size, num_classes, anchors, ignore_thresh, downsample, scale_x_y=1., eps=1e-10): an_num = len(anchors) // 2 x, y, w, h, obj, cls = _split_output(output, an_num, num_classes) tx, ty, tw, th, tscale, tobj, tcls = _split_target(target) tscale_tobj = tscale * tobj scale_x_y = scale_x_y if (abs(scale_x_y - 1.0) < eps): x = F.sigmoid(x) y = F.sigmoid(y) loss_x = F.binary_cross_entropy(x, tx, reduction='none') * tscale_tobj loss_x = paddle.sum(loss_x, axis=[1, 2, 3]) loss_y = F.binary_cross_entropy(y, ty, reduction='none') * tscale_tobj loss_y = paddle.sum(loss_y, axis=[1, 2, 3]) else: dx = scale_x_y * F.sigmoid(x) - 0.5 * (scale_x_y - 1.0) dy = scale_x_y * F.sigmoid(y) - 0.5 * (scale_x_y - 1.0) loss_x = paddle.abs(dx - tx) * tscale_tobj loss_x = paddle.sum(loss_x, axis=[1, 2, 3]) loss_y = paddle.abs(dy - ty) * tscale_tobj loss_y = paddle.sum(loss_y, axis=[1, 2, 3]) # NOTE: we refined loss function of (w, h) as L1Loss loss_w = paddle.abs(w - tw) * tscale_tobj loss_w = paddle.sum(loss_w, axis=[1, 2, 3]) loss_h = paddle.abs(h - th) * tscale_tobj loss_h = paddle.sum(loss_h, axis=[1, 2, 3]) loss_obj_pos, loss_obj_neg = _calc_obj_loss( output, obj, tobj, gt_box, batch_size, anchors, num_classes, downsample, ignore_thresh, scale_x_y) cls = F.sigmoid(cls) loss_cls = F.binary_cross_entropy(cls, tcls, reduction='none') tobj = paddle.unsqueeze(tobj, axis=-1) loss_cls = paddle.multiply(loss_cls, tobj) loss_cls = paddle.sum(loss_cls, axis=[1, 2, 3, 4]) loss_xys = paddle.mean(loss_x + loss_y) loss_whs = paddle.mean(loss_w + loss_h) loss_objs = paddle.mean(loss_obj_pos + loss_obj_neg) loss_clss = paddle.mean(loss_cls) losses_all = { "loss_xy": paddle.sum(loss_xys), "loss_wh": paddle.sum(loss_whs), "loss_loc": paddle.sum(loss_xys) + paddle.sum(loss_whs), "loss_obj": paddle.sum(loss_objs), "loss_cls": paddle.sum(loss_clss), } return losses_all, x, y, tx, ty def gt2yolotarget(gt_bbox, 
                  gt_class, gt_score, anchors, mask, num_classes, size,
                  stride):
    grid_h, grid_w = size
    h, w = grid_h * stride, grid_w * stride
    an_hw = np.array(anchors) / np.array([[w, h]])
    target = np.zeros(
        (len(mask), 6 + num_classes, grid_h, grid_w), dtype=np.float32)
    for b in range(gt_bbox.shape[0]):
        gx, gy, gw, gh = gt_bbox[b, :]
        cls = gt_class[b]
        score = gt_score[b]
        if gw <= 0. or gh <= 0. or score <= 0.:
            continue

        # find the best matching anchor index
        best_iou = 0.
        best_idx = -1
        for an_idx in range(an_hw.shape[0]):
            iou = jaccard_overlap(
                [0., 0., gw, gh],
                [0., 0., an_hw[an_idx, 0], an_hw[an_idx, 1]])
            if iou > best_iou:
                best_iou = iou
                best_idx = an_idx

        gi = int(gx * grid_w)
        gj = int(gy * grid_h)

        # the gt box should be regressed in this layer if the best matching
        # anchor index is in this layer's anchor mask
        if best_idx in mask:
            best_n = mask.index(best_idx)

            # x, y, w, h, scale
            target[best_n, 0, gj, gi] = gx * grid_w - gi
            target[best_n, 1, gj, gi] = gy * grid_h - gj
            target[best_n, 2, gj, gi] = np.log(gw * w / anchors[best_idx][0])
            target[best_n, 3, gj, gi] = np.log(gh * h / anchors[best_idx][1])
            target[best_n, 4, gj, gi] = 2.0 - gw * gh

            # objectness records gt_score
            # if target[best_n, 5, gj, gi] > 0:
            #     print('find 1 duplicate')
            target[best_n, 5, gj, gi] = score

            # classification
            target[best_n, 6 + cls, gj, gi] = 1.

    return target


class TestYolov3LossOp(unittest.TestCase):
    def setUp(self):
        self.initTestCase()
        x = np.random.uniform(0, 1, self.x_shape).astype('float64')
        gtbox = np.random.random(size=self.gtbox_shape).astype('float64')
        gtlabel = np.random.randint(0, self.class_num, self.gtbox_shape[:2])
        gtmask = np.random.randint(0, 2, self.gtbox_shape[:2])
        gtbox = gtbox * gtmask[:, :, np.newaxis]
        gtlabel = gtlabel * gtmask
        gtscore = np.ones(self.gtbox_shape[:2]).astype('float64')
        if self.gtscore:
            gtscore = np.random.random(self.gtbox_shape[:2]).astype('float64')
        target = []
        for box, label, score in zip(gtbox, gtlabel, gtscore):
            target.append(
                gt2yolotarget(box, label, score, self.anchors,
                              self.anchor_mask, self.class_num,
                              (self.h, self.w), self.downsample_ratio))
        self.target = np.array(target).astype('float64')

        self.mask_anchors = []
        for i in self.anchor_mask:
            self.mask_anchors.extend(self.anchors[i])
        self.x = x
        self.gtbox = gtbox
        self.gtlabel = gtlabel
        self.gtscore = gtscore

    def initTestCase(self):
        self.b = 8
        self.h = 19
        self.w = 19
        self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45],
                        [59, 119], [116, 90], [156, 198], [373, 326]]
        self.anchor_mask = [6, 7, 8]
        self.na = len(self.anchor_mask)
        self.class_num = 80
        self.ignore_thresh = 0.7
        self.downsample_ratio = 32
        self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num),
                        self.h, self.w)
        self.gtbox_shape = (self.b, 40, 4)
        self.gtscore = True
        self.use_label_smooth = False
        self.scale_x_y = 1.
def test_loss(self): x, gtbox, gtlabel, gtscore, target = self.x, self.gtbox, self.gtlabel, self.gtscore, self.target yolo_loss = YOLOv3Loss( ignore_thresh=self.ignore_thresh, label_smooth=self.use_label_smooth, num_classes=self.class_num, downsample=self.downsample_ratio, scale_x_y=self.scale_x_y) x = paddle.to_tensor(x.astype(np.float32)) gtbox = paddle.to_tensor(gtbox.astype(np.float32)) gtlabel = paddle.to_tensor(gtlabel.astype(np.float32)) gtscore = paddle.to_tensor(gtscore.astype(np.float32)) t = paddle.to_tensor(target.astype(np.float32)) anchor = [self.anchors[i] for i in self.anchor_mask] (yolo_loss1, px, py, tx, ty) = fine_grained_loss( output=x, target=t, gt_box=gtbox, batch_size=self.b, num_classes=self.class_num, anchors=self.mask_anchors, ignore_thresh=self.ignore_thresh, downsample=self.downsample_ratio, scale_x_y=self.scale_x_y) yolo_loss2 = yolo_loss.yolov3_loss( x, t, gtbox, anchor, self.downsample_ratio, self.scale_x_y) for k in yolo_loss2: self.assertAlmostEqual( float(yolo_loss1[k]), float(yolo_loss2[k]), delta=1e-2, msg=k) class TestYolov3LossNoGTScore(TestYolov3LossOp): def initTestCase(self): self.b = 1 self.h = 76 self.w = 76 self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]] self.anchor_mask = [0, 1, 2] self.na = len(self.anchor_mask) self.class_num = 80 self.ignore_thresh = 0.7 self.downsample_ratio = 8 self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num), self.h, self.w) self.gtbox_shape = (self.b, 40, 4) self.gtscore = False self.use_label_smooth = False self.scale_x_y = 1. class TestYolov3LossWithScaleXY(TestYolov3LossOp): def initTestCase(self): self.b = 5 self.h = 38 self.w = 38 self.anchors = [[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]] self.anchor_mask = [3, 4, 5] self.na = len(self.anchor_mask) self.class_num = 80 self.ignore_thresh = 0.7 self.downsample_ratio = 16 self.x_shape = (self.b, len(self.anchor_mask) * (5 + self.class_num), self.h, self.w) self.gtbox_shape = (self.b, 40, 4) self.gtscore = True self.use_label_smooth = False self.scale_x_y = 1.2 if __name__ == "__main__": unittest.main() ================================================ FILE: ppdet/modeling/transformers/__init__.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import detr_transformer from . import utils from . import matchers from . import position_encoding from . import deformable_transformer from . import dino_transformer from . import group_detr_transformer from . import mask_dino_transformer from . import rtdetr_transformer from . import hybrid_encoder from . import mask_rtdetr_transformer from . import rtdetr_transformerv2 from . 
import rtdetr_transformerv3 from .detr_transformer import * from .utils import * from .matchers import * from .position_encoding import * from .deformable_transformer import * from .dino_transformer import * from .petr_transformer import * from .group_detr_transformer import * from .mask_dino_transformer import * from .rtdetr_transformer import * from .hybrid_encoder import * from .mask_rtdetr_transformer import * from .rtdetr_transformerv2 import * from .rtdetr_transformerv3 import * ================================================ FILE: ppdet/modeling/transformers/deformable_transformer.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from ppdet.core.workspace import register from ..layers import MultiHeadAttention from .position_encoding import PositionEmbedding from .utils import _get_clones, get_valid_ratio from ..initializer import linear_init_, constant_, xavier_uniform_, normal_ __all__ = ['DeformableTransformer'] class MSDeformableAttention(nn.Layer): def __init__(self, embed_dim=256, num_heads=8, num_levels=4, num_points=4, lr_mult=0.1): """ Multi-Scale Deformable Attention Module """ super(MSDeformableAttention, self).__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.num_levels = num_levels self.num_points = num_points self.total_points = num_heads * num_levels * num_points self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.sampling_offsets = nn.Linear( embed_dim, self.total_points * 2, weight_attr=ParamAttr(learning_rate=lr_mult), bias_attr=ParamAttr(learning_rate=lr_mult)) self.attention_weights = nn.Linear(embed_dim, self.total_points) self.value_proj = nn.Linear(embed_dim, embed_dim) self.output_proj = nn.Linear(embed_dim, embed_dim) try: # use cuda op from deformable_detr_ops import ms_deformable_attn except: # use paddle func from .utils import deformable_attention_core_func as ms_deformable_attn self.ms_deformable_attn_core = ms_deformable_attn self._reset_parameters() def _reset_parameters(self): # sampling_offsets constant_(self.sampling_offsets.weight) thetas = paddle.arange( self.num_heads, dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile( [1, self.num_levels, self.num_points, 1]) scaling = paddle.arange( 1, self.num_points + 1, dtype=paddle.float32).reshape([1, 1, -1, 1]) grid_init *= scaling 
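        # (editorial note, not in the original source) After the loop above,
        # each attention head's initial offsets point along a distinct
        # direction of the unit circle, scaled by the point index
        # 1..num_points, so the k-th sampling point of every head starts k
        # cells away from the reference point; the bias below stores this as
        # the starting state before training.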
self.sampling_offsets.bias.set_value(grid_init.flatten()) # attention_weights constant_(self.attention_weights.weight) constant_(self.attention_weights.bias) # proj xavier_uniform_(self.value_proj.weight) constant_(self.value_proj.bias) xavier_uniform_(self.output_proj.weight) constant_(self.output_proj.bias) def forward(self, query, reference_points, value, value_spatial_shapes, value_level_start_index, value_mask=None): """ Args: query (Tensor): [bs, query_length, C] reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area value (Tensor): [bs, value_length, C] value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements Returns: output (Tensor): [bs, Length_{query}, C] """ bs, Len_q = query.shape[:2] Len_v = value.shape[1] assert int(value_spatial_shapes.prod(1).sum()) == Len_v value = self.value_proj(value) if value_mask is not None: value_mask = value_mask.astype(value.dtype).unsqueeze(-1) value *= value_mask value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) sampling_offsets = self.sampling_offsets(query).reshape( [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) attention_weights = self.attention_weights(query).reshape( [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) attention_weights = F.softmax(attention_weights).reshape( [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) if reference_points.shape[-1] == 2: offset_normalizer = value_spatial_shapes.flip([1]).reshape( [1, 1, 1, self.num_levels, 1, 2]) sampling_locations = reference_points.reshape([ bs, Len_q, 1, self.num_levels, 1, 2 ]) + sampling_offsets / offset_normalizer.astype(sampling_offsets.dtype) elif reference_points.shape[-1] == 4: sampling_locations = ( reference_points[:, :, None, :, None, :2] + sampling_offsets / self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5) else: raise ValueError( "Last dim of reference_points must be 2 or 4, but get {} instead.". 
format(reference_points.shape[-1])) output = self.ms_deformable_attn_core( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) output = self.output_proj(output) return output class DeformableTransformerEncoderLayer(nn.Layer): def __init__(self, d_model=256, n_head=8, dim_feedforward=1024, dropout=0.1, activation="relu", n_levels=4, n_points=4, lr_mult=0.1, weight_attr=None, bias_attr=None): super(DeformableTransformerEncoderLayer, self).__init__() # self attention self.self_attn = MSDeformableAttention(d_model, n_head, n_levels, n_points, lr_mult) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm( d_model, weight_attr=weight_attr, bias_attr=bias_attr) # ffn self.linear1 = nn.Linear(d_model, dim_feedforward) self.activation = getattr(F, activation) self.dropout2 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.dropout3 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm( d_model, weight_attr=weight_attr, bias_attr=bias_attr) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) xavier_uniform_(self.linear1.weight) xavier_uniform_(self.linear2.weight) def with_pos_embed(self, tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, src): src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) src = src + self.dropout3(src2) src = self.norm2(src) return src def forward(self, src, reference_points, spatial_shapes, level_start_index, src_mask=None, query_pos_embed=None): # self attention src2 = self.self_attn( self.with_pos_embed(src, query_pos_embed), reference_points, src, spatial_shapes, level_start_index, src_mask) src = src + self.dropout1(src2) src = self.norm1(src) # ffn src = self.forward_ffn(src) return src class DeformableTransformerEncoder(nn.Layer): def __init__(self, encoder_layer, num_layers): super(DeformableTransformerEncoder, self).__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers @staticmethod def get_reference_points(spatial_shapes, valid_ratios, offset=0.5): valid_ratios = valid_ratios.unsqueeze(1) reference_points = [] for i, (H, W) in enumerate(spatial_shapes): ref_y, ref_x = paddle.meshgrid( paddle.arange(end=H) + offset, paddle.arange(end=W) + offset) ref_y = ref_y.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 1] * H) ref_x = ref_x.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 0] * W) reference_points.append(paddle.stack((ref_x, ref_y), axis=-1)) reference_points = paddle.concat(reference_points, 1).unsqueeze(2) reference_points = reference_points * valid_ratios return reference_points def forward(self, feat, spatial_shapes, level_start_index, feat_mask=None, query_pos_embed=None, valid_ratios=None): if valid_ratios is None: valid_ratios = paddle.ones( [feat.shape[0], spatial_shapes.shape[0], 2]) reference_points = self.get_reference_points(spatial_shapes, valid_ratios) for layer in self.layers: feat = layer(feat, reference_points, spatial_shapes, level_start_index, feat_mask, query_pos_embed) return feat class DeformableTransformerDecoderLayer(nn.Layer): def __init__(self, d_model=256, n_head=8, dim_feedforward=1024, dropout=0.1, activation="relu", n_levels=4, n_points=4, lr_mult=0.1, weight_attr=None, bias_attr=None): super(DeformableTransformerDecoderLayer, self).__init__() # self attention self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm( d_model, 
weight_attr=weight_attr, bias_attr=bias_attr) # cross attention self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, n_points, lr_mult) self.dropout2 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm( d_model, weight_attr=weight_attr, bias_attr=bias_attr) # ffn self.linear1 = nn.Linear(d_model, dim_feedforward) self.activation = getattr(F, activation) self.dropout3 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.dropout4 = nn.Dropout(dropout) self.norm3 = nn.LayerNorm( d_model, weight_attr=weight_attr, bias_attr=bias_attr) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) xavier_uniform_(self.linear1.weight) xavier_uniform_(self.linear2.weight) def with_pos_embed(self, tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, tgt): tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) tgt = tgt + self.dropout4(tgt2) tgt = self.norm3(tgt) return tgt def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask=None, query_pos_embed=None): # self attention q = k = self.with_pos_embed(tgt, query_pos_embed) tgt2 = self.self_attn(q, k, value=tgt) tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) # cross attention tgt2 = self.cross_attn( self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask) tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) # ffn tgt = self.forward_ffn(tgt) return tgt class DeformableTransformerDecoder(nn.Layer): def __init__(self, decoder_layer, num_layers, return_intermediate=False): super(DeformableTransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.num_layers = num_layers self.return_intermediate = return_intermediate def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask=None, query_pos_embed=None): output = tgt intermediate = [] for lid, layer in enumerate(self.layers): output = layer(output, reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask, query_pos_embed) if self.return_intermediate: intermediate.append(output) if self.return_intermediate: return paddle.stack(intermediate) return output.unsqueeze(0) @register class DeformableTransformer(nn.Layer): __shared__ = ['hidden_dim'] def __init__(self, num_queries=300, position_embed_type='sine', return_intermediate_dec=True, in_feats_channel=[512, 1024, 2048], num_feature_levels=4, num_encoder_points=4, num_decoder_points=4, hidden_dim=256, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0.1, activation="relu", lr_mult=0.1, pe_temperature=10000, pe_offset=-0.5): super(DeformableTransformer, self).__init__() assert position_embed_type in ['sine', 'learned'], \ f'ValueError: position_embed_type not supported {position_embed_type}!' 
        assert len(in_feats_channel) <= num_feature_levels

        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.num_feature_levels = num_feature_levels

        encoder_layer = DeformableTransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            num_feature_levels, num_encoder_points, lr_mult)
        self.encoder = DeformableTransformerEncoder(encoder_layer,
                                                    num_encoder_layers)

        decoder_layer = DeformableTransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            num_feature_levels, num_decoder_points)
        self.decoder = DeformableTransformerDecoder(
            decoder_layer, num_decoder_layers, return_intermediate_dec)

        self.level_embed = nn.Embedding(num_feature_levels, hidden_dim)
        self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)

        self.reference_points = nn.Linear(
            hidden_dim,
            2,
            weight_attr=ParamAttr(learning_rate=lr_mult),
            bias_attr=ParamAttr(learning_rate=lr_mult))

        self.input_proj = nn.LayerList()
        for in_channels in in_feats_channel:
            self.input_proj.append(
                nn.Sequential(
                    nn.Conv2D(
                        in_channels, hidden_dim, kernel_size=1),
                    nn.GroupNorm(32, hidden_dim)))
        in_channels = in_feats_channel[-1]
        for _ in range(num_feature_levels - len(in_feats_channel)):
            self.input_proj.append(
                nn.Sequential(
                    nn.Conv2D(
                        in_channels,
                        hidden_dim,
                        kernel_size=3,
                        stride=2,
                        padding=1),
                    nn.GroupNorm(32, hidden_dim)))
            in_channels = hidden_dim

        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            temperature=pe_temperature,
            normalize=True if position_embed_type == 'sine' else False,
            embed_type=position_embed_type,
            offset=pe_offset,
            eps=1e-4)

        self._reset_parameters()

    def _reset_parameters(self):
        normal_(self.level_embed.weight)
        normal_(self.tgt_embed.weight)
        normal_(self.query_pos_embed.weight)
        xavier_uniform_(self.reference_points.weight)
        constant_(self.reference_points.bias)
        for l in self.input_proj:
            xavier_uniform_(l[0].weight)
            constant_(l[0].bias)

    @classmethod
    def from_config(cls, cfg, input_shape):
        return {'in_feats_channel': [i.channels for i in input_shape], }

    def forward(self, src_feats, src_mask=None, *args, **kwargs):
        srcs = []
        for i in range(len(src_feats)):
            srcs.append(self.input_proj[i](src_feats[i]))
        if self.num_feature_levels > len(srcs):
            len_srcs = len(srcs)
            for i in range(len_srcs, self.num_feature_levels):
                if i == len_srcs:
                    srcs.append(self.input_proj[i](src_feats[-1]))
                else:
                    srcs.append(self.input_proj[i](srcs[-1]))
        src_flatten = []
        mask_flatten = []
        lvl_pos_embed_flatten = []
        spatial_shapes = []
        valid_ratios = []
        for level, src in enumerate(srcs):
            src_shape = paddle.shape(src)
            bs = src_shape[0:1]
            h = src_shape[2:3]
            w = src_shape[3:4]
            spatial_shapes.append(paddle.concat([h, w]))
            src = src.flatten(2).transpose([0, 2, 1])
            src_flatten.append(src)
            if src_mask is not None:
                mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]
            else:
                mask = paddle.ones([bs, h, w])
            valid_ratios.append(get_valid_ratio(mask))
            pos_embed = self.position_embedding(mask).flatten(1, 2)
            lvl_pos_embed = pos_embed + self.level_embed.weight[level]
            lvl_pos_embed_flatten.append(lvl_pos_embed)
            mask = mask.flatten(1)
            mask_flatten.append(mask)
        src_flatten = paddle.concat(src_flatten, 1)
        mask_flatten = None if src_mask is None else paddle.concat(
            mask_flatten, 1)
        lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)
        # [l, 2]
        spatial_shapes = paddle.to_tensor(
            paddle.stack(spatial_shapes).astype('int64'))
        # [l], the start index of each level in the flattened sequence
        level_start_index = paddle.concat([
            paddle.zeros(
                [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]
        ])
        # [b, l, 2]
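        # (editorial note, not in the original source) valid_ratios[b, l]
        # holds the fraction of non-padded width/height of level l for sample
        # b; with no padding mask it is all ones, so reference points below
        # are left unscaled.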
valid_ratios = paddle.stack(valid_ratios, 1) # encoder memory = self.encoder(src_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios) # prepare input for decoder bs, _, c = memory.shape query_embed = self.query_pos_embed.weight.unsqueeze(0).tile([bs, 1, 1]) tgt = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) reference_points = F.sigmoid(self.reference_points(query_embed)) reference_points_input = reference_points.unsqueeze( 2) * valid_ratios.unsqueeze(1) # decoder hs = self.decoder(tgt, reference_points_input, memory, spatial_shapes, level_start_index, mask_flatten, query_embed) return (hs, memory, reference_points) class QRDeformableTransformerDecoder(DeformableTransformerDecoder): def __init__(self, decoder_layer, num_layers, start_q=None, end_q=None, return_intermediate=False): super(QRDeformableTransformerDecoder, self).__init__( decoder_layer, num_layers, return_intermediate=return_intermediate) self.start_q = start_q self.end_q = end_q def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask=None, query_pos_embed=None): if not self.training: return super(QRDeformableTransformerDecoder, self).forward( tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask=memory_mask, query_pos_embed=query_pos_embed) batchsize = tgt.shape[0] query_list_reserve = [tgt] intermediate = [] for lid, layer in enumerate(self.layers): start_q = self.start_q[lid] end_q = self.end_q[lid] query_list = query_list_reserve.copy()[start_q:end_q] # prepare for parallel process output = paddle.concat(query_list, axis=0) fakesetsize = int(output.shape[0] / batchsize) reference_points_tiled = reference_points.tile([fakesetsize, 1, 1, 1]) memory_tiled = memory.tile([fakesetsize, 1, 1]) query_pos_embed_tiled = query_pos_embed.tile([fakesetsize, 1, 1]) memory_mask_tiled = memory_mask.tile([fakesetsize, 1]) output = layer(output, reference_points_tiled, memory_tiled, memory_spatial_shapes, memory_level_start_index, memory_mask_tiled, query_pos_embed_tiled) for i in range(fakesetsize): query_list_reserve.append(output[batchsize*i:batchsize*(i+1)]) if self.return_intermediate: for i in range(fakesetsize): intermediate.append(output[batchsize*i:batchsize*(i+1)]) if self.return_intermediate: return paddle.stack(intermediate) return output.unsqueeze(0) @register class QRDeformableTransformer(DeformableTransformer): def __init__(self, num_queries=300, position_embed_type='sine', return_intermediate_dec=True, in_feats_channel=[512, 1024, 2048], num_feature_levels=4, num_encoder_points=4, num_decoder_points=4, hidden_dim=256, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0.1, activation="relu", lr_mult=0.1, pe_temperature=10000, pe_offset=-0.5, start_q=None, end_q=None): super(QRDeformableTransformer, self).__init__( num_queries=num_queries, position_embed_type=position_embed_type, return_intermediate_dec=return_intermediate_dec, in_feats_channel=in_feats_channel, num_feature_levels=num_feature_levels, num_encoder_points=num_encoder_points, num_decoder_points=num_decoder_points, hidden_dim=hidden_dim, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, activation=activation, lr_mult=lr_mult, pe_temperature=pe_temperature, pe_offset=pe_offset) decoder_layer = DeformableTransformerDecoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_feature_levels, 
num_decoder_points) self.decoder = QRDeformableTransformerDecoder( decoder_layer, num_decoder_layers, start_q, end_q, return_intermediate_dec) ================================================ FILE: ppdet/modeling/transformers/detr_transformer.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..layers import MultiHeadAttention, _convert_attention_mask from .position_encoding import PositionEmbedding from .utils import _get_clones from ..initializer import linear_init_, conv_init_, xavier_uniform_, normal_ __all__ = ['DETRTransformer'] class TransformerEncoderLayer(nn.Layer): def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False): super(TransformerEncoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) @staticmethod def with_pos_embed(tensor, pos_embed): return tensor if pos_embed is None else tensor + pos_embed def forward(self, src, src_mask=None, pos_embed=None): residual = src if self.normalize_before: src = self.norm1(src) q = k = self.with_pos_embed(src, pos_embed) src = self.self_attn(q, k, value=src, attn_mask=src_mask) src = residual + self.dropout1(src) if not self.normalize_before: src = self.norm1(src) residual = src if self.normalize_before: src = self.norm2(src) src = self.linear2(self.dropout(self.activation(self.linear1(src)))) src = residual + self.dropout2(src) if not self.normalize_before: src = self.norm2(src) return src class TransformerEncoder(nn.Layer): def __init__(self, encoder_layer, num_layers, norm=None): super(TransformerEncoder, self).__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers self.norm = norm def forward(self, src, src_mask=None, pos_embed=None): output = src for layer in self.layers: output = layer(output, src_mask=src_mask, 
pos_embed=pos_embed) if self.norm is not None: output = self.norm(output) return output class TransformerDecoderLayer(nn.Layer): def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False): super(TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.norm3 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) @staticmethod def with_pos_embed(tensor, pos_embed): return tensor if pos_embed is None else tensor + pos_embed def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, pos_embed=None, query_pos_embed=None): tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) residual = tgt if self.normalize_before: tgt = self.norm1(tgt) q = k = self.with_pos_embed(tgt, query_pos_embed) tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask) tgt = residual + self.dropout1(tgt) if not self.normalize_before: tgt = self.norm1(tgt) residual = tgt if self.normalize_before: tgt = self.norm2(tgt) q = self.with_pos_embed(tgt, query_pos_embed) k = self.with_pos_embed(memory, pos_embed) tgt = self.cross_attn(q, k, value=memory, attn_mask=memory_mask) tgt = residual + self.dropout2(tgt) if not self.normalize_before: tgt = self.norm2(tgt) residual = tgt if self.normalize_before: tgt = self.norm3(tgt) tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) tgt = residual + self.dropout3(tgt) if not self.normalize_before: tgt = self.norm3(tgt) return tgt class TransformerDecoder(nn.Layer): def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): super(TransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.num_layers = num_layers self.norm = norm self.return_intermediate = return_intermediate def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, pos_embed=None, query_pos_embed=None): tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) output = tgt intermediate = [] for layer in self.layers: output = layer( output, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, pos_embed=pos_embed, query_pos_embed=query_pos_embed) if self.return_intermediate: intermediate.append(self.norm(output)) if self.norm is not None: output = self.norm(output) if self.return_intermediate: return paddle.stack(intermediate) return output.unsqueeze(0) @register class DETRTransformer(nn.Layer): __shared__ = ['hidden_dim'] def __init__(self, num_queries=100, position_embed_type='sine', return_intermediate_dec=True, backbone_num_channels=2048, hidden_dim=256, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, activation="relu", pe_temperature=10000, pe_offset=0., 
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(DETRTransformer, self).__init__()
        assert position_embed_type in ['sine', 'learned'],\
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        self.hidden_dim = hidden_dim
        self.nhead = nhead

        encoder_layer = TransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        encoder_norm = nn.LayerNorm(hidden_dim) if normalize_before else None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
                                          encoder_norm)
        decoder_layer = TransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        decoder_norm = nn.LayerNorm(hidden_dim)
        self.decoder = TransformerDecoder(
            decoder_layer,
            num_decoder_layers,
            decoder_norm,
            return_intermediate=return_intermediate_dec)

        self.input_proj = nn.Conv2D(
            backbone_num_channels, hidden_dim, kernel_size=1)
        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)
        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            temperature=pe_temperature,
            normalize=True if position_embed_type == 'sine' else False,
            embed_type=position_embed_type,
            offset=pe_offset)

        self._reset_parameters()

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
        conv_init_(self.input_proj)
        normal_(self.query_pos_embed.weight)

    @classmethod
    def from_config(cls, cfg, input_shape):
        return {
            'backbone_num_channels': [i.channels for i in input_shape][-1],
        }

    def _convert_attention_mask(self, mask):
        return (mask - 1.0) * 1e9

    def forward(self, src, src_mask=None, *args, **kwargs):
        r"""
        Applies a Transformer model on the inputs.

        Parameters:
            src (List(Tensor)): Backbone feature maps with shape [[bs, c, h, w]].
            src_mask (Tensor, optional): A tensor used in multi-head attention
                to prevent attention to unwanted positions, usually the
                paddings or the subsequent positions, with shape [bs, H, W].
                When the data type is bool, unwanted positions have `False`
                values and the rest have `True` values. When the data type is
                int, unwanted positions have 0 values and the rest have 1
                values. When the data type is float, unwanted positions have
                `-INF` values and the rest have 0 values. It can be None when
                no position needs to be masked out. Default None.
Returns: output (Tensor): [num_levels, batch_size, num_queries, hidden_dim] memory (Tensor): [batch_size, hidden_dim, h, w] """ # use last level feature map src_proj = self.input_proj(src[-1]) bs, c, h, w = src_proj.shape # flatten [B, C, H, W] to [B, HxW, C] src_flatten = src_proj.flatten(2).transpose([0, 2, 1]) if src_mask is not None: src_mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0] else: src_mask = paddle.ones([bs, h, w]) pos_embed = self.position_embedding(src_mask).flatten(1, 2) if self.training: src_mask = self._convert_attention_mask(src_mask) src_mask = src_mask.reshape([bs, 1, 1, h * w]) else: src_mask = None memory = self.encoder( src_flatten, src_mask=src_mask, pos_embed=pos_embed) query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile( [bs, 1, 1]) tgt = paddle.zeros_like(query_pos_embed) output = self.decoder( tgt, memory, memory_mask=src_mask, pos_embed=pos_embed, query_pos_embed=query_pos_embed) if self.training: src_mask = src_mask.reshape([bs, 1, 1, h, w]) else: src_mask = None return (output, memory.transpose([0, 2, 1]).reshape([bs, c, h, w]), src_proj, src_mask) ================================================ FILE: ppdet/modeling/transformers/dino_transformer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Modified from detrex (https://github.com/IDEA-Research/detrex) # Copyright 2022 The IDEA Authors. All rights reserved. 
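The mask convention in the docstring above is easiest to see with a tiny example. Below is a standalone sketch (not repo code) of what `DETRTransformer._convert_attention_mask` does during training: a 0/1 mask becomes an additive float mask, so padded positions get roughly -1e9 added to their attention logits and receive near-zero attention weight after the softmax.

```
# Standalone illustration (not part of this repo) of the 0/1 mask ->
# additive float mask conversion used by DETRTransformer._convert_attention_mask.
import paddle
import paddle.nn.functional as F

mask = paddle.to_tensor([[1., 1., 0.]])  # last position is padding
additive = (mask - 1.0) * 1e9            # -> [[0., 0., -1e9]]

logits = paddle.randn([1, 3])
attn = F.softmax(logits + additive, axis=-1)
print(attn)  # the padded position gets ~0 attention weight
```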
================================================ FILE: ppdet/modeling/transformers/dino_transformer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Modified from detrex (https://github.com/IDEA-Research/detrex) # Copyright 2022 The IDEA Authors. All rights reserved. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register from ..layers import MultiHeadAttention from .position_encoding import PositionEmbedding from ..heads.detr_head import MLP from .deformable_transformer import (MSDeformableAttention, DeformableTransformerEncoderLayer, DeformableTransformerEncoder) from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_, bias_init_with_prob) from .utils import (_get_clones, get_valid_ratio, get_contrastive_denoising_training_group, get_sine_pos_embed, inverse_sigmoid) __all__ = ['DINOTransformer'] class DINOTransformerDecoderLayer(nn.Layer): def __init__(self, d_model=256, n_head=8, dim_feedforward=1024, dropout=0., activation="relu", n_levels=4, n_points=4, lr_mult=1.0, weight_attr=None, bias_attr=None): super(DINOTransformerDecoderLayer, self).__init__() # self attention self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm( d_model, weight_attr=weight_attr, bias_attr=bias_attr) # cross attention self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, n_points, lr_mult) self.dropout2 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm( d_model, weight_attr=weight_attr, bias_attr=bias_attr) # ffn self.linear1 = nn.Linear(d_model, dim_feedforward) self.activation = getattr(F, activation) self.dropout3 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.dropout4 = nn.Dropout(dropout) self.norm3 = nn.LayerNorm( d_model, weight_attr=weight_attr, bias_attr=bias_attr) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) xavier_uniform_(self.linear1.weight) xavier_uniform_(self.linear2.weight) def with_pos_embed(self, tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, tgt): return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, attn_mask=None, memory_mask=None, query_pos_embed=None): # self attention q = k = self.with_pos_embed(tgt, query_pos_embed) if attn_mask is not None: attn_mask = paddle.where( attn_mask.astype('bool'), paddle.zeros(attn_mask.shape, tgt.dtype), paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)) tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask) tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) # cross attention tgt2 = self.cross_attn( self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask) tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) # ffn tgt2 = self.forward_ffn(tgt) tgt = tgt + self.dropout4(tgt2) tgt = self.norm3(tgt) return tgt class DINOTransformerDecoder(nn.Layer): def __init__(self, hidden_dim, decoder_layer, num_layers, weight_attr=None, bias_attr=None): super(DINOTransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.hidden_dim = hidden_dim self.num_layers = num_layers self.norm = nn.LayerNorm( hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr) def forward(self, tgt, ref_points_unact, memory, memory_spatial_shapes, memory_level_start_index, bbox_head, query_pos_head, valid_ratios=None, attn_mask=None, memory_mask=None): if
valid_ratios is None: valid_ratios = paddle.ones( [memory.shape[0], memory_spatial_shapes.shape[0], 2]) output = tgt intermediate = [] inter_bboxes = [] ref_points = F.sigmoid(ref_points_unact) for i, layer in enumerate(self.layers): reference_points_input = ref_points.detach().unsqueeze( 2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1) query_pos_embed = get_sine_pos_embed( reference_points_input[..., 0, :], self.hidden_dim // 2) query_pos_embed = query_pos_head(query_pos_embed) output = layer(output, reference_points_input, memory, memory_spatial_shapes, memory_level_start_index, attn_mask, memory_mask, query_pos_embed) ref_points = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( ref_points.detach())) intermediate.append(self.norm(output)) inter_bboxes.append(ref_points) return paddle.stack(intermediate), paddle.stack(inter_bboxes) @register class DINOTransformer(nn.Layer): __shared__ = ['num_classes', 'hidden_dim'] def __init__(self, num_classes=80, hidden_dim=256, num_queries=900, position_embed_type='sine', in_feats_channel=[512, 1024, 2048], num_levels=4, num_encoder_points=4, num_decoder_points=4, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0., activation="relu", lr_mult=1.0, pe_temperature=10000, pe_offset=-0.5, num_denoising=100, label_noise_ratio=0.5, box_noise_scale=1.0, learnt_init_query=True, eps=1e-2): super(DINOTransformer, self).__init__() assert position_embed_type in ['sine', 'learned'], \ f'ValueError: position_embed_type not supported {position_embed_type}!' assert len(in_feats_channel) <= num_levels self.hidden_dim = hidden_dim self.nhead = nhead self.num_levels = num_levels self.num_classes = num_classes self.num_queries = num_queries self.eps = eps self.num_decoder_layers = num_decoder_layers weight_attr = ParamAttr(regularizer=L2Decay(0.0)) bias_attr = ParamAttr(regularizer=L2Decay(0.0)) # backbone feature projection self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr) # Transformer module encoder_layer = DeformableTransformerEncoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_encoder_points, lr_mult, weight_attr, bias_attr) self.encoder = DeformableTransformerEncoder(encoder_layer, num_encoder_layers) decoder_layer = DINOTransformerDecoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_decoder_points, lr_mult, weight_attr, bias_attr) self.decoder = DINOTransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, weight_attr, bias_attr) # denoising part self.denoising_class_embed = nn.Embedding( num_classes, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) self.num_denoising = num_denoising self.label_noise_ratio = label_noise_ratio self.box_noise_scale = box_noise_scale # position embedding self.position_embedding = PositionEmbedding( hidden_dim // 2, temperature=pe_temperature, normalize=True if position_embed_type == 'sine' else False, embed_type=position_embed_type, offset=pe_offset) self.level_embed = nn.Embedding(num_levels, hidden_dim) # decoder embedding self.learnt_init_query = learnt_init_query if learnt_init_query: self.tgt_embed = nn.Embedding(num_queries, hidden_dim) self.query_pos_head = MLP(2 * hidden_dim, hidden_dim, hidden_dim, num_layers=2) # encoder head self.enc_output = nn.Sequential( nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm( hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)) self.enc_score_head = nn.Linear(hidden_dim, num_classes) self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, 
num_layers=3) # decoder head self.dec_score_head = nn.LayerList([ nn.Linear(hidden_dim, num_classes) for _ in range(num_decoder_layers) ]) self.dec_bbox_head = nn.LayerList([ MLP(hidden_dim, hidden_dim, 4, num_layers=3) for _ in range(num_decoder_layers) ]) self._reset_parameters() def _reset_parameters(self): # class and bbox head init bias_cls = bias_init_with_prob(0.01) linear_init_(self.enc_score_head) constant_(self.enc_score_head.bias, bias_cls) constant_(self.enc_bbox_head.layers[-1].weight) constant_(self.enc_bbox_head.layers[-1].bias) for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): linear_init_(cls_) constant_(cls_.bias, bias_cls) constant_(reg_.layers[-1].weight) constant_(reg_.layers[-1].bias) linear_init_(self.enc_output[0]) xavier_uniform_(self.enc_output[0].weight) normal_(self.level_embed.weight) if self.learnt_init_query: xavier_uniform_(self.tgt_embed.weight) xavier_uniform_(self.query_pos_head.layers[0].weight) xavier_uniform_(self.query_pos_head.layers[1].weight) for l in self.input_proj: xavier_uniform_(l[0].weight) constant_(l[0].bias) @classmethod def from_config(cls, cfg, input_shape): return {'in_feats_channel': [i.channels for i in input_shape], } def _build_input_proj_layer(self, in_feats_channel, weight_attr=None, bias_attr=None): self.input_proj = nn.LayerList() for in_channels in in_feats_channel: self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=1)), ( 'norm', nn.GroupNorm( 32, self.hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)))) in_channels = in_feats_channel[-1] for _ in range(self.num_levels - len(in_feats_channel)): self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1)), ('norm', nn.GroupNorm( 32, self.hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)))) in_channels = self.hidden_dim def _get_encoder_input(self, feats, pad_mask=None): # get projection features proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] if self.num_levels > len(proj_feats): len_srcs = len(proj_feats) for i in range(len_srcs, self.num_levels): if i == len_srcs: proj_feats.append(self.input_proj[i](feats[-1])) else: proj_feats.append(self.input_proj[i](proj_feats[-1])) # get encoder inputs feat_flatten = [] mask_flatten = [] lvl_pos_embed_flatten = [] spatial_shapes = [] valid_ratios = [] for i, feat in enumerate(proj_feats): bs, _, h, w = paddle.shape(feat) spatial_shapes.append(paddle.stack([h, w])) # [b,c,h,w] -> [b,h*w,c] feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) if pad_mask is not None: mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0] else: mask = paddle.ones([bs, h, w]) valid_ratios.append(get_valid_ratio(mask)) # [b, h*w, c] pos_embed = self.position_embedding(mask).flatten(1, 2) lvl_pos_embed = pos_embed + self.level_embed.weight[i] lvl_pos_embed_flatten.append(lvl_pos_embed) if pad_mask is not None: # [b, h*w] mask_flatten.append(mask.flatten(1)) # [b, l, c] feat_flatten = paddle.concat(feat_flatten, 1) # [b, l] mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten, 1) # [b, l, c] lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) # [num_levels, 2] spatial_shapes = paddle.to_tensor( paddle.stack(spatial_shapes).astype('int64')) # [l] start index of each level level_start_index = paddle.concat([ paddle.zeros( [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1] ]) # [b, num_levels, 2] valid_ratios = paddle.stack(valid_ratios, 1) return 
(feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios) def forward(self, feats, pad_mask=None, gt_meta=None): # input projection and embedding (feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios) = self._get_encoder_input(feats, pad_mask) # encoder memory = self.encoder(feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios) # prepare denoising training if self.training: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ get_contrastive_denoising_training_group(gt_meta, self.num_classes, self.num_queries, self.denoising_class_embed.weight, self.num_denoising, self.label_noise_ratio, self.box_noise_scale) else: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ self._get_decoder_input( memory, spatial_shapes, mask_flatten, denoising_class, denoising_bbox_unact) # decoder inter_feats, inter_bboxes = self.decoder( target, init_ref_points_unact, memory, spatial_shapes, level_start_index, self.dec_bbox_head, self.query_pos_head, valid_ratios, attn_mask, mask_flatten) out_bboxes = [] out_logits = [] for i in range(self.num_decoder_layers): out_logits.append(self.dec_score_head[i](inter_feats[i])) if i == 0: out_bboxes.append( F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + init_ref_points_unact)) else: out_bboxes.append( F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + inverse_sigmoid(inter_bboxes[i - 1]))) out_bboxes = paddle.stack(out_bboxes) out_logits = paddle.stack(out_logits) return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, dn_meta) def _get_encoder_output_anchors(self, memory, spatial_shapes, memory_mask=None, grid_size=0.05): output_anchors = [] idx = 0 for lvl, (h, w) in enumerate(spatial_shapes): if memory_mask is not None: mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w]) valid_H = paddle.sum(mask_[:, :, 0], 1) valid_W = paddle.sum(mask_[:, 0, :], 1) else: valid_H, valid_W = h, w grid_y, grid_x = paddle.meshgrid( paddle.arange(end=h), paddle.arange(end=w)) grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype) valid_WH = paddle.stack([valid_W, valid_H], -1).reshape( [-1, 1, 1, 2]).astype(grid_xy.dtype) grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) output_anchors.append( paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) idx += h * w output_anchors = paddle.concat(output_anchors, 1) valid_mask = ((output_anchors > self.eps) * (output_anchors < 1 - self.eps)).all(-1, keepdim=True) output_anchors = paddle.log(output_anchors / (1 - output_anchors)) if memory_mask is not None: valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0 output_anchors = paddle.where(valid_mask, output_anchors, paddle.to_tensor(float("inf"))) memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) output_memory = self.enc_output(memory) return output_memory, output_anchors def _get_decoder_input(self, memory, spatial_shapes, memory_mask=None, denoising_class=None, denoising_bbox_unact=None): bs, _, _ = memory.shape # prepare input for decoder output_memory, output_anchors = self._get_encoder_output_anchors( memory, spatial_shapes, memory_mask) enc_outputs_class = self.enc_score_head(output_memory) enc_outputs_coord_unact = self.enc_bbox_head( output_memory) + output_anchors _, topk_ind = paddle.topk( enc_outputs_class.max(-1), self.num_queries, 
axis=1) # extract region proposal boxes batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact, topk_ind) # unsigmoided. enc_topk_bboxes = F.sigmoid(reference_points_unact) if denoising_bbox_unact is not None: reference_points_unact = paddle.concat( [denoising_bbox_unact, reference_points_unact], 1) enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind) # extract region features if self.learnt_init_query: target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) else: target = paddle.gather_nd(output_memory, topk_ind).detach() if denoising_class is not None: target = paddle.concat([denoising_class, target], 1) return target, reference_points_unact.detach(), enc_topk_bboxes, enc_topk_logits

================================================ FILE: ppdet/modeling/transformers/ext_op/README.md ================================================

# Compiling the Multi-scale Deformable Attention Custom Op

This custom op is implemented following the PaddlePaddle guide on [custom external operators](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html).

## 1. Requirements

- Paddle >= 2.3.2
- gcc 8.2

## 2. Installation

Build and install the op from this directory:

```
cd PaddleDetection/ppdet/modeling/transformers/ext_op/
python setup_ms_deformable_attn_op.py install
```

After compilation the op can be used directly. A usage example for `ms_deformable_attn`:

```
import paddle

# import the custom op
from deformable_detr_ops import ms_deformable_attn

# construct fake input tensors
bs, n_heads, c = 2, 8, 8
query_length, n_levels, n_points = 2, 2, 2
spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
level_start_index = paddle.concat((paddle.to_tensor(
    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
value_length = sum([(H * W).item() for H, W in spatial_shapes])

def get_test_tensors(channels):
    value = paddle.rand(
        [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
    sampling_locations = paddle.rand(
        [bs, query_length, n_heads, n_levels, n_points, 2],
        dtype=paddle.float32)
    attention_weights = paddle.rand(
        [bs, query_length, n_heads, n_levels, n_points],
        dtype=paddle.float32) + 1e-5
    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
        -2, keepdim=True)
    return [value, sampling_locations, attention_weights]

value, sampling_locations, attention_weights = get_test_tensors(c)

output = ms_deformable_attn(value, spatial_shapes, level_start_index,
                            sampling_locations, attention_weights)
```
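The op returns a single tensor whose last axis fuses heads and channels, i.e. `[bs, query_length, n_heads * channels]` (this is exactly what `MSDeformableAttnInferShape` in `ms_deformable_attn_op.cc` below declares). A quick sanity check for the example above:

```
# output fuses heads and channels: [bs, query_length, n_heads * c] -> [2, 2, 64]
assert output.shape == [bs, query_length, n_heads * c]
```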
## 3. Unit test

Run the unit test to verify that the custom op computes correct results:

```
python test_ms_deformable_attn_op.py
```

On success, it prints the following:

```
*True check_forward_equal_with_paddle_float: max_abs_err 6.98e-10 max_rel_err 2.03e-07
*tensor1 True check_gradient_numerical(D=30)
*tensor2 True check_gradient_numerical(D=30)
*tensor3 True check_gradient_numerical(D=30)
*tensor1 True check_gradient_numerical(D=32)
*tensor2 True check_gradient_numerical(D=32)
*tensor3 True check_gradient_numerical(D=32)
*tensor1 True check_gradient_numerical(D=64)
*tensor2 True check_gradient_numerical(D=64)
*tensor3 True check_gradient_numerical(D=64)
*tensor1 True check_gradient_numerical(D=71)
*tensor2 True check_gradient_numerical(D=71)
*tensor3 True check_gradient_numerical(D=71)
*tensor1 True check_gradient_numerical(D=128)
*tensor2 True check_gradient_numerical(D=128)
*tensor3 True check_gradient_numerical(D=128)
*tensor1 True check_gradient_numerical(D=1024)
*tensor2 True check_gradient_numerical(D=1024)
*tensor3 True check_gradient_numerical(D=1024)
*tensor1 True check_gradient_numerical(D=1025)
*tensor2 True check_gradient_numerical(D=1025)
*tensor3 True check_gradient_numerical(D=1025)
*tensor1 True check_gradient_numerical(D=2048)
*tensor2 True check_gradient_numerical(D=2048)
*tensor3 True check_gradient_numerical(D=2048)
*tensor1 True check_gradient_numerical(D=3096)
*tensor2 True check_gradient_numerical(D=3096)
*tensor3 True check_gradient_numerical(D=3096)
```
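The unit test (see `test_ms_deformable_attn_op.py` at the end of this section) compares the op against PaddleDetection's pure-Paddle reference, `deformable_attention_core_func` from `ppdet/modeling/transformers/utils.py`. As a rough sketch of what that reference computes — bilinear sampling per feature level via `grid_sample`, followed by an attention-weighted sum over levels and points — here is an illustrative reimplementation; it is not the repo's exact code, and the helper name `ms_deform_attn_ref` is invented for this example:

```
# Illustrative pure-Paddle multi-scale deformable attention (a sketch,
# NOT the repo's deformable_attention_core_func).
import paddle
import paddle.nn.functional as F

def ms_deform_attn_ref(value, spatial_shapes, sampling_locations,
                       attention_weights):
    # value:              [bs, sum(h*w), n_heads, c]
    # spatial_shapes:     [n_levels, 2] int64 (h, w) per level
    # sampling_locations: [bs, Lq, n_heads, n_levels, n_points, 2] in [0, 1]
    # attention_weights:  [bs, Lq, n_heads, n_levels, n_points]
    # returns:            [bs, Lq, n_heads * c]
    bs, _, n_heads, c = value.shape
    _, Lq, _, n_levels, n_points, _ = sampling_locations.shape
    split_sizes = [int((H * W).item()) for H, W in spatial_shapes]
    value_list = paddle.split(value, split_sizes, axis=1)
    # grid_sample expects sampling grids in [-1, 1]
    sampling_grids = 2 * sampling_locations - 1
    sampled = []
    for lvl, (H, W) in enumerate(spatial_shapes):
        h, w = int(H), int(W)
        # [bs, h*w, n_heads, c] -> [bs*n_heads, c, h, w]
        v = value_list[lvl].flatten(2).transpose([0, 2, 1]).reshape(
            [bs * n_heads, c, h, w])
        # [bs, Lq, n_heads, n_points, 2] -> [bs*n_heads, Lq, n_points, 2]
        g = sampling_grids[:, :, :, lvl].transpose([0, 2, 1, 3, 4]).flatten(
            0, 1)
        # bilinear sampling: [bs*n_heads, c, Lq, n_points]
        sampled.append(
            F.grid_sample(v, g, mode='bilinear', padding_mode='zeros',
                          align_corners=False))
    # [bs*n_heads, c, Lq, n_levels*n_points]
    sampled = paddle.stack(sampled, axis=-2).flatten(-2)
    w_ = attention_weights.transpose([0, 2, 1, 3, 4]).reshape(
        [bs * n_heads, 1, Lq, n_levels * n_points])
    out = (sampled * w_).sum(-1).reshape([bs, n_heads * c, Lq])
    return out.transpose([0, 2, 1])
```

Under the tolerances the test uses (`rtol=1e-2`, `atol=1e-3`), the CUDA op and a float32 reference of this form should agree closely.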
================================================ FILE: ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc ================================================

/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/extension.h"
#include <vector>

// declare GPU implementation
std::vector<paddle::Tensor> MSDeformableAttnCUDAForward(
    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
    const paddle::Tensor &value_level_start_index,
    const paddle::Tensor &sampling_locations,
    const paddle::Tensor &attention_weights);

std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(
    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
    const paddle::Tensor &value_level_start_index,
    const paddle::Tensor &sampling_locations,
    const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out);

//// CPU not implemented

std::vector<std::vector<int64_t>> MSDeformableAttnInferShape(
    std::vector<int64_t> value_shape,
    std::vector<int64_t> value_spatial_shapes_shape,
    std::vector<int64_t> value_level_start_index_shape,
    std::vector<int64_t> sampling_locations_shape,
    std::vector<int64_t> attention_weights_shape) {
  return {{value_shape[0], sampling_locations_shape[1],
           value_shape[2] * value_shape[3]}};
}

std::vector<paddle::DataType> MSDeformableAttnInferDtype(
    paddle::DataType value_dtype, paddle::DataType value_spatial_shapes_dtype,
    paddle::DataType value_level_start_index_dtype,
    paddle::DataType sampling_locations_dtype,
    paddle::DataType attention_weights_dtype) {
  return {value_dtype};
}

PD_BUILD_OP(ms_deformable_attn)
    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
             "AttentionWeights"})
    .Outputs({"Out"})
    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward))
    .SetInferShapeFn(PD_INFER_SHAPE(MSDeformableAttnInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(MSDeformableAttnInferDtype));

PD_BUILD_GRAD_OP(ms_deformable_attn)
    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
             "AttentionWeights", paddle::Grad("Out")})
    .Outputs({paddle::Grad("Value"), paddle::Grad("SpatialShapes"),
              paddle::Grad("LevelIndex"), paddle::Grad("SamplingLocations"),
              paddle::Grad("AttentionWeights")})
    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward));
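One thing worth noting before the CUDA implementation below: although `PD_BUILD_GRAD_OP` declares gradient outputs for all five inputs, only the three floating-point inputs receive real gradients; `MSDeformableAttnCUDABackward` allocates `grad_spatial_shapes` and `grad_level_start_index` as zero-filled placeholders and never writes to them. A hypothetical check from Python (assuming the op was built as in the README above; names mirror the README example):

```
# Hypothetical sanity check (assumes deformable_detr_ops was built per the README).
import paddle
from deformable_detr_ops import ms_deformable_attn

bs, n_heads, c = 2, 8, 8
query_length, n_levels, n_points = 2, 2, 2
spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
level_start_index = paddle.concat((paddle.to_tensor(
    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
value_length = sum([(H * W).item() for H, W in spatial_shapes])

value = paddle.rand([bs, value_length, n_heads, c])
sampling_locations = paddle.rand(
    [bs, query_length, n_heads, n_levels, n_points, 2])
attention_weights = paddle.rand(
    [bs, query_length, n_heads, n_levels, n_points])
for t in (value, sampling_locations, attention_weights):
    t.stop_gradient = False

out = ms_deformable_attn(value, spatial_shapes, level_start_index,
                         sampling_locations, attention_weights)
out.sum().backward()
# gradients flow only to the three float inputs
print(value.grad.shape, sampling_locations.grad.shape,
      attention_weights.grad.shape)
```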
*/ #include "paddle/extension.h" #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) const int CUDA_NUM_THREADS = 1024; inline int GET_BLOCKS(const int N, const int num_threads) { return (N + num_threads - 1) / num_threads; } // forward bilinear template __device__ data_t deformable_attn_bilinear_forward( const data_t *&bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const data_t &h, const data_t &w, const int &m, const int &c) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const data_t lh = h - h_low; const data_t lw = w - w_low; const data_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; data_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; } data_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; } data_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; } data_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; } const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } // forward kernel template __global__ void deformable_attn_cuda_kernel_forward( const int n, const data_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const data_t *data_sampling_loc, const data_t *data_attn_weight, const int batch_size, const int value_length, const int num_heads, const int channels, const int num_levels, const int query_length, const int num_points, data_t *output_data_ptr) { CUDA_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % query_length; _temp /= query_length; const int b_col = _temp; data_t *data_ptr = output_data_ptr + index; int data_weight_ptr = sampling_index * num_levels * num_points; int data_loc_w_ptr = data_weight_ptr << 1; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * value_length * qid_stride; data_t col = 0; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const data_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride); for (int p_col = 0; p_col < num_points; ++p_col) { const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const data_t weight = data_attn_weight[data_weight_ptr]; const data_t h_im = loc_h * spatial_h - 0.5; const data_t w_im = loc_w * spatial_w - 0.5; if (h_im > -1 && w_im > -1 && 
h_im < spatial_h && w_im < spatial_w) { col += deformable_attn_bilinear_forward( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight; } data_weight_ptr += 1; data_loc_w_ptr += 2; } } *data_ptr = col; } } #define CHECK_INPUT_GPU(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") // forward std::vector MSDeformableAttnCUDAForward(const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes, const paddle::Tensor &value_level_start_index, const paddle::Tensor &sampling_locations, const paddle::Tensor &attention_weights) { CHECK_INPUT_GPU(value); CHECK_INPUT_GPU(value_spatial_shapes); CHECK_INPUT_GPU(value_level_start_index); CHECK_INPUT_GPU(sampling_locations); CHECK_INPUT_GPU(attention_weights); const int batch_size = value.shape()[0]; const int value_length = value.shape()[1]; const int num_heads = value.shape()[2]; const int channels = value.shape()[3]; const int num_levels = value_spatial_shapes.shape()[0]; const int query_length = sampling_locations.shape()[1]; const int num_points = sampling_locations.shape()[4]; auto output = paddle::full({batch_size, query_length, num_heads * channels}, 0, value.dtype(), paddle::GPUPlace()); const int num_kernels = batch_size * query_length * num_heads * channels; deformable_attn_cuda_kernel_forward <<>>(num_kernels, value.data(), value_spatial_shapes.data(), value_level_start_index.data(), sampling_locations.data(), attention_weights.data(), batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, output.data()); return {output}; } // backward bilinear template __device__ void deformable_attn_bilinear_backward( const data_t *&bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const data_t &h, const data_t &w, const int &m, const int &c, const data_t &top_grad, const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const data_t lh = h - h_low; const data_t lw = w - w_low; const data_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const data_t top_grad_value = top_grad * attn_weight; data_t grad_h_weight = 0, grad_w_weight = 0; data_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_value + ptr1, w1 * top_grad_value); } data_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; atomicAdd(grad_value + ptr2, w2 * top_grad_value); } data_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_value + ptr3, w3 * top_grad_value); } data_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = 
bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_value + ptr4, w4 * top_grad_value); } const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); *grad_attn_weight = top_grad * val; *grad_sampling_loc = width * grad_w_weight * top_grad_value; *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; } template __device__ void deformable_attn_bilinear_backward_gm( const data_t *&bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const data_t &h, const data_t &w, const int &m, const int &c, const data_t &top_grad, const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const data_t lh = h - h_low; const data_t lw = w - w_low; const data_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const data_t top_grad_value = top_grad * attn_weight; data_t grad_h_weight = 0, grad_w_weight = 0; data_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_value + ptr1, w1 * top_grad_value); } data_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; atomicAdd(grad_value + ptr2, w2 * top_grad_value); } data_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_value + ptr3, w3 * top_grad_value); } data_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_value + ptr4, w4 * top_grad_value); } const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); atomicAdd(grad_attn_weight, top_grad * val); atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); } // backward kernels // channels > 1024 template __global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks( const int n, const data_t *grad_col, const data_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const data_t *data_sampling_loc, const data_t *data_attn_weight, const int batch_size, const int value_length, const int num_heads, const int channels, const int num_levels, const int query_length, const int num_points, data_t *grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; data_t *cache_grad_sampling_loc = (data_t *)_s; data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = 
_temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % query_length; _temp /= query_length; const int b_col = _temp; const data_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_points; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * value_length * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const data_t *data_value_ptr = data_value + value_ptr_offset; data_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_points; ++p_col) { const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const data_t weight = data_attn_weight[data_weight_ptr]; const data_t h_im = loc_h * spatial_h - 0.5; const data_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { deformable_attn_bilinear_backward( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; s >>= 1, spre >>= 1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void deformable_attn_cuda_kernel_backward_gm( const int n, const data_t *grad_col, const data_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const data_t *data_sampling_loc, const data_t *data_attn_weight, const int batch_size, const int value_length, const int num_heads, const int channels, const int num_levels, const int query_length, const int num_points, data_t *grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % 
num_heads; _temp /= num_heads; const int q_col = _temp % query_length; _temp /= query_length; const int b_col = _temp; const data_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_points; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * value_length * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const data_t *data_value_ptr = data_value + value_ptr_offset; data_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_points; ++p_col) { const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const data_t weight = data_attn_weight[data_weight_ptr]; const data_t h_im = loc_h * spatial_h - 0.5; const data_t w_im = loc_w * spatial_w - 0.5; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { deformable_attn_bilinear_backward_gm( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, grad_sampling_loc, grad_attn_weight); } data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } // channels <= 1024 template __global__ void deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1( const int n, const data_t *grad_col, const data_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const data_t *data_sampling_loc, const data_t *data_attn_weight, const int batch_size, const int value_length, const int num_heads, const int channels, const int num_levels, const int query_length, const int num_points, data_t *grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { __shared__ data_t cache_grad_sampling_loc[blockSize * 2]; __shared__ data_t cache_grad_attn_weight[blockSize]; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % query_length; _temp /= query_length; const int b_col = _temp; const data_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_points; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * value_length * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; 
const data_t *data_value_ptr = data_value + value_ptr_offset; data_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_points; ++p_col) { const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const data_t weight = data_attn_weight[data_weight_ptr]; const data_t h_im = loc_h * spatial_h - 0.5; const data_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { deformable_attn_bilinear_backward( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); if (tid == 0) { data_t _grad_w = cache_grad_sampling_loc[0], _grad_h = cache_grad_sampling_loc[1], _grad_a = cache_grad_attn_weight[0]; int sid = 2; for (unsigned int tid = 1; tid < blockSize; ++tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[tid]; sid += 2; } *grad_sampling_loc = _grad_w; *(grad_sampling_loc + 1) = _grad_h; *grad_attn_weight = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2( const int n, const data_t *grad_col, const data_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const data_t *data_sampling_loc, const data_t *data_attn_weight, const int batch_size, const int value_length, const int num_heads, const int channels, const int num_levels, const int query_length, const int num_points, data_t *grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { __shared__ data_t cache_grad_sampling_loc[blockSize * 2]; __shared__ data_t cache_grad_attn_weight[blockSize]; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % query_length; _temp /= query_length; const int b_col = _temp; const data_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_points; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * value_length * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const data_t *data_value_ptr = data_value + value_ptr_offset; data_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_points; ++p_col) { const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; 
const data_t weight = data_attn_weight[data_weight_ptr]; const data_t h_im = loc_h * spatial_h - 0.5; const data_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { deformable_attn_bilinear_backward( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; } __syncthreads(); } if (tid == 0) { *grad_sampling_loc = cache_grad_sampling_loc[0]; *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v1( const int n, const data_t *grad_col, const data_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const data_t *data_sampling_loc, const data_t *data_attn_weight, const int batch_size, const int value_length, const int num_heads, const int channels, const int num_levels, const int query_length, const int num_points, data_t *grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; data_t *cache_grad_sampling_loc = (data_t *)_s; data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % query_length; _temp /= query_length; const int b_col = _temp; const data_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_points; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * value_length * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const data_t *data_value_ptr = data_value + value_ptr_offset; data_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_points; ++p_col) { const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const data_t weight = data_attn_weight[data_weight_ptr]; const data_t h_im = loc_h * spatial_h - 0.5; const data_t w_im = loc_w * spatial_w - 0.5; 
*(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { deformable_attn_bilinear_backward( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); if (tid == 0) { data_t _grad_w = cache_grad_sampling_loc[0], _grad_h = cache_grad_sampling_loc[1], _grad_a = cache_grad_attn_weight[0]; int sid = 2; for (unsigned int tid = 1; tid < blockDim.x; ++tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[tid]; sid += 2; } *grad_sampling_loc = _grad_w; *(grad_sampling_loc + 1) = _grad_h; *grad_attn_weight = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2( const int n, const data_t *grad_col, const data_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const data_t *data_sampling_loc, const data_t *data_attn_weight, const int batch_size, const int value_length, const int num_heads, const int channels, const int num_levels, const int query_length, const int num_points, data_t *grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; data_t *cache_grad_sampling_loc = (data_t *)_s; data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % query_length; _temp /= query_length; const int b_col = _temp; const data_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_points; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * value_length * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const data_t *data_value_ptr = data_value + value_ptr_offset; data_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_points; ++p_col) { const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const data_t weight = data_attn_weight[data_weight_ptr]; const data_t h_im = loc_h * spatial_h - 0.5; const data_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { deformable_attn_bilinear_backward( 
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; s >>= 1, spre >>= 1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { *grad_sampling_loc = cache_grad_sampling_loc[0]; *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } // backward branch template void deformable_attn_cuda_backward( cudaStream_t stream, const data_t *grad_out, const data_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const data_t *data_sampling_loc, const data_t *data_attn_weight, const int batch_size, const int value_length, const int num_heads, const int channels, const int num_levels, const int query_length, const int num_points, data_t *grad_value, data_t *grad_sampling_loc, data_t *grad_attn_weight) { const int num_threads = (channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : channels; const int num_kernels = batch_size * query_length * num_heads * channels; const int num_actual_kernels = batch_size * query_length * num_heads * channels; if (channels > 1024) { if ((channels & 1023) == 0) { deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks <<>>( num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); } else { deformable_attn_cuda_kernel_backward_gm <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); } } else { switch (channels) { case 1: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 2: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 4: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, 
value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 8: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 16: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 32: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 64: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 128: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 256: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 512: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; case 1024: deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); break; default: if (channels < 64) { deformable_attn_cuda_kernel_backward_shm_reduce_v1 <<>>( num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); } else { deformable_attn_cuda_kernel_backward_shm_reduce_v2 <<>>( num_kernels, grad_out, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, value_length, num_heads, channels, num_levels, query_length, num_points, grad_value, grad_sampling_loc, grad_attn_weight); } } } } // backward std::vector 
MSDeformableAttnCUDABackward(const paddle::Tensor &value,
                             const paddle::Tensor &value_spatial_shapes,
                             const paddle::Tensor &value_level_start_index,
                             const paddle::Tensor &sampling_locations,
                             const paddle::Tensor &attention_weights,
                             const paddle::Tensor &grad_out) {
  CHECK_INPUT_GPU(value);
  CHECK_INPUT_GPU(value_spatial_shapes);
  CHECK_INPUT_GPU(value_level_start_index);
  CHECK_INPUT_GPU(sampling_locations);
  CHECK_INPUT_GPU(attention_weights);
  CHECK_INPUT_GPU(grad_out);

  const int batch_size = value.shape()[0];
  const int value_length = value.shape()[1];
  const int num_heads = value.shape()[2];
  const int channels = value.shape()[3];

  const int num_levels = value_spatial_shapes.shape()[0];
  const int query_length = sampling_locations.shape()[1];
  const int num_points = sampling_locations.shape()[4];

  auto grad_value =
      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
  auto grad_spatial_shapes =
      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
  auto grad_level_start_index =
      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
  auto grad_sampling_locations =
      paddle::full(sampling_locations.shape(), 0, sampling_locations.dtype(),
                   paddle::GPUPlace());
  auto grad_attention_weights =
      paddle::full(attention_weights.shape(), 0, attention_weights.dtype(),
                   paddle::GPUPlace());

  PD_DISPATCH_FLOATING_TYPES(
      value.type(), "deformable_attn_cuda_backward", ([&] {
        deformable_attn_cuda_backward<data_t>(
            value.stream(), grad_out.data<data_t>(), value.data<data_t>(),
            value_spatial_shapes.data<int64_t>(),
            value_level_start_index.data<int64_t>(),
            sampling_locations.data<data_t>(),
            attention_weights.data<data_t>(), batch_size, value_length,
            num_heads, channels, num_levels, query_length, num_points,
            grad_value.data<data_t>(), grad_sampling_locations.data<data_t>(),
            grad_attention_weights.data<data_t>());
      }));

  return {grad_value, grad_spatial_shapes, grad_level_start_index,
          grad_sampling_locations, grad_attention_weights};
}

================================================
FILE: ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py
================================================
from paddle.utils.cpp_extension import CUDAExtension, setup

if __name__ == "__main__":
    setup(
        name='deformable_detr_ops',
        ext_modules=CUDAExtension(
            sources=['ms_deformable_attn_op.cc', 'ms_deformable_attn_op.cu']))

================================================
FILE: ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
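# NOTE: this script checks the compiled `ms_deformable_attn` custom op against
# the pure-Paddle reference `deformable_attention_core_func`: forward outputs
# are compared with `paddle.allclose`, and the gradients of the value,
# sampling-location and attention-weight tensors are compared after calling
# `.backward()` on both paths. A typical invocation (the trailing GPU index is
# optional and defaults to 0):
#
#     python setup_ms_deformable_attn_op.py install
#     python test_ms_deformable_attn_op.py 0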
from __future__ import absolute_import from __future__ import print_function from __future__ import division import os import sys import random import numpy as np import paddle # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 5))) if parent_path not in sys.path: sys.path.append(parent_path) from ppdet.modeling.transformers.utils import deformable_attention_core_func ms_deform_attn_core_paddle = deformable_attention_core_func try: gpu_index = int(sys.argv[1]) except: gpu_index = 0 print(f'Use gpu {gpu_index} to test...') paddle.set_device(f'gpu:{gpu_index}') try: from deformable_detr_ops import ms_deformable_attn except Exception as e: print('import deformable_detr_ops error', e) sys.exit(-1) paddle.seed(1) random.seed(1) np.random.seed(1) bs, n_heads, c = 2, 8, 8 query_length, n_levels, n_points = 2, 2, 2 spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64) level_start_index = paddle.concat((paddle.to_tensor( [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1])) value_length = sum([(H * W).item() for H, W in spatial_shapes]) def get_test_tensors(channels): value = paddle.rand( [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01 sampling_locations = paddle.rand( [bs, query_length, n_heads, n_levels, n_points, 2], dtype=paddle.float32) attention_weights = paddle.rand( [bs, query_length, n_heads, n_levels, n_points], dtype=paddle.float32) + 1e-5 attention_weights /= attention_weights.sum(-1, keepdim=True).sum( -2, keepdim=True) return [value, sampling_locations, attention_weights] @paddle.no_grad() def check_forward_equal_with_paddle_float(): value, sampling_locations, attention_weights = get_test_tensors(c) output_paddle = ms_deform_attn_core_paddle( value, spatial_shapes, level_start_index, sampling_locations, attention_weights).detach().cpu() output_cuda = ms_deformable_attn(value, spatial_shapes, level_start_index, sampling_locations, attention_weights).detach().cpu() fwdok = paddle.allclose( output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item() max_abs_err = (output_cuda - output_paddle).abs().max().item() max_rel_err = ( (output_cuda - output_paddle).abs() / output_paddle.abs()).max().item() print( f'*{fwdok} check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}' ) def check_gradient_numerical(channels=4): value_paddle, sampling_locations_paddle, attention_weights_paddle = get_test_tensors( channels) value_paddle.stop_gradient = False sampling_locations_paddle.stop_gradient = False attention_weights_paddle.stop_gradient = False value_cuda = value_paddle.detach().clone() sampling_locations_cuda = sampling_locations_paddle.detach().clone() attention_weights_cuda = attention_weights_paddle.detach().clone() value_cuda.stop_gradient = False sampling_locations_cuda.stop_gradient = False attention_weights_cuda.stop_gradient = False output_paddle = ms_deform_attn_core_paddle( value_paddle, spatial_shapes, level_start_index, sampling_locations_paddle, attention_weights_paddle) output_paddle.sum().backward() output_cuda = ms_deformable_attn(value_cuda, spatial_shapes, level_start_index, sampling_locations_cuda, attention_weights_cuda) output_cuda.sum().backward() res = paddle.allclose( value_paddle.grad, value_cuda.grad, rtol=1e-2, atol=1e-3).item() print(f'*tensor1 {res} check_gradient_numerical(D={channels})') res = paddle.allclose( sampling_locations_paddle.grad, sampling_locations_cuda.grad, rtol=1e-2, atol=1e-3).item() print(f'*tensor2 
{res} check_gradient_numerical(D={channels})') res = paddle.allclose( attention_weights_paddle.grad, attention_weights_cuda.grad, rtol=1e-2, atol=1e-3).item() print(f'*tensor3 {res} check_gradient_numerical(D={channels})') if __name__ == '__main__': check_forward_equal_with_paddle_float() for channels in [30, 32, 64, 71, 128, 1024, 1025, 2048, 3096]: check_gradient_numerical(channels) ================================================ FILE: ppdet/modeling/transformers/group_detr_transformer.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Modified from detrex (https://github.com/IDEA-Research/detrex) # Copyright 2022 The IDEA Authors. All rights reserved. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register from ..layers import MultiHeadAttention from .position_encoding import PositionEmbedding from ..heads.detr_head import MLP from .deformable_transformer import MSDeformableAttention from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_, bias_init_with_prob) from .utils import (_get_clones, get_valid_ratio, get_contrastive_denoising_training_group, get_sine_pos_embed, inverse_sigmoid) __all__ = ['GroupDINOTransformer'] class DINOTransformerEncoderLayer(nn.Layer): def __init__(self, d_model=256, n_head=8, dim_feedforward=1024, dropout=0., activation="relu", n_levels=4, n_points=4, weight_attr=None, bias_attr=None): super(DINOTransformerEncoderLayer, self).__init__() # self attention self.self_attn = MSDeformableAttention(d_model, n_head, n_levels, n_points, 1.0) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # ffn self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, bias_attr) self.activation = getattr(F, activation) self.dropout2 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, bias_attr) self.dropout3 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) xavier_uniform_(self.linear1.weight) xavier_uniform_(self.linear2.weight) def with_pos_embed(self, tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, src): src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) src = src + self.dropout3(src2) src = self.norm2(src) return src def forward(self, src, reference_points, spatial_shapes, 
level_start_index, src_mask=None, query_pos_embed=None): # self attention src2 = self.self_attn( self.with_pos_embed(src, query_pos_embed), reference_points, src, spatial_shapes, level_start_index, src_mask) src = src + self.dropout1(src2) src = self.norm1(src) # ffn src = self.forward_ffn(src) return src class DINOTransformerEncoder(nn.Layer): def __init__(self, encoder_layer, num_layers): super(DINOTransformerEncoder, self).__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers @staticmethod def get_reference_points(spatial_shapes, valid_ratios, offset=0.5): valid_ratios = valid_ratios.unsqueeze(1) reference_points = [] for i, (H, W) in enumerate(spatial_shapes): ref_y, ref_x = paddle.meshgrid( paddle.arange(end=H) + offset, paddle.arange(end=W) + offset) ref_y = ref_y.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 1] * H) ref_x = ref_x.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 0] * W) reference_points.append(paddle.stack((ref_x, ref_y), axis=-1)) reference_points = paddle.concat(reference_points, 1).unsqueeze(2) reference_points = reference_points * valid_ratios return reference_points def forward(self, feat, spatial_shapes, level_start_index, feat_mask=None, query_pos_embed=None, valid_ratios=None): if valid_ratios is None: valid_ratios = paddle.ones( [feat.shape[0], spatial_shapes.shape[0], 2]) reference_points = self.get_reference_points(spatial_shapes, valid_ratios) for layer in self.layers: feat = layer(feat, reference_points, spatial_shapes, level_start_index, feat_mask, query_pos_embed) return feat class DINOTransformerDecoderLayer(nn.Layer): def __init__(self, d_model=256, n_head=8, dim_feedforward=1024, dropout=0., activation="relu", n_levels=4, n_points=4, dual_queries=False, dual_groups=0, weight_attr=None, bias_attr=None): super(DINOTransformerDecoderLayer, self).__init__() # self attention self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # cross attention self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, n_points, 1.0) self.dropout2 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # ffn self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, bias_attr) self.activation = getattr(F, activation) self.dropout3 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, bias_attr) self.dropout4 = nn.Dropout(dropout) self.norm3 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # for dual groups self.dual_queries = dual_queries self.dual_groups = dual_groups self.n_head = n_head self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) xavier_uniform_(self.linear1.weight) xavier_uniform_(self.linear2.weight) def with_pos_embed(self, tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, tgt): return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, attn_mask=None, memory_mask=None, query_pos_embed=None): # self attention q = k = self.with_pos_embed(tgt, query_pos_embed) if self.dual_queries: dual_groups = self.dual_groups bs, num_queries, 
n_model = q.shape q = paddle.concat(q.split(dual_groups + 1, axis=1), axis=0) k = paddle.concat(k.split(dual_groups + 1, axis=1), axis=0) tgt = paddle.concat(tgt.split(dual_groups + 1, axis=1), axis=0) g_num_queries = num_queries // (dual_groups + 1) if attn_mask is None or attn_mask[0] is None: attn_mask = None else: # [(dual_groups + 1), g_num_queries, g_num_queries] attn_mask = paddle.concat( [sa_mask.unsqueeze(0) for sa_mask in attn_mask], axis=0) # [1, (dual_groups + 1), 1, g_num_queries, g_num_queries] # --> [bs, (dual_groups + 1), nhead, g_num_queries, g_num_queries] # --> [bs * (dual_groups + 1), nhead, g_num_queries, g_num_queries] attn_mask = attn_mask.unsqueeze(0).unsqueeze(2).tile( [bs, 1, self.n_head, 1, 1]) attn_mask = attn_mask.reshape([ bs * (dual_groups + 1), self.n_head, g_num_queries, g_num_queries ]) if attn_mask is not None: attn_mask = attn_mask.astype('bool') tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask) tgt = tgt + self.dropout1(tgt2) tgt = self.norm2(tgt) # trace back if self.dual_queries: tgt = paddle.concat(tgt.split(dual_groups + 1, axis=0), axis=1) # cross attention tgt2 = self.cross_attn( self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask) tgt = tgt + self.dropout2(tgt2) tgt = self.norm1(tgt) # ffn tgt2 = self.forward_ffn(tgt) tgt = tgt + self.dropout4(tgt2) tgt = self.norm3(tgt) return tgt class DINOTransformerDecoder(nn.Layer): def __init__(self, hidden_dim, decoder_layer, num_layers, return_intermediate=True): super(DINOTransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.hidden_dim = hidden_dim self.num_layers = num_layers self.return_intermediate = return_intermediate self.norm = nn.LayerNorm( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, bbox_head, query_pos_head, valid_ratios=None, attn_mask=None, memory_mask=None): if valid_ratios is None: valid_ratios = paddle.ones( [memory.shape[0], memory_spatial_shapes.shape[0], 2]) output = tgt intermediate = [] inter_ref_bboxes = [] for i, layer in enumerate(self.layers): reference_points_input = reference_points.unsqueeze( 2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1) query_pos_embed = get_sine_pos_embed( reference_points_input[..., 0, :], self.hidden_dim // 2) query_pos_embed = query_pos_head(query_pos_embed) output = layer(output, reference_points_input, memory, memory_spatial_shapes, memory_level_start_index, attn_mask, memory_mask, query_pos_embed) inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( reference_points)) if self.return_intermediate: intermediate.append(self.norm(output)) inter_ref_bboxes.append(inter_ref_bbox) reference_points = inter_ref_bbox.detach() if self.return_intermediate: return paddle.stack(intermediate), paddle.stack(inter_ref_bboxes) return output, reference_points @register class GroupDINOTransformer(nn.Layer): __shared__ = ['num_classes', 'hidden_dim'] def __init__(self, num_classes=80, hidden_dim=256, num_queries=900, position_embed_type='sine', return_intermediate_dec=True, backbone_feat_channels=[512, 1024, 2048], num_levels=4, num_encoder_points=4, num_decoder_points=4, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0., activation="relu", pe_temperature=10000, pe_offset=-0.5, num_denoising=100, label_noise_ratio=0.5, 
box_noise_scale=1.0, learnt_init_query=True, use_input_proj=True, dual_queries=False, dual_groups=0, eps=1e-2): super(GroupDINOTransformer, self).__init__() assert position_embed_type in ['sine', 'learned'], \ f'ValueError: position_embed_type not supported {position_embed_type}!' assert len(backbone_feat_channels) <= num_levels self.hidden_dim = hidden_dim self.nhead = nhead self.num_levels = num_levels self.num_classes = num_classes self.num_queries = num_queries self.eps = eps self.num_decoder_layers = num_decoder_layers self.use_input_proj = use_input_proj if use_input_proj: # backbone feature projection self._build_input_proj_layer(backbone_feat_channels) # Transformer module encoder_layer = DINOTransformerEncoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_encoder_points) self.encoder = DINOTransformerEncoder(encoder_layer, num_encoder_layers) decoder_layer = DINOTransformerDecoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_decoder_points, dual_queries=dual_queries, dual_groups=dual_groups) self.decoder = DINOTransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, return_intermediate_dec) # denoising part self.denoising_class_embed = nn.Embedding( num_classes, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) self.num_denoising = num_denoising self.label_noise_ratio = label_noise_ratio self.box_noise_scale = box_noise_scale # for dual group self.dual_queries = dual_queries self.dual_groups = dual_groups if self.dual_queries: self.denoising_class_embed_groups = nn.LayerList([ nn.Embedding( num_classes, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) for _ in range(self.dual_groups) ]) # position embedding self.position_embedding = PositionEmbedding( hidden_dim // 2, temperature=pe_temperature, normalize=True if position_embed_type == 'sine' else False, embed_type=position_embed_type, offset=pe_offset) self.level_embed = nn.Embedding(num_levels, hidden_dim) # decoder embedding self.learnt_init_query = learnt_init_query if learnt_init_query: self.tgt_embed = nn.Embedding(num_queries, hidden_dim) normal_(self.tgt_embed.weight) if self.dual_queries: self.tgt_embed_dual = nn.LayerList([ nn.Embedding(num_queries, hidden_dim) for _ in range(self.dual_groups) ]) for dual_tgt_module in self.tgt_embed_dual: normal_(dual_tgt_module.weight) self.query_pos_head = MLP(2 * hidden_dim, hidden_dim, hidden_dim, num_layers=2) # encoder head self.enc_output = nn.Sequential( nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) if self.dual_queries: self.enc_output = _get_clones(self.enc_output, self.dual_groups + 1) else: self.enc_output = _get_clones(self.enc_output, 1) self.enc_score_head = nn.Linear(hidden_dim, num_classes) self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) if self.dual_queries: self.enc_bbox_head_dq = nn.LayerList([ MLP(hidden_dim, hidden_dim, 4, num_layers=3) for i in range(self.dual_groups) ]) self.enc_score_head_dq = nn.LayerList([ nn.Linear(hidden_dim, num_classes) for i in range(self.dual_groups) ]) # decoder head self.dec_score_head = nn.LayerList([ nn.Linear(hidden_dim, num_classes) for _ in range(num_decoder_layers) ]) self.dec_bbox_head = nn.LayerList([ MLP(hidden_dim, hidden_dim, 4, num_layers=3) for _ in range(num_decoder_layers) ]) self._reset_parameters() def _reset_parameters(self): # class and bbox head init bias_cls = 
bias_init_with_prob(0.01) linear_init_(self.enc_score_head) constant_(self.enc_score_head.bias, bias_cls) constant_(self.enc_bbox_head.layers[-1].weight) constant_(self.enc_bbox_head.layers[-1].bias) for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): linear_init_(cls_) constant_(cls_.bias, bias_cls) constant_(reg_.layers[-1].weight) constant_(reg_.layers[-1].bias) for enc_output in self.enc_output: linear_init_(enc_output[0]) xavier_uniform_(enc_output[0].weight) normal_(self.level_embed.weight) if self.learnt_init_query: xavier_uniform_(self.tgt_embed.weight) xavier_uniform_(self.query_pos_head.layers[0].weight) xavier_uniform_(self.query_pos_head.layers[1].weight) normal_(self.denoising_class_embed.weight) if self.use_input_proj: for l in self.input_proj: xavier_uniform_(l[0].weight) constant_(l[0].bias) @classmethod def from_config(cls, cfg, input_shape): return {'backbone_feat_channels': [i.channels for i in input_shape], } def _build_input_proj_layer(self, backbone_feat_channels): self.input_proj = nn.LayerList() for in_channels in backbone_feat_channels: self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=1)), ('norm', nn.GroupNorm( 32, self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = backbone_feat_channels[-1] for _ in range(self.num_levels - len(backbone_feat_channels)): self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1)), ('norm', nn.GroupNorm( 32, self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = self.hidden_dim def _get_encoder_input(self, feats, pad_mask=None): if self.use_input_proj: # get projection features proj_feats = [ self.input_proj[i](feat) for i, feat in enumerate(feats) ] if self.num_levels > len(proj_feats): len_srcs = len(proj_feats) for i in range(len_srcs, self.num_levels): if i == len_srcs: proj_feats.append(self.input_proj[i](feats[-1])) else: proj_feats.append(self.input_proj[i](proj_feats[-1])) else: proj_feats = feats # get encoder inputs feat_flatten = [] mask_flatten = [] lvl_pos_embed_flatten = [] spatial_shapes = [] valid_ratios = [] for i, feat in enumerate(proj_feats): bs, _, h, w = feat.shape spatial_shapes.append(paddle.concat([h, w])) # [b,c,h,w] -> [b,h*w,c] feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) if pad_mask is not None: mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0] else: mask = paddle.ones([bs, h, w]) valid_ratios.append(get_valid_ratio(mask)) # [b, h*w, c] pos_embed = self.position_embedding(mask).flatten(1, 2) lvl_pos_embed = pos_embed + self.level_embed.weight[i].reshape( [1, 1, -1]) lvl_pos_embed_flatten.append(lvl_pos_embed) if pad_mask is not None: # [b, h*w] mask_flatten.append(mask.flatten(1)) # [b, l, c] feat_flatten = paddle.concat(feat_flatten, 1) # [b, l] mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten, 1) # [b, l, c] lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) # [num_levels, 2] spatial_shapes = paddle.to_tensor( paddle.stack(spatial_shapes).astype('int64')) # [l] start index of each level level_start_index = paddle.concat([ paddle.zeros( [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1] ]) # [b, num_levels, 2] valid_ratios = paddle.stack(valid_ratios, 1) return (feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, 
valid_ratios) def forward(self, feats, pad_mask=None, gt_meta=None): # input projection and embedding (feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios) = self._get_encoder_input(feats, pad_mask) # encoder memory = self.encoder(feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios) # prepare denoising training if self.training: denoising_class, denoising_bbox, attn_mask, dn_meta = \ get_contrastive_denoising_training_group(gt_meta, self.num_classes, self.num_queries, self.denoising_class_embed.weight, self.num_denoising, self.label_noise_ratio, self.box_noise_scale) if self.dual_queries: denoising_class_groups = [] denoising_bbox_groups = [] attn_mask_groups = [] dn_meta_groups = [] for g_id in range(self.dual_groups): denoising_class_gid, denoising_bbox_gid, attn_mask_gid, dn_meta_gid = \ get_contrastive_denoising_training_group(gt_meta, self.num_classes, self.num_queries, self.denoising_class_embed_groups[g_id].weight, self.num_denoising, self.label_noise_ratio, self.box_noise_scale) denoising_class_groups.append(denoising_class_gid) denoising_bbox_groups.append(denoising_bbox_gid) attn_mask_groups.append(attn_mask_gid) dn_meta_groups.append(dn_meta_gid) # combine denoising_class = [denoising_class] + denoising_class_groups denoising_bbox = [denoising_bbox] + denoising_bbox_groups attn_mask = [attn_mask] + attn_mask_groups dn_meta = [dn_meta] + dn_meta_groups else: denoising_class, denoising_bbox, attn_mask, dn_meta = None, None, None, None target, init_ref_points, enc_topk_bboxes, enc_topk_logits = \ self._get_decoder_input( memory, spatial_shapes, mask_flatten, denoising_class, denoising_bbox) # decoder inter_feats, inter_ref_bboxes = self.decoder( target, init_ref_points, memory, spatial_shapes, level_start_index, self.dec_bbox_head, self.query_pos_head, valid_ratios, attn_mask, mask_flatten) # solve hang during distributed training inter_feats[0] += self.denoising_class_embed.weight[0, 0] * 0. 
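        # NOTE: `weight[0, 0] * 0.` is a numerical no-op, but it pulls the
        # denoising embedding into the autograd graph even when no denoising
        # query contributed to the output, so every parameter receives a
        # gradient and the distributed all-reduce cannot hang on unused
        # parameters.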
if self.dual_queries: for g_id in range(self.dual_groups): inter_feats[0] += self.denoising_class_embed_groups[ g_id].weight[0, 0] * 0.0 out_bboxes = [] out_logits = [] for i in range(self.num_decoder_layers): out_logits.append(self.dec_score_head[i](inter_feats[i])) if i == 0: out_bboxes.append( F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + inverse_sigmoid(init_ref_points))) else: out_bboxes.append( F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + inverse_sigmoid(inter_ref_bboxes[i - 1]))) out_bboxes = paddle.stack(out_bboxes) out_logits = paddle.stack(out_logits) return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, dn_meta) def _get_encoder_output_anchors(self, memory, spatial_shapes, memory_mask=None, grid_size=0.05): output_anchors = [] idx = 0 for lvl, (h, w) in enumerate(spatial_shapes): if memory_mask is not None: mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w]) valid_H = paddle.sum(mask_[:, :, 0], 1) valid_W = paddle.sum(mask_[:, 0, :], 1) else: valid_H, valid_W = h, w grid_y, grid_x = paddle.meshgrid( paddle.arange( end=h, dtype=memory.dtype), paddle.arange( end=w, dtype=memory.dtype)) grid_xy = paddle.stack([grid_x, grid_y], -1) valid_WH = paddle.stack([valid_W, valid_H], -1).reshape( [-1, 1, 1, 2]).astype(grid_xy.dtype) grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) output_anchors.append( paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) idx += h * w output_anchors = paddle.concat(output_anchors, 1) valid_mask = ((output_anchors > self.eps) * (output_anchors < 1 - self.eps)).all(-1, keepdim=True) output_anchors = paddle.log(output_anchors / (1 - output_anchors)) if memory_mask is not None: valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0 output_anchors = paddle.where(valid_mask, output_anchors, paddle.to_tensor(float("inf"))) memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) if self.dual_queries: output_memory = [ self.enc_output[g_id](memory) for g_id in range(self.dual_groups + 1) ] else: output_memory = self.enc_output[0](memory) return output_memory, output_anchors def _get_decoder_input(self, memory, spatial_shapes, memory_mask=None, denoising_class=None, denoising_bbox=None): bs, _, _ = memory.shape # prepare input for decoder output_memory, output_anchors = self._get_encoder_output_anchors( memory, spatial_shapes, memory_mask) if self.dual_queries: enc_outputs_class = self.enc_score_head(output_memory[0]) enc_outputs_coord_unact = self.enc_bbox_head(output_memory[ 0]) + output_anchors else: enc_outputs_class = self.enc_score_head(output_memory) enc_outputs_coord_unact = self.enc_bbox_head( output_memory) + output_anchors _, topk_ind = paddle.topk( enc_outputs_class.max(-1), self.num_queries, axis=1) # extract region proposal boxes batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) topk_coords_unact = paddle.gather_nd(enc_outputs_coord_unact, topk_ind) # unsigmoided. 
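        # `topk_ind` now stacks (batch index, query index) into
        # [bs, num_queries, 2], so `gather_nd` picks per-image top-k rows:
        # `enc_outputs_coord_unact` is [bs, l, 4] and the gathered
        # `topk_coords_unact` is [bs, num_queries, 4], still in logit space.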
enc_topk_bboxes = F.sigmoid(topk_coords_unact) reference_points = enc_topk_bboxes.detach() enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind) if self.dual_queries: enc_topk_logits_groups = [] enc_topk_bboxes_groups = [] reference_points_groups = [] topk_ind_groups = [] for g_id in range(self.dual_groups): enc_outputs_class_gid = self.enc_score_head_dq[g_id]( output_memory[g_id + 1]) enc_outputs_coord_unact_gid = self.enc_bbox_head_dq[g_id]( output_memory[g_id + 1]) + output_anchors _, topk_ind_gid = paddle.topk( enc_outputs_class_gid.max(-1), self.num_queries, axis=1) # extract region proposal boxes batch_ind = paddle.arange(end=bs, dtype=topk_ind_gid.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) topk_ind_gid = paddle.stack([batch_ind, topk_ind_gid], axis=-1) topk_coords_unact_gid = paddle.gather_nd( enc_outputs_coord_unact_gid, topk_ind_gid) # unsigmoided. enc_topk_bboxes_gid = F.sigmoid(topk_coords_unact_gid) reference_points_gid = enc_topk_bboxes_gid.detach() enc_topk_logits_gid = paddle.gather_nd(enc_outputs_class_gid, topk_ind_gid) # append and combine topk_ind_groups.append(topk_ind_gid) enc_topk_logits_groups.append(enc_topk_logits_gid) enc_topk_bboxes_groups.append(enc_topk_bboxes_gid) reference_points_groups.append(reference_points_gid) enc_topk_bboxes = paddle.concat( [enc_topk_bboxes] + enc_topk_bboxes_groups, 1) enc_topk_logits = paddle.concat( [enc_topk_logits] + enc_topk_logits_groups, 1) reference_points = paddle.concat( [reference_points] + reference_points_groups, 1) topk_ind = paddle.concat([topk_ind] + topk_ind_groups, 1) # extract region features if self.learnt_init_query: target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) if self.dual_queries: target = paddle.concat([target] + [ self.tgt_embed_dual[g_id].weight.unsqueeze(0).tile( [bs, 1, 1]) for g_id in range(self.dual_groups) ], 1) else: if self.dual_queries: target = paddle.gather_nd(output_memory[0], topk_ind) target_groups = [] for g_id in range(self.dual_groups): target_gid = paddle.gather_nd(output_memory[g_id + 1], topk_ind_groups[g_id]) target_groups.append(target_gid) target = paddle.concat([target] + target_groups, 1).detach() else: target = paddle.gather_nd(output_memory, topk_ind).detach() if denoising_bbox is not None: if isinstance(denoising_bbox, list) and isinstance( denoising_class, list) and self.dual_queries: if denoising_bbox[0] is not None: reference_points_list = paddle.split( reference_points, self.dual_groups + 1, axis=1) reference_points = paddle.concat( [ paddle.concat( [ref, ref_], axis=1) for ref, ref_ in zip(denoising_bbox, reference_points_list) ], axis=1) target_list = paddle.split( target, self.dual_groups + 1, axis=1) target = paddle.concat( [ paddle.concat( [tgt, tgt_], axis=1) for tgt, tgt_ in zip(denoising_class, target_list) ], axis=1) else: reference_points, target = reference_points, target else: reference_points = paddle.concat( [denoising_bbox, reference_points], 1) target = paddle.concat([denoising_class, target], 1) return target, reference_points, enc_topk_bboxes, enc_topk_logits ================================================ FILE: ppdet/modeling/transformers/hybrid_encoder.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register, serializable from ppdet.modeling.ops import get_act_fn from ..shape_spec import ShapeSpec from ..backbones.csp_darknet import BaseConv from ..backbones.cspresnet import RepVggBlock from ppdet.modeling.transformers.detr_transformer import TransformerEncoder from ..initializer import xavier_uniform_, linear_init_ from ..layers import MultiHeadAttention from paddle import ParamAttr from paddle.regularizer import L2Decay __all__ = ['HybridEncoder', 'MaskHybridEncoder'] class CSPRepLayer(nn.Layer): def __init__(self, in_channels, out_channels, num_blocks=3, expansion=1.0, bias=False, act="silu"): super(CSPRepLayer, self).__init__() hidden_channels = int(out_channels * expansion) self.conv1 = BaseConv( in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) self.conv2 = BaseConv( in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) self.bottlenecks = nn.Sequential(* [ RepVggBlock( hidden_channels, hidden_channels, act=act) for _ in range(num_blocks) ]) if hidden_channels != out_channels: self.conv3 = BaseConv( hidden_channels, out_channels, ksize=1, stride=1, bias=bias, act=act) else: self.conv3 = nn.Identity() def forward(self, x): x_1 = self.conv1(x) x_1 = self.bottlenecks(x_1) x_2 = self.conv2(x) return self.conv3(x_1 + x_2) @register class TransformerLayer(nn.Layer): def __init__(self, d_model, nhead, dim_feedforward=1024, dropout=0., activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False): super(TransformerLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) @staticmethod def with_pos_embed(tensor, pos_embed): return tensor if pos_embed is None else tensor + pos_embed def forward(self, src, src_mask=None, pos_embed=None): residual = src if self.normalize_before: src = self.norm1(src) q = k = self.with_pos_embed(src, pos_embed) src = self.self_attn(q, k, value=src, attn_mask=src_mask) src = residual + self.dropout1(src) if not self.normalize_before: src = self.norm1(src) residual = src if self.normalize_before: src = self.norm2(src) src = self.linear2(self.dropout(self.activation(self.linear1(src)))) src = residual + self.dropout2(src) if not self.normalize_before: src = self.norm2(src) return src @register @serializable class HybridEncoder(nn.Layer): __shared__ = 
['depth_mult', 'act', 'trt', 'eval_size'] __inject__ = ['encoder_layer'] def __init__(self, in_channels=[512, 1024, 2048], feat_strides=[8, 16, 32], hidden_dim=256, use_encoder_idx=[2], num_encoder_layers=1, encoder_layer='TransformerLayer', pe_temperature=10000, expansion=1.0, depth_mult=1.0, act='silu', trt=False, eval_size=None): super(HybridEncoder, self).__init__() self.in_channels = in_channels self.feat_strides = feat_strides self.hidden_dim = hidden_dim self.use_encoder_idx = use_encoder_idx self.num_encoder_layers = num_encoder_layers self.pe_temperature = pe_temperature self.eval_size = eval_size # channel projection self.input_proj = nn.LayerList() for in_channel in in_channels: self.input_proj.append( nn.Sequential( nn.Conv2D( in_channel, hidden_dim, kernel_size=1, bias_attr=False), nn.BatchNorm2D( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))))) # encoder transformer self.encoder = nn.LayerList([ TransformerEncoder(encoder_layer, num_encoder_layers) for _ in range(len(use_encoder_idx)) ]) act = get_act_fn( act, trt=trt) if act is None or isinstance(act, (str, dict)) else act # top-down fpn self.lateral_convs = nn.LayerList() self.fpn_blocks = nn.LayerList() for idx in range(len(in_channels) - 1, 0, -1): self.lateral_convs.append( BaseConv( hidden_dim, hidden_dim, 1, 1, act=act)) self.fpn_blocks.append( CSPRepLayer( hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion)) # bottom-up pan self.downsample_convs = nn.LayerList() self.pan_blocks = nn.LayerList() for idx in range(len(in_channels) - 1): self.downsample_convs.append( BaseConv( hidden_dim, hidden_dim, 3, stride=2, act=act)) self.pan_blocks.append( CSPRepLayer( hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion)) self._reset_parameters() def _reset_parameters(self): if self.eval_size: for idx in self.use_encoder_idx: stride = self.feat_strides[idx] pos_embed = self.build_2d_sincos_position_embedding( self.eval_size[1] // stride, self.eval_size[0] // stride, self.hidden_dim, self.pe_temperature) setattr(self, f'pos_embed{idx}', pos_embed) @staticmethod def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.): grid_w = paddle.arange(int(w), dtype=paddle.float32) grid_h = paddle.arange(int(h), dtype=paddle.float32) grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) assert embed_dim % 4 == 0, \ 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' pos_dim = embed_dim // 4 omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim omega = 1. 
/ (temperature**omega) out_w = grid_w.flatten()[..., None] @omega[None] out_h = grid_h.flatten()[..., None] @omega[None] return paddle.concat( [ paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h), paddle.cos(out_h) ], axis=1)[None, :, :] def forward(self, feats, for_mot=False, is_teacher=False): assert len(feats) == len(self.in_channels) # get projection features proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] # encoder if self.num_encoder_layers > 0: for i, enc_ind in enumerate(self.use_encoder_idx): h, w = proj_feats[enc_ind].shape[2:] # flatten [B, C, H, W] to [B, HxW, C] src_flatten = proj_feats[enc_ind].flatten(2).transpose( [0, 2, 1]) if self.training or self.eval_size is None or is_teacher: pos_embed = self.build_2d_sincos_position_embedding( w, h, self.hidden_dim, self.pe_temperature) else: pos_embed = getattr(self, f'pos_embed{enc_ind}', None) memory = self.encoder[i](src_flatten, pos_embed=pos_embed) proj_feats[enc_ind] = memory.transpose([0, 2, 1]).reshape( [-1, self.hidden_dim, h, w]) # top-down fpn inner_outs = [proj_feats[-1]] for idx in range(len(self.in_channels) - 1, 0, -1): feat_heigh = inner_outs[0] feat_low = proj_feats[idx - 1] feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx]( feat_heigh) inner_outs[0] = feat_heigh upsample_feat = F.interpolate( feat_heigh, scale_factor=2., mode="nearest") inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx]( paddle.concat( [upsample_feat, feat_low], axis=1)) inner_outs.insert(0, inner_out) # bottom-up pan outs = [inner_outs[0]] for idx in range(len(self.in_channels) - 1): feat_low = outs[-1] feat_height = inner_outs[idx + 1] downsample_feat = self.downsample_convs[idx](feat_low) out = self.pan_blocks[idx](paddle.concat( [downsample_feat, feat_height], axis=1)) outs.append(out) return outs @classmethod def from_config(cls, cfg, input_shape): return { 'in_channels': [i.channels for i in input_shape], 'feat_strides': [i.stride for i in input_shape] } @property def out_shape(self): return [ ShapeSpec( channels=self.hidden_dim, stride=self.feat_strides[idx]) for idx in range(len(self.in_channels)) ] class MaskFeatFPN(nn.Layer): def __init__(self, in_channels=[256, 256, 256], fpn_strides=[32, 16, 8], feat_channels=256, dropout_ratio=0.0, out_channels=256, align_corners=False, act='swish'): super(MaskFeatFPN, self).__init__() assert len(in_channels) == len(fpn_strides) reorder_index = np.argsort(fpn_strides, axis=0) in_channels = [in_channels[i] for i in reorder_index] fpn_strides = [fpn_strides[i] for i in reorder_index] assert min(fpn_strides) == fpn_strides[0] self.reorder_index = reorder_index self.fpn_strides = fpn_strides self.dropout_ratio = dropout_ratio self.align_corners = align_corners if self.dropout_ratio > 0: self.dropout = nn.Dropout2D(dropout_ratio) self.scale_heads = nn.LayerList() for i in range(len(fpn_strides)): head_length = max( 1, int(np.log2(fpn_strides[i]) - np.log2(fpn_strides[0]))) scale_head = [] for k in range(head_length): in_c = in_channels[i] if k == 0 else feat_channels scale_head.append( nn.Sequential( BaseConv(in_c, feat_channels, 3, 1, act=act)) ) if fpn_strides[i] != fpn_strides[0]: scale_head.append( nn.Upsample( scale_factor=2, mode='bilinear', align_corners=align_corners)) self.scale_heads.append(nn.Sequential(*scale_head)) self.output_conv = BaseConv( feat_channels, out_channels, 3, 1, act=act) def forward(self, inputs): x = [inputs[i] for i in self.reorder_index] output = self.scale_heads[0](x[0]) for i in range(1, len(self.fpn_strides)): output = 
output + F.interpolate( self.scale_heads[i](x[i]), size=output.shape[2:], mode='bilinear', align_corners=self.align_corners) if self.dropout_ratio > 0: output = self.dropout(output) output = self.output_conv(output) return output @register @serializable class MaskHybridEncoder(HybridEncoder): __shared__ = ['depth_mult', 'act', 'trt', 'eval_size', 'num_prototypes'] __inject__ = ['encoder_layer'] def __init__(self, in_channels=[256, 512, 1024, 2048], feat_strides=[4, 8, 16, 32], hidden_dim=256, use_encoder_idx=[3], num_encoder_layers=1, encoder_layer='TransformerLayer', num_prototypes=32, pe_temperature=10000, expansion=1.0, depth_mult=1.0, mask_feat_channels=[64, 64], act='silu', trt=False, eval_size=None): assert len(in_channels) == len(feat_strides) x4_feat_dim = in_channels.pop(0) x4_feat_stride = feat_strides.pop(0) use_encoder_idx = [i - 1 for i in use_encoder_idx] assert x4_feat_stride == 4 super(MaskHybridEncoder, self).__init__( in_channels=in_channels, feat_strides=feat_strides, hidden_dim=hidden_dim, use_encoder_idx=use_encoder_idx, num_encoder_layers=num_encoder_layers, encoder_layer=encoder_layer, pe_temperature=pe_temperature, expansion=expansion, depth_mult=depth_mult, act=act, trt=trt, eval_size=eval_size) self.mask_feat_head = MaskFeatFPN( [hidden_dim] * len(feat_strides), feat_strides, feat_channels=mask_feat_channels[0], out_channels=mask_feat_channels[1], act=act) self.enc_mask_lateral = BaseConv( x4_feat_dim, mask_feat_channels[1], 3, 1, act=act) self.enc_mask_output = nn.Sequential( BaseConv( mask_feat_channels[1], mask_feat_channels[1], 3, 1, act=act), nn.Conv2D(mask_feat_channels[1], num_prototypes, 1)) def forward(self, feats, for_mot=False, is_teacher=False): x4_feat = feats.pop(0) enc_feats = super(MaskHybridEncoder, self).forward( feats, for_mot=for_mot, is_teacher=is_teacher) mask_feat = self.mask_feat_head(enc_feats) mask_feat = F.interpolate( mask_feat, scale_factor=2, mode='bilinear', align_corners=False) mask_feat += self.enc_mask_lateral(x4_feat) mask_feat = self.enc_mask_output(mask_feat) return enc_feats, mask_feat ================================================ FILE: ppdet/modeling/transformers/mask_dino_transformer.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Modified from detrex (https://github.com/IDEA-Research/detrex) # Copyright 2022 The IDEA Authors. All rights reserved. 
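# MaskDINO couples a deformable transformer encoder/decoder with a stride-4
# mask branch: a class head and a box head shared across all decoder layers
# score each layer's queries, while `_get_pred_class_and_mask` dot-products
# query embeddings with the mask feature map to produce per-query masks.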
from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register from .position_encoding import PositionEmbedding from ..heads.detr_head import MLP from .deformable_transformer import (DeformableTransformerEncoderLayer, DeformableTransformerEncoder) from .dino_transformer import (DINOTransformerDecoderLayer) from ..initializer import (linear_init_, constant_, xavier_uniform_, bias_init_with_prob) from .utils import (_get_clones, get_valid_ratio, get_denoising_training_group, get_sine_pos_embed, inverse_sigmoid, mask_to_box_coordinate) __all__ = ['MaskDINO'] class ConvGNBlock(nn.Layer): def __init__(self, in_channels, out_channels, kernel_size, stride=1, groups=1, num_groups=32, bias=False, act=None): super(ConvGNBlock, self).__init__() self.conv = nn.Conv2D( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=(kernel_size - 1) // 2, groups=groups, bias_attr=bias) self.norm = nn.GroupNorm( num_groups, out_channels, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self.act = getattr(F, act) if act is not None else None self._init_weights() def _init_weights(self): xavier_uniform_(self.conv.weight) def forward(self, x): x = self.norm(self.conv(x)) if self.act is not None: x = self.act(x) return x class MaskDINOTransformerDecoder(nn.Layer): def __init__(self, hidden_dim, decoder_layer, num_layers): super(MaskDINOTransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.hidden_dim = hidden_dim self.num_layers = num_layers def forward(self, tgt, ref_points_unact, memory, memory_spatial_shapes, memory_level_start_index, bbox_head, query_pos_head, dec_norm, valid_ratios=None, attn_mask=None, memory_mask=None): if valid_ratios is None: valid_ratios = paddle.ones( [memory.shape[0], memory_spatial_shapes.shape[0], 2]) output = tgt intermediate = [] inter_bboxes = [] ref_points = F.sigmoid(ref_points_unact) for i, layer in enumerate(self.layers): reference_points_input = ref_points.detach().unsqueeze( 2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1) query_pos_embed = get_sine_pos_embed( reference_points_input[..., 0, :], self.hidden_dim // 2) query_pos_embed = query_pos_head(query_pos_embed) output = layer(output, reference_points_input, memory, memory_spatial_shapes, memory_level_start_index, attn_mask, memory_mask, query_pos_embed) ref_points = F.sigmoid( bbox_head(output) + inverse_sigmoid(ref_points.detach())) intermediate.append(dec_norm(output)) inter_bboxes.append(ref_points) return paddle.stack(intermediate), paddle.stack(inter_bboxes) @register class MaskDINO(nn.Layer): __shared__ = ['num_classes', 'hidden_dim'] def __init__(self, num_classes=80, hidden_dim=256, num_queries=300, position_embed_type='sine', in_feats_channel=[256, 512, 1024, 2048], num_levels=3, num_encoder_points=4, num_decoder_points=4, nhead=8, num_encoder_layers=6, num_decoder_layers=9, enc_dim_feedforward=1024, dec_dim_feedforward=2048, dropout=0., activation="relu", lr_mult=1.0, pe_temperature=10000, pe_offset=-0.5, num_denoising=100, label_noise_ratio=0.4, box_noise_scale=0.4, learnt_init_query=False, mask_enhanced=True, eps=1e-2): super(MaskDINO, self).__init__() assert position_embed_type in ['sine', 'learned'], \ f'ValueError: position_embed_type not supported {position_embed_type}!' 
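        # The first entry of `in_feats_channel` is the stride-4 feature used
        # only by the mask branch (`enc_mask_lateral`/`enc_mask_output`); the
        # remaining levels are projected and fed to the deformable encoder.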
feat0_dim = in_feats_channel.pop(0) assert len(in_feats_channel) <= num_levels self.hidden_dim = hidden_dim self.nhead = nhead self.num_levels = num_levels self.num_classes = num_classes self.num_queries = num_queries self.eps = eps self.num_decoder_layers = num_decoder_layers self.mask_enhanced = mask_enhanced weight_attr = ParamAttr(regularizer=L2Decay(0.0)) bias_attr = ParamAttr(regularizer=L2Decay(0.0)) # backbone feature projection self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr) # Transformer module encoder_layer = DeformableTransformerEncoderLayer( hidden_dim, nhead, enc_dim_feedforward, dropout, activation, num_levels, num_encoder_points, lr_mult, weight_attr, bias_attr) self.encoder = DeformableTransformerEncoder(encoder_layer, num_encoder_layers) decoder_layer = DINOTransformerDecoderLayer( hidden_dim, nhead, dec_dim_feedforward, dropout, activation, num_levels, num_decoder_points, lr_mult, weight_attr, bias_attr) self.decoder = MaskDINOTransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers) # denoising part self.denoising_class_embed = nn.Embedding( num_classes, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) self.num_denoising = num_denoising self.label_noise_ratio = label_noise_ratio self.box_noise_scale = box_noise_scale # position embedding self.position_embedding = PositionEmbedding( hidden_dim // 2, temperature=pe_temperature, normalize=True if position_embed_type == 'sine' else False, embed_type=position_embed_type, offset=pe_offset) self.level_embed = nn.Embedding( num_levels, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) # decoder embedding self.learnt_init_query = learnt_init_query if learnt_init_query: self.tgt_embed = nn.Embedding(num_queries, hidden_dim) self.query_pos_head = MLP(2 * hidden_dim, hidden_dim, hidden_dim, num_layers=2) # mask embedding self.mask_query_head = MLP(hidden_dim, hidden_dim, hidden_dim, num_layers=3) # encoder mask head self.enc_mask_lateral = ConvGNBlock(feat0_dim, hidden_dim, 1) self.enc_mask_output = nn.Sequential( ConvGNBlock( hidden_dim, hidden_dim, 3, act=activation), nn.Conv2D(hidden_dim, hidden_dim, 1)) # encoder head self.enc_output = nn.Sequential( nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm( hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)) # decoder norm layer self.dec_norm = nn.LayerNorm( hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr) # shared prediction head self.class_head = nn.Linear(hidden_dim, num_classes) self.bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) self._reset_parameters() def _reset_parameters(self): # class and bbox head init bias_cls = bias_init_with_prob(0.01) linear_init_(self.class_head) constant_(self.class_head.bias, bias_cls) constant_(self.bbox_head.layers[-1].weight) constant_(self.bbox_head.layers[-1].bias) xavier_uniform_(self.enc_mask_output[1].weight) linear_init_(self.enc_output[0]) xavier_uniform_(self.enc_output[0].weight) if self.learnt_init_query: xavier_uniform_(self.tgt_embed.weight) xavier_uniform_(self.query_pos_head.layers[0].weight) xavier_uniform_(self.query_pos_head.layers[1].weight) for l in self.input_proj: xavier_uniform_(l[0].weight) @classmethod def from_config(cls, cfg, input_shape): return {'in_feats_channel': [i.channels for i in input_shape], } def _build_input_proj_layer(self, in_feats_channel, weight_attr=None, bias_attr=None): self.input_proj = nn.LayerList() for in_channels in in_feats_channel: self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( 
in_channels, self.hidden_dim, kernel_size=1)), ( 'norm', nn.GroupNorm( 32, self.hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)))) in_channels = in_feats_channel[-1] for _ in range(self.num_levels - len(in_feats_channel)): self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1)), ('norm', nn.GroupNorm( 32, self.hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)))) in_channels = self.hidden_dim
def _get_encoder_input(self, feats, pad_mask=None): # get projection features proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] if self.num_levels > len(proj_feats): len_srcs = len(proj_feats) for i in range(len_srcs, self.num_levels): if i == len_srcs: proj_feats.append(self.input_proj[i](feats[-1])) else: proj_feats.append(self.input_proj[i](proj_feats[-1]))
# get encoder inputs feat_flatten = [] mask_flatten = [] lvl_pos_embed_flatten = [] spatial_shapes = [] valid_ratios = [] for i, feat in enumerate(proj_feats): bs, _, h, w = feat.shape spatial_shapes.append(paddle.concat([h, w])) # [b,c,h,w] -> [b,h*w,c] feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) if pad_mask is not None: mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0] else: mask = paddle.ones([bs, h, w]) valid_ratios.append(get_valid_ratio(mask)) # [b, h*w, c] pos_embed = self.position_embedding(mask).flatten(1, 2) lvl_pos_embed = pos_embed + self.level_embed.weight[i] lvl_pos_embed_flatten.append(lvl_pos_embed) if pad_mask is not None: # [b, h*w] mask_flatten.append(mask.flatten(1))
# [b, l, c] feat_flatten = paddle.concat(feat_flatten, 1) # [b, l] mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten, 1) # [b, l, c] lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) # [num_levels, 2] spatial_shapes = paddle.to_tensor( paddle.stack(spatial_shapes).astype('int64')) # [l] start index of each level level_start_index = paddle.concat([ paddle.zeros( [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1] ]) # [b, num_levels, 2] valid_ratios = paddle.stack(valid_ratios, 1) return (feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios)
def forward(self, feats, pad_mask=None, gt_meta=None): feat0 = feats.pop(0) # input projection and embedding (feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios) = self._get_encoder_input(feats, pad_mask) # encoder memory = self.encoder(feat_flatten, spatial_shapes, level_start_index, mask_flatten, lvl_pos_embed_flatten, valid_ratios) mask_feat = self._get_encoder_mask_feature(feat0, memory, spatial_shapes) # prepare denoising training if self.training: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ get_denoising_training_group(gt_meta, self.num_classes, self.num_queries, self.denoising_class_embed.weight, self.num_denoising, self.label_noise_ratio, self.box_noise_scale) else: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None target, init_ref_points_unact, enc_out, init_out = \ self._get_decoder_input( memory, mask_feat, spatial_shapes, mask_flatten, denoising_class, denoising_bbox_unact) # decoder inter_feats, inter_bboxes = self.decoder( target, init_ref_points_unact, memory, spatial_shapes, level_start_index, self.bbox_head, self.query_pos_head, self.dec_norm, valid_ratios, attn_mask, mask_flatten) out_logits = [] out_bboxes = [] out_masks = [] for i in range(self.num_decoder_layers): if self.training or i ==
self.num_decoder_layers - 1: logits_, masks_ = self._get_pred_class_and_mask(inter_feats[i], mask_feat) else: continue out_logits.append(logits_) out_masks.append(masks_) if i == 0: out_bboxes.append( F.sigmoid( self.bbox_head(inter_feats[i]) + init_ref_points_unact)) else: out_bboxes.append( F.sigmoid( self.bbox_head(inter_feats[i]) + inverse_sigmoid( inter_bboxes[i - 1]))) out_bboxes = paddle.stack(out_bboxes) out_logits = paddle.stack(out_logits) out_masks = paddle.stack(out_masks) return (out_logits, out_bboxes, out_masks, enc_out, init_out, dn_meta) def _get_encoder_mask_feature(self, in_feat, memory, spatial_shapes): memory_feat0 = memory.split( spatial_shapes.prod(1).split(self.num_levels), axis=1)[0] h, w = spatial_shapes[0] memory_feat0 = memory_feat0.reshape( [0, h, w, self.hidden_dim]).transpose([0, 3, 1, 2]) out = self.enc_mask_lateral(in_feat) + F.interpolate( memory_feat0, scale_factor=2.0, mode='bilinear', align_corners=False) return self.enc_mask_output(out) def _get_encoder_output_anchors(self, memory, spatial_shapes, memory_mask=None, grid_size=0.05): output_anchors = [] idx = 0 for lvl, (h, w) in enumerate(spatial_shapes): if memory_mask is not None: mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w]) valid_H = paddle.sum(mask_[:, :, 0], 1) valid_W = paddle.sum(mask_[:, 0, :], 1) else: valid_H, valid_W = h, w grid_y, grid_x = paddle.meshgrid( paddle.arange(end=h), paddle.arange(end=w)) grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype) valid_WH = paddle.stack([valid_W, valid_H], -1).reshape( [-1, 1, 1, 2]).astype(grid_xy.dtype) grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) output_anchors.append( paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) idx += h * w output_anchors = paddle.concat(output_anchors, 1) valid_mask = ((output_anchors > self.eps) * (output_anchors < 1 - self.eps)).all(-1, keepdim=True) output_anchors = paddle.log(output_anchors / (1 - output_anchors)) if memory_mask is not None: valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0 output_anchors = paddle.where(valid_mask, output_anchors, paddle.to_tensor(float("inf"))) memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) output_memory = self.enc_output(memory) return output_memory, output_anchors def _get_decoder_input(self, memory, mask_feat, spatial_shapes, memory_mask=None, denoising_class=None, denoising_bbox_unact=None): # prepare input for decoder bs, _, _ = memory.shape output_memory, output_anchors = self._get_encoder_output_anchors( memory, spatial_shapes, memory_mask) enc_logits_unact = self.class_head(output_memory) enc_bboxes_unact = self.bbox_head(output_memory) + output_anchors # get topk index _, topk_ind = paddle.topk( enc_logits_unact.max(-1), self.num_queries, axis=1) batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) # extract content and position query embedding target = paddle.gather_nd(output_memory, topk_ind) reference_points_unact = paddle.gather_nd(enc_bboxes_unact, topk_ind) # unsigmoided. 
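        # Reference boxes are kept in logit ("unsigmoided") space so that
        # decoder deltas compose additively, i.e.
        # box = sigmoid(bbox_head(feat) + inverse_sigmoid(prev_box));
        # the sigmoid is applied only where a box is actually consumed.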
        # get encoder output: {logits, bboxes, masks}
        enc_out_logits, enc_out_masks = self._get_pred_class_and_mask(
            target, mask_feat)
        enc_out_bboxes = F.sigmoid(reference_points_unact)
        enc_out = (enc_out_logits, enc_out_bboxes, enc_out_masks)

        # concat denoising query
        if self.learnt_init_query:
            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
        else:
            target = target.detach()
        if denoising_class is not None:
            target = paddle.concat([denoising_class, target], 1)

        if self.mask_enhanced:
            # use mask-enhanced anchor box initialization
            reference_points = mask_to_box_coordinate(
                enc_out_masks > 0, normalize=True, format="xywh")
            reference_points_unact = inverse_sigmoid(reference_points)
        if denoising_bbox_unact is not None:
            reference_points_unact = paddle.concat(
                [denoising_bbox_unact, reference_points_unact], 1)

        # direct prediction from the matching and denoising part in the beginning
        if self.training and denoising_class is not None:
            init_out_logits, init_out_masks = self._get_pred_class_and_mask(
                target, mask_feat)
            init_out_bboxes = F.sigmoid(reference_points_unact)
            init_out = (init_out_logits, init_out_bboxes, init_out_masks)
        else:
            init_out = None

        return target, reference_points_unact.detach(), enc_out, init_out

    def _get_pred_class_and_mask(self, query_embed, mask_feat):
        out_query = self.dec_norm(query_embed)
        out_logits = self.class_head(out_query)
        mask_query_embed = self.mask_query_head(out_query)
        _, _, h, w = mask_feat.shape
        # [b, q, c] x [b, c, h, w] -> [b, q, h, w]
        out_mask = paddle.bmm(mask_query_embed, mask_feat.flatten(2)).reshape(
            [0, 0, h, w])
        return out_logits, out_mask


================================================
FILE: ppdet/modeling/transformers/mask_rtdetr_transformer.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
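# A minimal sketch of the box-refinement rule used by the decoder in this
# module, b_i = sigmoid(bbox_head(h_i) + inverse_sigmoid(b_{i-1})); kept as
# comments, and the helper name `_demo_box_refine` and `delta` (standing in
# for `bbox_head(output)`) are illustrative, assuming only paddle:
#
#     def _demo_box_refine(prev_boxes, delta, eps=1e-5):
#         x = prev_boxes.clip(min=eps, max=1 - eps)
#         unact = paddle.log(x / (1 - x))   # inverse_sigmoid, as in .utils
#         return F.sigmoid(delta + unact)   # result stays in (0, 1)
#
# With delta == 0 the boxes are reproduced exactly, since
# sigmoid(inverse_sigmoid(0.5)) == 0.5; a learned delta nudges each box in
# logit space, which keeps every refined coordinate inside (0, 1).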
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register from .rtdetr_transformer import TransformerDecoderLayer from .utils import (_get_clones, inverse_sigmoid, get_denoising_training_group, mask_to_box_coordinate) from ..heads.detr_head import MLP from ..initializer import (linear_init_, constant_, xavier_uniform_, bias_init_with_prob) __all__ = ['MaskRTDETR'] def _get_pred_class_and_mask(query_embed, mask_feat, dec_norm, score_head, mask_query_head): out_query = dec_norm(query_embed) out_logits = score_head(out_query) mask_query_embed = mask_query_head(out_query) batch_size, mask_dim, _ = mask_query_embed.shape _, _, mask_h, mask_w = mask_feat.shape out_mask = paddle.bmm( mask_query_embed, mask_feat.flatten(2)).reshape( [batch_size, mask_dim, mask_h, mask_w]) return out_logits, out_mask class MaskTransformerDecoder(nn.Layer): def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1, eval_topk=100): super(MaskTransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.hidden_dim = hidden_dim self.num_layers = num_layers self.eval_idx = eval_idx if eval_idx >= 0 \ else num_layers + eval_idx self.eval_topk = eval_topk def forward(self, tgt, ref_points_unact, memory, memory_spatial_shapes, memory_level_start_index, mask_feat, bbox_head, score_head, query_pos_head, mask_query_head, dec_norm, attn_mask=None, memory_mask=None, query_pos_head_inv_sig=False): output = tgt dec_out_bboxes = [] dec_out_logits = [] dec_out_masks = [] ref_points_detach = F.sigmoid(ref_points_unact) for i, layer in enumerate(self.layers): ref_points_input = ref_points_detach.unsqueeze(2) if not query_pos_head_inv_sig: query_pos_embed = query_pos_head(ref_points_detach) else: query_pos_embed = query_pos_head( inverse_sigmoid(ref_points_detach)) output = layer(output, ref_points_input, memory, memory_spatial_shapes, memory_level_start_index, attn_mask, memory_mask, query_pos_embed) inter_ref_bbox = F.sigmoid(bbox_head(output) + inverse_sigmoid(ref_points_detach)) if self.training: logits_, masks_ = _get_pred_class_and_mask( output, mask_feat, dec_norm, score_head, mask_query_head) dec_out_logits.append(logits_) dec_out_masks.append(masks_) if i == 0: dec_out_bboxes.append(inter_ref_bbox) else: dec_out_bboxes.append( F.sigmoid(bbox_head(output) + inverse_sigmoid(ref_points))) elif i == self.eval_idx: logits_, masks_ = _get_pred_class_and_mask( output, mask_feat, dec_norm, score_head, mask_query_head) dec_out_logits.append(logits_) dec_out_masks.append(masks_) dec_out_bboxes.append(inter_ref_bbox) return (paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits), paddle.stack(dec_out_masks)) ref_points = inter_ref_bbox ref_points_detach = inter_ref_bbox.detach( ) if self.training else inter_ref_bbox return (paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits), paddle.stack(dec_out_masks)) @register class MaskRTDETR(nn.Layer): __shared__ = ['num_classes', 'hidden_dim', 'eval_size', 'num_prototypes'] def __init__(self, num_classes=80, hidden_dim=256, num_queries=300, position_embed_type='sine', backbone_feat_channels=[512, 1024, 2048], feat_strides=[8, 16, 32], num_prototypes=32, num_levels=3, num_decoder_points=4, nhead=8, num_decoder_layers=6, dim_feedforward=1024, dropout=0., activation="relu", num_denoising=100, label_noise_ratio=0.4, 
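                 # Denoising knobs (consumed by get_denoising_training_group
                 # in .utils): `num_denoising` is roughly the per-image budget
                 # of noised ground-truth queries, `label_noise_ratio` controls
                 # how often a denoising label is randomly flipped, and
                 # `box_noise_scale` scales the jitter added to GT boxes.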
box_noise_scale=0.4, learnt_init_query=False, query_pos_head_inv_sig=False, mask_enhanced=True, eval_size=None, eval_idx=-1, eps=1e-2): super(MaskRTDETR, self).__init__() assert position_embed_type in ['sine', 'learned'], \ f'ValueError: position_embed_type not supported {position_embed_type}!' assert len(backbone_feat_channels) <= num_levels assert len(feat_strides) == len(backbone_feat_channels) for _ in range(num_levels - len(feat_strides)): feat_strides.append(feat_strides[-1] * 2) self.hidden_dim = hidden_dim self.nhead = nhead self.feat_strides = feat_strides self.num_levels = num_levels self.num_classes = num_classes self.num_queries = num_queries self.eps = eps self.num_decoder_layers = num_decoder_layers self.mask_enhanced = mask_enhanced self.eval_size = eval_size # backbone feature projection self._build_input_proj_layer(backbone_feat_channels) # Transformer module decoder_layer = TransformerDecoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_decoder_points) self.decoder = MaskTransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, eval_idx) # denoising part self.denoising_class_embed = nn.Embedding( num_classes, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) self.num_denoising = num_denoising self.label_noise_ratio = label_noise_ratio self.box_noise_scale = box_noise_scale # decoder embedding self.learnt_init_query = learnt_init_query if learnt_init_query: self.tgt_embed = nn.Embedding(num_queries, hidden_dim) self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2) self.query_pos_head_inv_sig = query_pos_head_inv_sig # mask embedding self.mask_query_head = MLP(hidden_dim, hidden_dim, num_prototypes, num_layers=3) # encoder head self.enc_output = nn.Sequential( nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) # decoder norm layer self.dec_norm = nn.LayerNorm( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # shared prediction head self.score_head = nn.Linear(hidden_dim, num_classes) self.bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) self._reset_parameters() def _reset_parameters(self): # class and bbox head init bias_cls = bias_init_with_prob(0.01) linear_init_(self.score_head) constant_(self.score_head.bias, bias_cls) constant_(self.bbox_head.layers[-1].weight) constant_(self.bbox_head.layers[-1].bias) linear_init_(self.enc_output[0]) xavier_uniform_(self.enc_output[0].weight) if self.learnt_init_query: xavier_uniform_(self.tgt_embed.weight) xavier_uniform_(self.query_pos_head.layers[0].weight) xavier_uniform_(self.query_pos_head.layers[1].weight) for l in self.input_proj: xavier_uniform_(l[0].weight) # init encoder output anchors and valid_mask if self.eval_size: self.anchors, self.valid_mask = self._generate_anchors() @classmethod def from_config(cls, cfg, input_shape): return {'backbone_feat_channels': [i.channels for i in input_shape], 'feat_strides': [i.stride for i in input_shape]} def _build_input_proj_layer(self, backbone_feat_channels): self.input_proj = nn.LayerList() for in_channels in backbone_feat_channels: self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=1, bias_attr=False)), ('norm', nn.BatchNorm2D( self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = 
backbone_feat_channels[-1] for _ in range(self.num_levels - len(backbone_feat_channels)): self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1, bias_attr=False)), ('norm', nn.BatchNorm2D( self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = self.hidden_dim def _get_encoder_input(self, feats): # get projection features proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] if self.num_levels > len(proj_feats): len_srcs = len(proj_feats) for i in range(len_srcs, self.num_levels): if i == len_srcs: proj_feats.append(self.input_proj[i](feats[-1])) else: proj_feats.append(self.input_proj[i](proj_feats[-1])) # get encoder inputs feat_flatten = [] spatial_shapes = [] level_start_index = [0, ] for i, feat in enumerate(proj_feats): _, _, h, w = feat.shape # [b, c, h, w] -> [b, h*w, c] feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) # [num_levels, 2] spatial_shapes.append([h, w]) # [l], start index of each level level_start_index.append(h * w + level_start_index[-1]) # [b, l, c] feat_flatten = paddle.concat(feat_flatten, 1) level_start_index.pop() return feat_flatten, spatial_shapes, level_start_index def forward(self, feats, pad_mask=None, gt_meta=None, is_teacher=False): enc_feats, mask_feat = feats # input projection and embedding (memory, spatial_shapes, level_start_index) = self._get_encoder_input(enc_feats) # prepare denoising training if self.training: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ get_denoising_training_group(gt_meta, self.num_classes, self.num_queries, self.denoising_class_embed.weight, self.num_denoising, self.label_noise_ratio, self.box_noise_scale) else: denoising_class, denoising_bbox_unact,\ attn_mask, dn_meta = None, None, None, None target, init_ref_points_unact, enc_out, init_out = \ self._get_decoder_input( memory, mask_feat, spatial_shapes, denoising_class, denoising_bbox_unact, is_teacher) # decoder out_bboxes, out_logits, out_masks = self.decoder( target, init_ref_points_unact, memory, spatial_shapes, level_start_index, mask_feat, self.bbox_head, self.score_head, self.query_pos_head, self.mask_query_head, self.dec_norm, attn_mask=attn_mask, memory_mask=None, query_pos_head_inv_sig=self.query_pos_head_inv_sig) return out_logits, out_bboxes, out_masks, enc_out, init_out, dn_meta def _generate_anchors(self, spatial_shapes=None, grid_size=0.05, dtype=paddle.float32): if spatial_shapes is None: spatial_shapes = [ [int(self.eval_size[0] / s), int(self.eval_size[1] / s)] for s in self.feat_strides ] anchors = [] for lvl, (h, w) in enumerate(spatial_shapes): grid_y, grid_x = paddle.meshgrid( paddle.arange( end=h, dtype=dtype), paddle.arange( end=w, dtype=dtype)) grid_xy = paddle.stack([grid_x, grid_y], -1) valid_WH = paddle.to_tensor([h, w]).astype(dtype) grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH wh = paddle.ones_like(grid_xy) * grid_size * (2.0 ** lvl) anchors.append( paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) anchors = paddle.concat(anchors, 1) valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all(-1, keepdim=True) anchors = paddle.log(anchors / (1 - anchors)) anchors = paddle.where(valid_mask, anchors, paddle.to_tensor(float("inf"))) return anchors, valid_mask def _get_decoder_input(self, memory, mask_feat, spatial_shapes, denoising_class=None, denoising_bbox_unact=None, is_teacher=False): bs, _, _ = memory.shape # prepare input for decoder if 
self.training or self.eval_size is None or is_teacher: anchors, valid_mask = self._generate_anchors(spatial_shapes) else: anchors, valid_mask = self.anchors, self.valid_mask memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) output_memory = self.enc_output(memory) enc_logits_unact = self.score_head(output_memory) enc_bboxes_unact = self.bbox_head(output_memory) + anchors # get topk index _, topk_ind = paddle.topk( enc_logits_unact.max(-1), self.num_queries, axis=1) batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) # extract content and position query embedding target = paddle.gather_nd(output_memory, topk_ind) reference_points_unact = paddle.gather_nd(enc_bboxes_unact, topk_ind) # unsigmoided. # get encoder output: {logits, bboxes, masks} enc_out_logits, enc_out_masks = _get_pred_class_and_mask( target, mask_feat, self.dec_norm, self.score_head, self.mask_query_head) enc_out_bboxes = F.sigmoid(reference_points_unact) enc_out = (enc_out_logits, enc_out_bboxes, enc_out_masks) # concat denoising query if self.learnt_init_query: target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) else: target = target.detach() if denoising_class is not None: target = paddle.concat([denoising_class, target], 1) if self.mask_enhanced: # use mask-enhanced anchor box initialization reference_points = mask_to_box_coordinate( enc_out_masks > 0, normalize=True, format="xywh") reference_points_unact = inverse_sigmoid(reference_points) if denoising_bbox_unact is not None: reference_points_unact = paddle.concat( [denoising_bbox_unact, reference_points_unact], 1) # direct prediction from the matching and denoising part in the beginning if self.training and denoising_class is not None: init_out_logits, init_out_masks = _get_pred_class_and_mask( target, mask_feat, self.dec_norm, self.score_head, self.mask_query_head) init_out_bboxes = F.sigmoid(reference_points_unact) init_out = (init_out_logits, init_out_bboxes, init_out_masks) else: init_out = None return target, reference_points_unact.detach(), enc_out, init_out ================================================ FILE: ppdet/modeling/transformers/matchers.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from scipy.optimize import linear_sum_assignment

from ppdet.core.workspace import register, serializable
from ..losses.iou_loss import GIoULoss
from .utils import bbox_cxcywh_to_xyxy

__all__ = ['HungarianMatcher']


@register
@serializable
class HungarianMatcher(nn.Layer):
    __shared__ = ['use_focal_loss', 'with_mask', 'num_sample_points']

    def __init__(self,
                 matcher_coeff={
                     'class': 1,
                     'bbox': 5,
                     'giou': 2,
                     'mask': 1,
                     'dice': 1
                 },
                 use_focal_loss=False,
                 with_mask=False,
                 num_sample_points=12544,
                 alpha=0.25,
                 gamma=2.0):
        r"""
        Args:
            matcher_coeff (dict): The coefficient of hungarian matcher cost.
        """
        super(HungarianMatcher, self).__init__()
        self.matcher_coeff = matcher_coeff
        self.use_focal_loss = use_focal_loss
        self.with_mask = with_mask
        self.num_sample_points = num_sample_points
        self.alpha = alpha
        self.gamma = gamma
        self.giou_loss = GIoULoss()

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None):
        r"""
        Args:
            boxes (Tensor): [b, query, 4]
            logits (Tensor): [b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]
            masks (Tensor|None): [b, query, h, w]
            gt_mask (List(Tensor)): list[[n, H, W]]

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        bs, num_queries = boxes.shape[:2]

        num_gts = [len(a) for a in gt_class]
        if sum(num_gts) == 0:
            return [(paddle.to_tensor(
                [], dtype=paddle.int64), paddle.to_tensor(
                    [], dtype=paddle.int64)) for _ in range(bs)]

        # We flatten to compute the cost matrices in a batch
        # [batch_size * num_queries, num_classes]
        logits = logits.detach()
        out_prob = F.sigmoid(logits.flatten(
            0, 1)) if self.use_focal_loss else F.softmax(logits.flatten(0, 1))
        # [batch_size * num_queries, 4]
        out_bbox = boxes.detach().flatten(0, 1)

        # Also concat the target labels and boxes
        tgt_ids = paddle.concat(gt_class).flatten()
        tgt_bbox = paddle.concat(gt_bbox)

        # Compute the classification cost
        out_prob = paddle.gather(out_prob, tgt_ids, axis=1)
        if self.use_focal_loss:
            neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(
                1 - out_prob + 1e-8).log())
            pos_cost_class = self.alpha * (
                (1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log())
            cost_class = pos_cost_class - neg_cost_class
        else:
            cost_class = -out_prob

        # Compute the L1 cost between boxes
        cost_bbox = (
            out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1)

        # Compute the giou cost between boxes
        giou_loss = self.giou_loss(
            bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)),
            bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1)
        cost_giou = giou_loss - 1

        # Final cost matrix
        C = self.matcher_coeff['class'] * cost_class + \
            self.matcher_coeff['bbox'] * cost_bbox + \
            self.matcher_coeff['giou'] * cost_giou

        # Compute the mask cost and dice cost
        if self.with_mask:
            assert masks is not None and gt_mask is not None, \
                'Make sure the input has `mask` and `gt_mask`'
            # all masks share the same set of points for efficient matching
            sample_points = paddle.rand([bs, 1, self.num_sample_points, 2])
            sample_points = 2.0 * sample_points - 1.0

            out_mask = F.grid_sample(
                masks.detach(), sample_points, align_corners=False).squeeze(-2)
            out_mask = out_mask.flatten(0, 1)

            tgt_mask = paddle.concat(gt_mask).unsqueeze(1)
            sample_points = paddle.concat([
                a.tile([b, 1, 1, 1]) for a, b in zip(sample_points, num_gts)
                if b > 0
            ])
            tgt_mask = F.grid_sample(
                tgt_mask, sample_points, align_corners=False).squeeze([1, 2])

            with paddle.amp.auto_cast(enable=False):
                # binary cross entropy cost
                pos_cost_mask = F.binary_cross_entropy_with_logits(
                    out_mask, paddle.ones_like(out_mask), reduction='none')
                neg_cost_mask = F.binary_cross_entropy_with_logits(
                    out_mask, paddle.zeros_like(out_mask), reduction='none')
                cost_mask = paddle.matmul(
                    pos_cost_mask, tgt_mask, transpose_y=True) + paddle.matmul(
                        neg_cost_mask, 1 - tgt_mask, transpose_y=True)
                cost_mask /= self.num_sample_points

                # dice cost
                out_mask = F.sigmoid(out_mask)
                numerator = 2 * paddle.matmul(
                    out_mask, tgt_mask, transpose_y=True)
                denominator = out_mask.sum(
                    -1, keepdim=True) + tgt_mask.sum(-1).unsqueeze(0)
                cost_dice = 1 - (numerator + 1) / (denominator + 1)

                C = C + self.matcher_coeff['mask'] * cost_mask + \
                    self.matcher_coeff['dice'] * cost_dice

        C = C.reshape([bs, num_queries, -1])
        C = [a.squeeze(0) for a in C.chunk(bs)]
        sizes = [a.shape[0] for a in gt_bbox]
        if hasattr(paddle.Tensor, "contiguous"):
            indices = [
                linear_sum_assignment(
                    c.split(sizes, -1)[i].contiguous().numpy())
                for i, c in enumerate(C)
            ]
        else:
            indices = [
                linear_sum_assignment(c.split(sizes, -1)[i].numpy())
                for i, c in enumerate(C)
            ]
        return [(paddle.to_tensor(
            i, dtype=paddle.int64), paddle.to_tensor(
                j, dtype=paddle.int64)) for i, j in indices]


================================================
FILE: ppdet/modeling/transformers/petr_transformer.py
================================================
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
this code is based on https://github.com/hikvision-research/opera/blob/main/opera/models/utils/transformer.py
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import warnings

import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr

from ppdet.core.workspace import register
from ..layers import MultiHeadAttention, _convert_attention_mask
from .utils import _get_clones
from ..initializer import linear_init_, normal_, constant_, xavier_uniform_

__all__ = [
    'PETRTransformer', 'MultiScaleDeformablePoseAttention',
    'PETR_TransformerDecoderLayer', 'PETR_TransformerDecoder',
    'PETR_DeformableDetrTransformerDecoder',
    'PETR_DeformableTransformerDecoder', 'TransformerEncoderLayer',
    'TransformerEncoder', 'MSDeformableAttention'
]


def masked_fill(x, mask, value):
    y = paddle.full(x.shape, value, x.dtype)
    return paddle.where(mask, y, x)


def inverse_sigmoid(x, eps=1e-5):
    """Inverse function of sigmoid.

    Args:
        x (Tensor): The tensor to do the inverse.
        eps (float): EPS avoid numerical overflow. Defaults 1e-5.

    Returns:
        Tensor: `x` passed through the inverse sigmoid, with the same
            shape as the input.
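
    Example (illustrative): log(0.5 / 0.5) == 0, so an all-0.5 input maps
    to all zeros, and F.sigmoid(inverse_sigmoid(x)) recovers x up to the
    eps clipping.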
""" x = x.clip(min=0, max=1) x1 = x.clip(min=eps) x2 = (1 - x).clip(min=eps) return paddle.log(x1 / x2) @register class TransformerEncoderLayer(nn.Layer): __inject__ = ['attn'] def __init__(self, d_model, attn=None, nhead=8, dim_feedforward=2048, dropout=0.1, activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False): super(TransformerEncoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.embed_dims = d_model if attn is None: self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) else: self.self_attn = attn # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) @staticmethod def with_pos_embed(tensor, pos_embed): return tensor if pos_embed is None else tensor + pos_embed def forward(self, src, src_mask=None, pos_embed=None, **kwargs): residual = src if self.normalize_before: src = self.norm1(src) q = k = self.with_pos_embed(src, pos_embed) src = self.self_attn(q, k, value=src, attn_mask=src_mask, **kwargs) src = residual + self.dropout1(src) if not self.normalize_before: src = self.norm1(src) residual = src if self.normalize_before: src = self.norm2(src) src = self.linear2(self.dropout(self.activation(self.linear1(src)))) src = residual + self.dropout2(src) if not self.normalize_before: src = self.norm2(src) return src @register class TransformerEncoder(nn.Layer): __inject__ = ['encoder_layer'] def __init__(self, encoder_layer, num_layers, norm=None): super(TransformerEncoder, self).__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers self.norm = norm self.embed_dims = encoder_layer.embed_dims def forward(self, src, src_mask=None, pos_embed=None, **kwargs): output = src for layer in self.layers: output = layer( output, src_mask=src_mask, pos_embed=pos_embed, **kwargs) if self.norm is not None: output = self.norm(output) return output @register class MSDeformableAttention(nn.Layer): def __init__(self, embed_dim=256, num_heads=8, num_levels=4, num_points=4, lr_mult=0.1): """ Multi-Scale Deformable Attention Module """ super(MSDeformableAttention, self).__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.num_levels = num_levels self.num_points = num_points self.total_points = num_heads * num_levels * num_points self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.sampling_offsets = nn.Linear( embed_dim, self.total_points * 2, weight_attr=ParamAttr(learning_rate=lr_mult), bias_attr=ParamAttr(learning_rate=lr_mult)) self.attention_weights = nn.Linear(embed_dim, self.total_points) self.value_proj = nn.Linear(embed_dim, embed_dim) self.output_proj = nn.Linear(embed_dim, embed_dim) try: # use cuda op print("use deformable_detr_ops in ms_deformable_attn") from deformable_detr_ops import ms_deformable_attn except: # use paddle func from .utils import deformable_attention_core_func as 
ms_deformable_attn self.ms_deformable_attn_core = ms_deformable_attn self._reset_parameters() def _reset_parameters(self): # sampling_offsets constant_(self.sampling_offsets.weight) thetas = paddle.arange( self.num_heads, dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile( [1, self.num_levels, self.num_points, 1]) scaling = paddle.arange( 1, self.num_points + 1, dtype=paddle.float32).reshape([1, 1, -1, 1]) grid_init *= scaling self.sampling_offsets.bias.set_value(grid_init.flatten()) # attention_weights constant_(self.attention_weights.weight) constant_(self.attention_weights.bias) # proj xavier_uniform_(self.value_proj.weight) constant_(self.value_proj.bias) xavier_uniform_(self.output_proj.weight) constant_(self.output_proj.bias) def forward(self, query, key, value, reference_points, value_spatial_shapes, value_level_start_index, attn_mask=None, **kwargs): """ Args: query (Tensor): [bs, query_length, C] reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area value (Tensor): [bs, value_length, C] value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] attn_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements Returns: output (Tensor): [bs, Length_{query}, C] """ bs, Len_q = query.shape[:2] Len_v = value.shape[1] assert int(value_spatial_shapes.prod(1).sum()) == Len_v value = self.value_proj(value) if attn_mask is not None: attn_mask = attn_mask.astype(value.dtype).unsqueeze(-1) value *= attn_mask value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) sampling_offsets = self.sampling_offsets(query).reshape( [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) attention_weights = self.attention_weights(query).reshape( [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) attention_weights = F.softmax(attention_weights).reshape( [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) if reference_points.shape[-1] == 2: offset_normalizer = value_spatial_shapes.flip([1]).reshape( [1, 1, 1, self.num_levels, 1, 2]) sampling_locations = reference_points.reshape([ bs, Len_q, 1, self.num_levels, 1, 2 ]) + sampling_offsets / offset_normalizer elif reference_points.shape[-1] == 4: sampling_locations = ( reference_points[:, :, None, :, None, :2] + sampling_offsets / self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5) else: raise ValueError( "Last dim of reference_points must be 2 or 4, but get {} instead.". format(reference_points.shape[-1])) output = self.ms_deformable_attn_core( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) output = self.output_proj(output) return output @register class MultiScaleDeformablePoseAttention(nn.Layer): """An attention module used in PETR. `End-to-End Multi-Person Pose Estimation with Transformers`. Args: embed_dims (int): The embedding dimension of Attention. Default: 256. num_heads (int): Parallel attention heads. Default: 8. num_levels (int): The number of feature map used in Attention. Default: 4. num_points (int): The number of sampling points for each query in each head. Default: 17. im2col_step (int): The step used in image_to_column. 
Default: 64. dropout (float): A Dropout layer on `inp_residual`. Default: 0.1. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. """ def __init__(self, embed_dims=256, num_heads=8, num_levels=4, num_points=17, im2col_step=64, dropout=0.1, norm_cfg=None, init_cfg=None, batch_first=False, lr_mult=0.1): super().__init__() if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' f'but got {embed_dims} and {num_heads}') dim_per_head = embed_dims // num_heads self.norm_cfg = norm_cfg self.init_cfg = init_cfg self.dropout = nn.Dropout(dropout) self.batch_first = batch_first # you'd better set dim_per_head to a power of 2 # which is more efficient in the CUDA implementation def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( 'invalid input for _is_power_of_2: {} (type: {})'.format( n, type(n))) return (n & (n - 1) == 0) and n != 0 if not _is_power_of_2(dim_per_head): warnings.warn("You'd better set embed_dims in " 'MultiScaleDeformAttention to make ' 'the dimension of each attention head a power of 2 ' 'which is more efficient in our CUDA implementation.') self.im2col_step = im2col_step self.embed_dims = embed_dims self.num_levels = num_levels self.num_heads = num_heads self.num_points = num_points self.sampling_offsets = nn.Linear( embed_dims, num_heads * num_levels * num_points * 2, weight_attr=ParamAttr(learning_rate=lr_mult), bias_attr=ParamAttr(learning_rate=lr_mult)) self.attention_weights = nn.Linear(embed_dims, num_heads * num_levels * num_points) self.value_proj = nn.Linear(embed_dims, embed_dims) self.output_proj = nn.Linear(embed_dims, embed_dims) try: # use cuda op from deformable_detr_ops import ms_deformable_attn except: # use paddle func from .utils import deformable_attention_core_func as ms_deformable_attn self.ms_deformable_attn_core = ms_deformable_attn self.init_weights() def init_weights(self): """Default initialization for Parameters of Module.""" constant_(self.sampling_offsets.weight) constant_(self.sampling_offsets.bias) constant_(self.attention_weights.weight) constant_(self.attention_weights.bias) xavier_uniform_(self.value_proj.weight) constant_(self.value_proj.bias) xavier_uniform_(self.output_proj.weight) constant_(self.output_proj.bias) def forward(self, query, key, value, residual=None, attn_mask=None, reference_points=None, value_spatial_shapes=None, value_level_start_index=None, **kwargs): """Forward Function of MultiScaleDeformAttention. Args: query (Tensor): Query of Transformer with shape (num_query, bs, embed_dims). key (Tensor): The key tensor with shape (num_key, bs, embed_dims). value (Tensor): The value tensor with shape (num_key, bs, embed_dims). residual (Tensor): The tensor used for addition, with the same shape as `x`. Default None. If None, `x` will be used. reference_points (Tensor): The normalized reference points with shape (bs, num_query, num_levels, K*2), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. attn_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. value_spatial_shapes (Tensor): Spatial shape of features in different level. With shape (num_levels, 2), last dimension represent (h, w). value_level_start_index (Tensor): The start index of each level. A tensor has shape (num_levels) and can be represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. 
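
        Note: with K keypoint reference points per query, the sampling
        locations are spread around the keypoints and scaled by half the
        width/height of the per-query keypoint bounding box (the min/max
        over the K points), as computed in the body below.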
""" if key is None: key = query if value is None: value = key bs, num_query, _ = query.shape bs, num_key, _ = value.shape assert (value_spatial_shapes[:, 0].numpy() * value_spatial_shapes[:, 1].numpy()).sum() == num_key value = self.value_proj(value) if attn_mask is not None: # value = value.masked_fill(attn_mask[..., None], 0.0) value *= attn_mask.unsqueeze(-1) value = value.reshape([bs, num_key, self.num_heads, -1]) sampling_offsets = self.sampling_offsets(query).reshape([ bs, num_query, self.num_heads, self.num_levels, self.num_points, 2 ]) attention_weights = self.attention_weights(query).reshape( [bs, num_query, self.num_heads, self.num_levels * self.num_points]) attention_weights = F.softmax(attention_weights, axis=-1) attention_weights = attention_weights.reshape( [bs, num_query, self.num_heads, self.num_levels, self.num_points]) if reference_points.shape[-1] == self.num_points * 2: reference_points_reshape = reference_points.reshape( (bs, num_query, self.num_levels, -1, 2)).unsqueeze(2) x1 = reference_points[:, :, :, 0::2].min(axis=-1, keepdim=True) y1 = reference_points[:, :, :, 1::2].min(axis=-1, keepdim=True) x2 = reference_points[:, :, :, 0::2].max(axis=-1, keepdim=True) y2 = reference_points[:, :, :, 1::2].max(axis=-1, keepdim=True) w = paddle.clip(x2 - x1, min=1e-4) h = paddle.clip(y2 - y1, min=1e-4) wh = paddle.concat([w, h], axis=-1)[:, :, None, :, None, :] sampling_locations = reference_points_reshape \ + sampling_offsets * wh * 0.5 else: raise ValueError( f'Last dim of reference_points must be' f' 2K, but get {reference_points.shape[-1]} instead.') output = self.ms_deformable_attn_core( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) output = self.output_proj(output) return output @register class PETR_TransformerDecoderLayer(nn.Layer): __inject__ = ['self_attn', 'cross_attn'] def __init__(self, d_model, nhead=8, self_attn=None, cross_attn=None, dim_feedforward=2048, dropout=0.1, activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False): super(PETR_TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before if self_attn is None: self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) else: self.self_attn = self_attn if cross_attn is None: self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout) else: self.cross_attn = cross_attn # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.norm3 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) @staticmethod def with_pos_embed(tensor, pos_embed): return tensor if pos_embed is None else tensor + pos_embed def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, pos_embed=None, query_pos_embed=None, **kwargs): tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) residual = tgt if self.normalize_before: tgt = self.norm1(tgt) q = k = self.with_pos_embed(tgt, 
query_pos_embed) tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask) tgt = residual + self.dropout1(tgt) if not self.normalize_before: tgt = self.norm1(tgt) residual = tgt if self.normalize_before: tgt = self.norm2(tgt) q = self.with_pos_embed(tgt, query_pos_embed) key_tmp = tgt # k = self.with_pos_embed(memory, pos_embed) tgt = self.cross_attn( q, key=key_tmp, value=memory, attn_mask=memory_mask, **kwargs) tgt = residual + self.dropout2(tgt) if not self.normalize_before: tgt = self.norm2(tgt) residual = tgt if self.normalize_before: tgt = self.norm3(tgt) tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) tgt = residual + self.dropout3(tgt) if not self.normalize_before: tgt = self.norm3(tgt) return tgt @register class PETR_TransformerDecoder(nn.Layer): """Implements the decoder in PETR transformer. Args: return_intermediate (bool): Whether to return intermediate outputs. coder_norm_cfg (dict): Config of last normalization layer. Default: `LN`. """ __inject__ = ['decoder_layer'] def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False, num_keypoints=17, **kwargs): super(PETR_TransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.num_layers = num_layers self.norm = norm self.return_intermediate = return_intermediate self.num_keypoints = num_keypoints def forward(self, query, *args, reference_points=None, valid_ratios=None, kpt_branches=None, **kwargs): """Forward function for `TransformerDecoder`. Args: query (Tensor): Input query with shape (num_query, bs, embed_dims). reference_points (Tensor): The reference points of offset, has shape (bs, num_query, K*2). valid_ratios (Tensor): The radios of valid points on the feature map, has shape (bs, num_levels, 2). kpt_branches: (obj:`nn.LayerList`): Used for refining the regression results. Only would be passed when `with_box_refine` is True, otherwise would be passed a `None`. Returns: tuple (Tensor): Results with shape [1, num_query, bs, embed_dims] when return_intermediate is `False`, otherwise it has shape [num_layers, num_query, bs, embed_dims] and [num_layers, bs, num_query, K*2]. 
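
        Note: when `kpt_branches` is given, each layer refines the K*2
        keypoint reference points in logit space,
        new_ref = sigmoid(kpt_branches[lid](output) + inverse_sigmoid(ref)),
        and detaches them before the next layer.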
""" output = query intermediate = [] intermediate_reference_points = [] for lid, layer in enumerate(self.layers): if reference_points.shape[-1] == self.num_keypoints * 2: reference_points_input = \ reference_points[:, :, None] * \ valid_ratios.tile((1, 1, self.num_keypoints))[:, None] else: assert reference_points.shape[-1] == 2 reference_points_input = reference_points[:, :, None] * \ valid_ratios[:, None] output = layer( output, *args, reference_points=reference_points_input, **kwargs) if kpt_branches is not None: tmp = kpt_branches[lid](output) if reference_points.shape[-1] == self.num_keypoints * 2: new_reference_points = tmp + inverse_sigmoid( reference_points) new_reference_points = F.sigmoid(new_reference_points) else: raise NotImplementedError reference_points = new_reference_points.detach() if self.return_intermediate: intermediate.append(output) intermediate_reference_points.append(reference_points) if self.return_intermediate: return paddle.stack(intermediate), paddle.stack( intermediate_reference_points) return output, reference_points @register class PETR_DeformableTransformerDecoder(nn.Layer): __inject__ = ['decoder_layer'] def __init__(self, decoder_layer, num_layers, return_intermediate=False): super(PETR_DeformableTransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.num_layers = num_layers self.return_intermediate = return_intermediate def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_mask=None, query_pos_embed=None): output = tgt intermediate = [] for lid, layer in enumerate(self.layers): output = layer(output, reference_points, memory, memory_spatial_shapes, memory_mask, query_pos_embed) if self.return_intermediate: intermediate.append(output) if self.return_intermediate: return paddle.stack(intermediate) return output.unsqueeze(0) @register class PETR_DeformableDetrTransformerDecoder(PETR_DeformableTransformerDecoder): """Implements the decoder in DETR transformer. Args: return_intermediate (bool): Whether to return intermediate outputs. coder_norm_cfg (dict): Config of last normalization layer. Default: `LN`. """ def __init__(self, *args, return_intermediate=False, **kwargs): super(PETR_DeformableDetrTransformerDecoder, self).__init__(*args, **kwargs) self.return_intermediate = return_intermediate def forward(self, query, *args, reference_points=None, valid_ratios=None, reg_branches=None, **kwargs): """Forward function for `TransformerDecoder`. Args: query (Tensor): Input query with shape `(num_query, bs, embed_dims)`. reference_points (Tensor): The reference points of offset. has shape (bs, num_query, 4) when as_two_stage, otherwise has shape ((bs, num_query, 2). valid_ratios (Tensor): The radios of valid points on the feature map, has shape (bs, num_levels, 2) reg_branch: (obj:`nn.LayerList`): Used for refining the regression results. Only would be passed when with_box_refine is True, otherwise would be passed a `None`. Returns: Tensor: Results with shape [1, num_query, bs, embed_dims] when return_intermediate is `False`, otherwise it has shape [num_layers, num_query, bs, embed_dims]. 
""" output = query intermediate = [] intermediate_reference_points = [] for lid, layer in enumerate(self.layers): if reference_points.shape[-1] == 4: reference_points_input = reference_points[:, :, None] * \ paddle.concat([valid_ratios, valid_ratios], -1)[:, None] else: assert reference_points.shape[-1] == 2 reference_points_input = reference_points[:, :, None] * \ valid_ratios[:, None] output = layer( output, *args, reference_points=reference_points_input, **kwargs) if reg_branches is not None: tmp = reg_branches[lid](output) if reference_points.shape[-1] == 4: new_reference_points = tmp + inverse_sigmoid( reference_points) new_reference_points = F.sigmoid(new_reference_points) else: assert reference_points.shape[-1] == 2 new_reference_points = tmp new_reference_points[..., :2] = tmp[ ..., :2] + inverse_sigmoid(reference_points) new_reference_points = F.sigmoid(new_reference_points) reference_points = new_reference_points.detach() if self.return_intermediate: intermediate.append(output) intermediate_reference_points.append(reference_points) if self.return_intermediate: return paddle.stack(intermediate), paddle.stack( intermediate_reference_points) return output, reference_points @register class PETRTransformer(nn.Layer): """Implements the PETR transformer. Args: as_two_stage (bool): Generate query from encoder features. Default: False. num_feature_levels (int): Number of feature maps from FPN: Default: 4. two_stage_num_proposals (int): Number of proposals when set `as_two_stage` as True. Default: 300. """ __inject__ = ["encoder", "decoder", "hm_encoder", "refine_decoder"] def __init__(self, encoder="", decoder="", hm_encoder="", refine_decoder="", as_two_stage=True, num_feature_levels=4, two_stage_num_proposals=300, num_keypoints=17, **kwargs): super(PETRTransformer, self).__init__(**kwargs) self.as_two_stage = as_two_stage self.num_feature_levels = num_feature_levels self.two_stage_num_proposals = two_stage_num_proposals self.num_keypoints = num_keypoints self.encoder = encoder self.decoder = decoder self.embed_dims = self.encoder.embed_dims self.hm_encoder = hm_encoder self.refine_decoder = refine_decoder self.init_layers() self.init_weights() def init_layers(self): """Initialize layers of the DeformableDetrTransformer.""" #paddle.create_parameter self.level_embeds = paddle.create_parameter( (self.num_feature_levels, self.embed_dims), dtype="float32") if self.as_two_stage: self.enc_output = nn.Linear(self.embed_dims, self.embed_dims) self.enc_output_norm = nn.LayerNorm(self.embed_dims) self.refine_query_embedding = nn.Embedding(self.num_keypoints, self.embed_dims * 2) else: self.reference_points = nn.Linear(self.embed_dims, 2 * self.num_keypoints) def init_weights(self): """Initialize the transformer weights.""" for p in self.parameters(): if p.rank() > 1: xavier_uniform_(p) if hasattr(p, 'bias') and p.bias is not None: constant_(p.bais) for m in self.sublayers(): if isinstance(m, MSDeformableAttention): m._reset_parameters() for m in self.sublayers(): if isinstance(m, MultiScaleDeformablePoseAttention): m.init_weights() if not self.as_two_stage: xavier_uniform_(self.reference_points.weight) constant_(self.reference_points.bias) normal_(self.level_embeds) normal_(self.refine_query_embedding.weight) def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes): """Generate proposals from encoded memory. Args: memory (Tensor): The output of encoder, has shape (bs, num_key, embed_dim). num_key is equal the number of points on feature map from all level. 
memory_padding_mask (Tensor): Padding mask for memory. has shape (bs, num_key). spatial_shapes (Tensor): The shape of all feature maps. has shape (num_level, 2). Returns: tuple: A tuple of feature map and bbox prediction. - output_memory (Tensor): The input of decoder, has shape (bs, num_key, embed_dim). num_key is equal the number of points on feature map from all levels. - output_proposals (Tensor): The normalized proposal after a inverse sigmoid, has shape (bs, num_keys, 4). """ N, S, C = memory.shape proposals = [] _cur = 0 for lvl, (H, W) in enumerate(spatial_shapes): mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H * W)].reshape( [N, H, W, 1]) valid_H = paddle.sum(mask_flatten_[:, :, 0, 0], 1) valid_W = paddle.sum(mask_flatten_[:, 0, :, 0], 1) grid_y, grid_x = paddle.meshgrid( paddle.linspace( 0, H - 1, H, dtype="float32"), paddle.linspace( 0, W - 1, W, dtype="float32")) grid = paddle.concat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) scale = paddle.concat( [valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).reshape([N, 1, 1, 2]) grid = (grid.unsqueeze(0).expand((N, -1, -1, -1)) + 0.5) / scale proposal = grid.reshape([N, -1, 2]) proposals.append(proposal) _cur += (H * W) output_proposals = paddle.concat(proposals, 1) output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all( -1, keepdim=True).astype("bool") output_proposals = paddle.log(output_proposals / (1 - output_proposals)) output_proposals = masked_fill( output_proposals, ~memory_padding_mask.astype("bool").unsqueeze(-1), float('inf')) output_proposals = masked_fill(output_proposals, ~output_proposals_valid, float('inf')) output_memory = memory output_memory = masked_fill( output_memory, ~memory_padding_mask.astype("bool").unsqueeze(-1), float(0)) output_memory = masked_fill(output_memory, ~output_proposals_valid, float(0)) output_memory = self.enc_output_norm(self.enc_output(output_memory)) return output_memory, output_proposals @staticmethod def get_reference_points(spatial_shapes, valid_ratios): """Get the reference points used in decoder. Args: spatial_shapes (Tensor): The shape of all feature maps, has shape (num_level, 2). valid_ratios (Tensor): The radios of valid points on the feature map, has shape (bs, num_levels, 2). Returns: Tensor: reference points used in decoder, has \ shape (bs, num_keys, num_levels, 2). 
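
        Note: the points are the (x + 0.5, y + 0.5) cell centres of every
        level's grid, normalized by each image's valid height/width and
        finally scaled by `valid_ratios` across levels.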
""" reference_points_list = [] for lvl, (H, W) in enumerate(spatial_shapes): ref_y, ref_x = paddle.meshgrid( paddle.linspace( 0.5, H - 0.5, H, dtype="float32"), paddle.linspace( 0.5, W - 0.5, W, dtype="float32")) ref_y = ref_y.reshape( (-1, ))[None] / (valid_ratios[:, None, lvl, 1] * H) ref_x = ref_x.reshape( (-1, ))[None] / (valid_ratios[:, None, lvl, 0] * W) ref = paddle.stack((ref_x, ref_y), -1) reference_points_list.append(ref) reference_points = paddle.concat(reference_points_list, 1) reference_points = reference_points[:, :, None] * valid_ratios[:, None] return reference_points def get_valid_ratio(self, mask): """Get the valid radios of feature maps of all level.""" _, H, W = mask.shape valid_H = paddle.sum(mask[:, :, 0].astype('float'), 1) valid_W = paddle.sum(mask[:, 0, :].astype('float'), 1) valid_ratio_h = valid_H.astype('float') / H valid_ratio_w = valid_W.astype('float') / W valid_ratio = paddle.stack([valid_ratio_w, valid_ratio_h], -1) return valid_ratio def get_proposal_pos_embed(self, proposals, num_pos_feats=128, temperature=10000): """Get the position embedding of proposal.""" scale = 2 * math.pi dim_t = paddle.arange(num_pos_feats, dtype="float32") dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats) # N, L, 4 proposals = F.sigmoid(proposals) * scale # N, L, 4, 128 pos = proposals[:, :, :, None] / dim_t # N, L, 4, 64, 2 pos = paddle.stack( (pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), axis=4).flatten(2) return pos def forward(self, mlvl_feats, mlvl_masks, query_embed, mlvl_pos_embeds, kpt_branches=None, cls_branches=None): """Forward function for `Transformer`. Args: mlvl_feats (list(Tensor)): Input queries from different level. Each element has shape [bs, embed_dims, h, w]. mlvl_masks (list(Tensor)): The key_padding_mask from different level used for encoder and decoder, each element has shape [bs, h, w]. query_embed (Tensor): The query embedding for decoder, with shape [num_query, c]. mlvl_pos_embeds (list(Tensor)): The positional encoding of feats from different level, has the shape [bs, embed_dims, h, w]. kpt_branches (obj:`nn.LayerList`): Keypoint Regression heads for feature maps from each decoder layer. Only would be passed when `with_box_refine` is Ture. Default to None. cls_branches (obj:`nn.LayerList`): Classification heads for feature maps from each decoder layer. Only would be passed when `as_two_stage` is Ture. Default to None. Returns: tuple[Tensor]: results of decoder containing the following tensor. - inter_states: Outputs from decoder. If `return_intermediate_dec` is True output has shape \ (num_dec_layers, bs, num_query, embed_dims), else has \ shape (1, bs, num_query, embed_dims). - init_reference_out: The initial value of reference \ points, has shape (bs, num_queries, 4). - inter_references_out: The internal value of reference \ points in decoder, has shape \ (num_dec_layers, bs,num_query, embed_dims) - enc_outputs_class: The classification score of proposals \ generated from encoder's feature maps, has shape \ (batch, h*w, num_classes). \ Only would be returned when `as_two_stage` is True, \ otherwise None. - enc_outputs_kpt_unact: The regression results generated from \ encoder's feature maps., has shape (batch, h*w, K*2). Only would be returned when `as_two_stage` is True, \ otherwise None. 
""" assert self.as_two_stage or query_embed is not None feat_flatten = [] mask_flatten = [] lvl_pos_embed_flatten = [] spatial_shapes = [] for lvl, (feat, mask, pos_embed ) in enumerate(zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): bs, c, h, w = feat.shape spatial_shape = (h, w) spatial_shapes.append(spatial_shape) feat = feat.flatten(2).transpose((0, 2, 1)) mask = mask.flatten(1) pos_embed = pos_embed.flatten(2).transpose((0, 2, 1)) lvl_pos_embed = pos_embed + self.level_embeds[lvl].reshape( [1, 1, -1]) lvl_pos_embed_flatten.append(lvl_pos_embed) feat_flatten.append(feat) mask_flatten.append(mask) feat_flatten = paddle.concat(feat_flatten, 1) mask_flatten = paddle.concat(mask_flatten, 1) lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) spatial_shapes_cumsum = paddle.to_tensor( np.array(spatial_shapes).prod(1).cumsum(0)) spatial_shapes = paddle.to_tensor(spatial_shapes, dtype="int64") level_start_index = paddle.concat((paddle.zeros( (1, ), dtype=spatial_shapes.dtype), spatial_shapes_cumsum[:-1])) valid_ratios = paddle.stack( [self.get_valid_ratio(m) for m in mlvl_masks], 1) reference_points = \ self.get_reference_points(spatial_shapes, valid_ratios) memory = self.encoder( src=feat_flatten, pos_embed=lvl_pos_embed_flatten, src_mask=mask_flatten, value_spatial_shapes=spatial_shapes, reference_points=reference_points, value_level_start_index=level_start_index, valid_ratios=valid_ratios) bs, _, c = memory.shape hm_proto = None if self.training: hm_memory = paddle.slice( memory, starts=level_start_index[0], ends=level_start_index[1], axes=[1]) hm_pos_embed = paddle.slice( lvl_pos_embed_flatten, starts=level_start_index[0], ends=level_start_index[1], axes=[1]) hm_mask = paddle.slice( mask_flatten, starts=level_start_index[0], ends=level_start_index[1], axes=[1]) hm_reference_points = paddle.slice( reference_points, starts=level_start_index[0], ends=level_start_index[1], axes=[1])[:, :, :1, :] # official code make a mistake of pos_embed to pose_embed, which disable pos_embed hm_memory = self.hm_encoder( src=hm_memory, pose_embed=hm_pos_embed, src_mask=hm_mask, value_spatial_shapes=spatial_shapes[[0]], reference_points=hm_reference_points, value_level_start_index=level_start_index[0], valid_ratios=valid_ratios[:, :1, :]) hm_memory = hm_memory.reshape((bs, spatial_shapes[0, 0], spatial_shapes[0, 1], -1)) hm_proto = (hm_memory, mlvl_masks[0]) if self.as_two_stage: output_memory, output_proposals = \ self.gen_encoder_output_proposals( memory, mask_flatten, spatial_shapes) enc_outputs_class = cls_branches[self.decoder.num_layers]( output_memory) enc_outputs_kpt_unact = \ kpt_branches[self.decoder.num_layers](output_memory) enc_outputs_kpt_unact[..., 0::2] += output_proposals[..., 0:1] enc_outputs_kpt_unact[..., 1::2] += output_proposals[..., 1:2] topk = self.two_stage_num_proposals topk_proposals = paddle.topk( enc_outputs_class[..., 0], topk, axis=1)[1].unsqueeze(-1) #paddle.take_along_axis 对应torch.gather topk_kpts_unact = paddle.take_along_axis(enc_outputs_kpt_unact, topk_proposals, 1) topk_kpts_unact = topk_kpts_unact.detach() reference_points = F.sigmoid(topk_kpts_unact) init_reference_out = reference_points # learnable query and query_pos query_pos, query = paddle.split( query_embed, query_embed.shape[1] // c, axis=1) query_pos = query_pos.unsqueeze(0).expand((bs, -1, -1)) query = query.unsqueeze(0).expand((bs, -1, -1)) else: query_pos, query = paddle.split( query_embed, query_embed.shape[1] // c, axis=1) query_pos = query_pos.unsqueeze(0).expand((bs, -1, -1)) query = 
query.unsqueeze(0).expand((bs, -1, -1)) reference_points = F.sigmoid(self.reference_points(query_pos)) init_reference_out = reference_points # decoder inter_states, inter_references = self.decoder( query=query, memory=memory, query_pos_embed=query_pos, memory_mask=mask_flatten, reference_points=reference_points, value_spatial_shapes=spatial_shapes, value_level_start_index=level_start_index, valid_ratios=valid_ratios, kpt_branches=kpt_branches) inter_references_out = inter_references if self.as_two_stage: return inter_states, init_reference_out, \ inter_references_out, enc_outputs_class, \ enc_outputs_kpt_unact, hm_proto, memory return inter_states, init_reference_out, \ inter_references_out, None, None, None, None, None, hm_proto def forward_refine(self, mlvl_masks, memory, reference_points_pose, img_inds, kpt_branches=None, **kwargs): mask_flatten = [] spatial_shapes = [] for lvl, mask in enumerate(mlvl_masks): bs, h, w = mask.shape spatial_shape = (h, w) spatial_shapes.append(spatial_shape) mask = mask.flatten(1) mask_flatten.append(mask) mask_flatten = paddle.concat(mask_flatten, 1) spatial_shapes_cumsum = paddle.to_tensor( np.array( spatial_shapes, dtype='int64').prod(1).cumsum(0)) spatial_shapes = paddle.to_tensor(spatial_shapes, dtype="int64") level_start_index = paddle.concat((paddle.zeros( (1, ), dtype=spatial_shapes.dtype), spatial_shapes_cumsum[:-1])) valid_ratios = paddle.stack( [self.get_valid_ratio(m) for m in mlvl_masks], 1) # pose refinement (17 queries corresponding to 17 keypoints) # learnable query and query_pos refine_query_embedding = self.refine_query_embedding.weight query_pos, query = paddle.split(refine_query_embedding, 2, axis=1) pos_num = reference_points_pose.shape[0] query_pos = query_pos.unsqueeze(0).expand((pos_num, -1, -1)) query = query.unsqueeze(0).expand((pos_num, -1, -1)) reference_points = reference_points_pose.reshape( (pos_num, reference_points_pose.shape[1] // 2, 2)) pos_memory = memory[img_inds] mask_flatten = mask_flatten[img_inds] valid_ratios = valid_ratios[img_inds] if img_inds.size == 1: pos_memory = pos_memory.unsqueeze(0) mask_flatten = mask_flatten.unsqueeze(0) valid_ratios = valid_ratios.unsqueeze(0) inter_states, inter_references = self.refine_decoder( query=query, memory=pos_memory, query_pos_embed=query_pos, memory_mask=mask_flatten, reference_points=reference_points, value_spatial_shapes=spatial_shapes, value_level_start_index=level_start_index, valid_ratios=valid_ratios, reg_branches=kpt_branches, **kwargs) # [num_decoder, num_query, bs, embed_dim] init_reference_out = reference_points return inter_states, init_reference_out, inter_references ================================================ FILE: ppdet/modeling/transformers/position_encoding.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import paddle import paddle.nn as nn from ppdet.core.workspace import register, serializable @register @serializable class PositionEmbedding(nn.Layer): def __init__(self, num_pos_feats=128, temperature=10000, normalize=True, scale=2 * math.pi, embed_type='sine', num_embeddings=50, offset=0., eps=1e-6): super(PositionEmbedding, self).__init__() assert embed_type in ['sine', 'learned'] self.embed_type = embed_type self.offset = offset self.eps = eps if self.embed_type == 'sine': self.num_pos_feats = num_pos_feats self.temperature = temperature self.normalize = normalize self.scale = scale elif self.embed_type == 'learned': self.row_embed = nn.Embedding(num_embeddings, num_pos_feats) self.col_embed = nn.Embedding(num_embeddings, num_pos_feats) else: raise ValueError(f"{self.embed_type} is not supported.") def forward(self, mask): """ Args: mask (Tensor): [B, H, W] Returns: pos (Tensor): [B, H, W, C] """ if self.embed_type == 'sine': y_embed = mask.cumsum(1) x_embed = mask.cumsum(2) if self.normalize: y_embed = (y_embed + self.offset) / ( y_embed[:, -1:, :] + self.eps) * self.scale x_embed = (x_embed + self.offset) / ( x_embed[:, :, -1:] + self.eps) * self.scale dim_t = 2 * (paddle.arange(self.num_pos_feats) // 2).astype('float32') dim_t = self.temperature**(dim_t / self.num_pos_feats) pos_x = x_embed.unsqueeze(-1) / dim_t pos_y = y_embed.unsqueeze(-1) / dim_t pos_x = paddle.stack( (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), axis=4).flatten(3) pos_y = paddle.stack( (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), axis=4).flatten(3) return paddle.concat((pos_y, pos_x), axis=3) elif self.embed_type == 'learned': h, w = mask.shape[-2:] i = paddle.arange(w) j = paddle.arange(h) x_emb = self.col_embed(i) y_emb = self.row_embed(j) return paddle.concat( [ x_emb.unsqueeze(0).tile([h, 1, 1]), y_emb.unsqueeze(1).tile([1, w, 1]), ], axis=-1).unsqueeze(0) else: raise ValueError(f"not supported {self.embed_type}") ================================================ FILE: ppdet/modeling/transformers/rtdetr_transformer.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Modified from detrex (https://github.com/IDEA-Research/detrex) # Copyright 2022 The IDEA Authors. All rights reserved. 
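As a quick sanity check, here is a minimal, self-contained sketch of the sine branch of the PositionEmbedding class defined above. Shapes and constants mirror that class, but the snippet itself is illustrative and not part of the repository:

# Illustrative only: standalone sine position embedding (cf. PositionEmbedding above).
import math
import paddle

num_pos_feats, temperature = 128, 10000
mask = paddle.ones([2, 32, 32])                 # [B, H, W], all pixels valid
y_embed, x_embed = mask.cumsum(1), mask.cumsum(2)
scale = 2 * math.pi
y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * scale
x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * scale
dim_t = 2 * (paddle.arange(num_pos_feats) // 2).astype('float32')
dim_t = temperature**(dim_t / num_pos_feats)
pos_x = x_embed.unsqueeze(-1) / dim_t           # [B, H, W, num_pos_feats]
pos_y = y_embed.unsqueeze(-1) / dim_t
pos_x = paddle.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), axis=4).flatten(3)
pos_y = paddle.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), axis=4).flatten(3)
pos = paddle.concat((pos_y, pos_x), axis=3)
print(pos.shape)                                # [2, 32, 32, 256]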
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register from ..layers import MultiHeadAttention from ..heads.detr_head import MLP from .deformable_transformer import MSDeformableAttention from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_, bias_init_with_prob) from .utils import (_get_clones, get_sine_pos_embed, get_contrastive_denoising_training_group, inverse_sigmoid) __all__ = ['RTDETRTransformer'] class PPMSDeformableAttention(MSDeformableAttention): def forward(self, query, reference_points, value, value_spatial_shapes, value_level_start_index, value_mask=None): """ Args: query (Tensor): [bs, query_length, C] reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area value (Tensor): [bs, value_length, C] value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements Returns: output (Tensor): [bs, Length_{query}, C] """ bs, Len_q = query.shape[:2] Len_v = value.shape[1] value = self.value_proj(value) if value_mask is not None: value_mask = value_mask.astype(value.dtype).unsqueeze(-1) value *= value_mask value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) sampling_offsets = self.sampling_offsets(query).reshape( [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) attention_weights = self.attention_weights(query).reshape( [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) attention_weights = F.softmax(attention_weights).reshape( [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) if reference_points.shape[-1] == 2: offset_normalizer = paddle.to_tensor(value_spatial_shapes) offset_normalizer = offset_normalizer.flip([1]).reshape( [1, 1, 1, self.num_levels, 1, 2]) sampling_locations = reference_points.reshape([ bs, Len_q, 1, self.num_levels, 1, 2 ]) + sampling_offsets / offset_normalizer elif reference_points.shape[-1] == 4: sampling_locations = ( reference_points[:, :, None, :, None, :2] + sampling_offsets / self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5) else: raise ValueError( "Last dim of reference_points must be 2 or 4, but get {} instead.". 
format(reference_points.shape[-1])) if not isinstance(query, paddle.Tensor): from ppdet.modeling.transformers.utils import deformable_attention_core_func output = deformable_attention_core_func( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) else: value_spatial_shapes = paddle.to_tensor(value_spatial_shapes) value_level_start_index = paddle.to_tensor(value_level_start_index) output = self.ms_deformable_attn_core( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) output = self.output_proj(output) return output class TransformerDecoderLayer(nn.Layer): def __init__(self, d_model=256, n_head=8, dim_feedforward=1024, dropout=0., activation="relu", n_levels=4, n_points=4, weight_attr=None, bias_attr=None): super(TransformerDecoderLayer, self).__init__() # self attention self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # cross attention self.cross_attn = PPMSDeformableAttention(d_model, n_head, n_levels, n_points, 1.0) self.dropout2 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # ffn self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, bias_attr) self.activation = getattr(F, activation) self.dropout3 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, bias_attr) self.dropout4 = nn.Dropout(dropout) self.norm3 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) xavier_uniform_(self.linear1.weight) xavier_uniform_(self.linear2.weight) def with_pos_embed(self, tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, tgt): return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, attn_mask=None, memory_mask=None, query_pos_embed=None): # self attention q = k = self.with_pos_embed(tgt, query_pos_embed) if attn_mask is not None: attn_mask = paddle.where( attn_mask.astype('bool'), paddle.zeros(attn_mask.shape, tgt.dtype), paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)) tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask) tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) # cross attention tgt2 = self.cross_attn( self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask) tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) # ffn tgt2 = self.forward_ffn(tgt) tgt = tgt + self.dropout4(tgt2) tgt = self.norm3(tgt) return tgt class TransformerDecoder(nn.Layer): def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1): super(TransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.hidden_dim = hidden_dim self.num_layers = num_layers self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx def forward(self, tgt, ref_points_unact, memory, memory_spatial_shapes, memory_level_start_index, bbox_head, score_head, query_pos_head, attn_mask=None, memory_mask=None, query_pos_head_inv_sig=False): output = tgt dec_out_bboxes = [] 
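# The loop below performs RT-DETR's iterative box refinement: each layer
# predicts a delta in logit space and the reference is updated as
# ref = sigmoid(bbox_head[i](output) + inverse_sigmoid(ref)).
# Training collects every layer's output for auxiliary (deep) supervision,
# while evaluation stops at eval_idx and keeps a single prediction.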
dec_out_logits = [] ref_points_detach = F.sigmoid(ref_points_unact) for i, layer in enumerate(self.layers): ref_points_input = ref_points_detach.unsqueeze(2) if not query_pos_head_inv_sig: query_pos_embed = query_pos_head(ref_points_detach) else: query_pos_embed = query_pos_head( inverse_sigmoid(ref_points_detach)) output = layer(output, ref_points_input, memory, memory_spatial_shapes, memory_level_start_index, attn_mask, memory_mask, query_pos_embed) inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( ref_points_detach)) if self.training: dec_out_logits.append(score_head[i](output)) if i == 0: dec_out_bboxes.append(inter_ref_bbox) else: dec_out_bboxes.append( F.sigmoid(bbox_head[i](output) + inverse_sigmoid( ref_points))) elif i == self.eval_idx: dec_out_logits.append(score_head[i](output)) dec_out_bboxes.append(inter_ref_bbox) break ref_points = inter_ref_bbox ref_points_detach = inter_ref_bbox.detach( ) if self.training else inter_ref_bbox return paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits) @register class RTDETRTransformer(nn.Layer): __shared__ = ['num_classes', 'hidden_dim', 'eval_size'] def __init__(self, num_classes=80, hidden_dim=256, num_queries=300, position_embed_type='sine', backbone_feat_channels=[512, 1024, 2048], feat_strides=[8, 16, 32], num_levels=3, num_decoder_points=4, nhead=8, num_decoder_layers=6, dim_feedforward=1024, dropout=0., activation="relu", num_denoising=100, label_noise_ratio=0.5, box_noise_scale=1.0, learnt_init_query=True, query_pos_head_inv_sig=False, eval_size=None, eval_idx=-1, eps=1e-2): super(RTDETRTransformer, self).__init__() assert position_embed_type in ['sine', 'learned'], \ f'ValueError: position_embed_type not supported {position_embed_type}!' assert len(backbone_feat_channels) <= num_levels assert len(feat_strides) == len(backbone_feat_channels) for _ in range(num_levels - len(feat_strides)): feat_strides.append(feat_strides[-1] * 2) self.hidden_dim = hidden_dim self.nhead = nhead self.feat_strides = feat_strides self.num_levels = num_levels self.num_classes = num_classes self.num_queries = num_queries self.eps = eps self.num_decoder_layers = num_decoder_layers self.eval_size = eval_size # backbone feature projection self._build_input_proj_layer(backbone_feat_channels) # Transformer module decoder_layer = TransformerDecoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_decoder_points) self.decoder = TransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, eval_idx) # denoising part self.denoising_class_embed = nn.Embedding( num_classes, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) self.num_denoising = num_denoising self.label_noise_ratio = label_noise_ratio self.box_noise_scale = box_noise_scale # decoder embedding self.learnt_init_query = learnt_init_query if learnt_init_query: self.tgt_embed = nn.Embedding(num_queries, hidden_dim) self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2) self.query_pos_head_inv_sig = query_pos_head_inv_sig # encoder head self.enc_output = nn.Sequential( nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) self.enc_score_head = nn.Linear(hidden_dim, num_classes) self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) # decoder head self.dec_score_head = nn.LayerList([ nn.Linear(hidden_dim, num_classes) for _ in range(num_decoder_layers) ]) self.dec_bbox_head = nn.LayerList([ 
MLP(hidden_dim, hidden_dim, 4, num_layers=3) for _ in range(num_decoder_layers) ]) self._reset_parameters() def _reset_parameters(self): # class and bbox head init bias_cls = bias_init_with_prob(0.01) linear_init_(self.enc_score_head) constant_(self.enc_score_head.bias, bias_cls) constant_(self.enc_bbox_head.layers[-1].weight) constant_(self.enc_bbox_head.layers[-1].bias) for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): linear_init_(cls_) constant_(cls_.bias, bias_cls) constant_(reg_.layers[-1].weight) constant_(reg_.layers[-1].bias) linear_init_(self.enc_output[0]) xavier_uniform_(self.enc_output[0].weight) if self.learnt_init_query: xavier_uniform_(self.tgt_embed.weight) xavier_uniform_(self.query_pos_head.layers[0].weight) xavier_uniform_(self.query_pos_head.layers[1].weight) for l in self.input_proj: xavier_uniform_(l[0].weight) # init encoder output anchors and valid_mask if self.eval_size: self.anchors, self.valid_mask = self._generate_anchors() @classmethod def from_config(cls, cfg, input_shape): return {'backbone_feat_channels': [i.channels for i in input_shape]} def _build_input_proj_layer(self, backbone_feat_channels): self.input_proj = nn.LayerList() for in_channels in backbone_feat_channels: self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=1, bias_attr=False)), ('norm', nn.BatchNorm2D( self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = backbone_feat_channels[-1] for _ in range(self.num_levels - len(backbone_feat_channels)): self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1, bias_attr=False)), ('norm', nn.BatchNorm2D( self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = self.hidden_dim def _get_encoder_input(self, feats): # get projection features proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] if self.num_levels > len(proj_feats): len_srcs = len(proj_feats) for i in range(len_srcs, self.num_levels): if i == len_srcs: proj_feats.append(self.input_proj[i](feats[-1])) else: proj_feats.append(self.input_proj[i](proj_feats[-1])) # get encoder inputs feat_flatten = [] spatial_shapes = [] level_start_index = [0, ] for i, feat in enumerate(proj_feats): _, _, h, w = feat.shape # [b, c, h, w] -> [b, h*w, c] feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) # [num_levels, 2] spatial_shapes.append([h, w]) # [l], start index of each level level_start_index.append(h * w + level_start_index[-1]) # [b, l, c] feat_flatten = paddle.concat(feat_flatten, 1) level_start_index.pop() return (feat_flatten, spatial_shapes, level_start_index) def forward(self, feats, pad_mask=None, gt_meta=None, is_teacher=False): # input projection and embedding (memory, spatial_shapes, level_start_index) = self._get_encoder_input(feats) # prepare denoising training if self.training: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ get_contrastive_denoising_training_group(gt_meta, self.num_classes, self.num_queries, self.denoising_class_embed.weight, self.num_denoising, self.label_noise_ratio, self.box_noise_scale) else: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ self._get_decoder_input( memory, spatial_shapes, denoising_class, denoising_bbox_unact,is_teacher) # decoder 
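# At this point target is [bs, num_denoising + num_queries, hidden_dim]
# during training (denoising queries prepended), and attn_mask blocks
# attention between the denoising groups and the matching queries so the
# auxiliary denoising task cannot leak ground truth into the main queries.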
out_bboxes, out_logits = self.decoder( target, init_ref_points_unact, memory, spatial_shapes, level_start_index, self.dec_bbox_head, self.dec_score_head, self.query_pos_head, attn_mask=attn_mask, memory_mask=None, query_pos_head_inv_sig=self.query_pos_head_inv_sig) return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, dn_meta) def _generate_anchors(self, spatial_shapes=None, grid_size=0.05, dtype="float32"): if spatial_shapes is None: spatial_shapes = [ [int(self.eval_size[0] / s), int(self.eval_size[1] / s)] for s in self.feat_strides ] anchors = [] for lvl, (h, w) in enumerate(spatial_shapes): grid_y, grid_x = paddle.meshgrid( paddle.arange( end=h, dtype=dtype), paddle.arange( end=w, dtype=dtype)) grid_xy = paddle.stack([grid_x, grid_y], -1) valid_WH = paddle.to_tensor([h, w]).astype(dtype) grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) anchors.append( paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) anchors = paddle.concat(anchors, 1) valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all(-1, keepdim=True) anchors = paddle.log(anchors / (1 - anchors)) anchors = paddle.where(valid_mask, anchors, paddle.to_tensor(float("inf"))) return anchors, valid_mask def _get_decoder_input(self, memory, spatial_shapes, denoising_class=None, denoising_bbox_unact=None, is_teacher=False): bs, _, _ = memory.shape # prepare input for decoder if self.training or self.eval_size is None or is_teacher: anchors, valid_mask = self._generate_anchors(spatial_shapes) else: anchors, valid_mask = self.anchors, self.valid_mask memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) output_memory = self.enc_output(memory) enc_outputs_class = self.enc_score_head(output_memory) enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors _, topk_ind = paddle.topk( enc_outputs_class.max(-1), self.num_queries, axis=1) # extract region proposal boxes batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact, topk_ind) # unsigmoided. enc_topk_bboxes = F.sigmoid(reference_points_unact) if denoising_bbox_unact is not None: reference_points_unact = paddle.concat( [denoising_bbox_unact, reference_points_unact], 1) if self.training: reference_points_unact = reference_points_unact.detach() enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind) # extract region features if self.learnt_init_query: target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) else: target = paddle.gather_nd(output_memory, topk_ind) if self.training: target = target.detach() if denoising_class is not None: target = paddle.concat([denoising_class, target], 1) return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits ================================================ FILE: ppdet/modeling/transformers/rtdetr_transformerv2.py ================================================ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Modified from detrex (https://github.com/IDEA-Research/detrex) # Copyright 2022 The IDEA Authors. All rights reserved. from __future__ import absolute_import from __future__ import division from __future__ import print_function import functools import math import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register from .rtdetr_transformer import TransformerDecoder from .utils import deformable_attention_core_func_v2, get_contrastive_denoising_training_group from ..heads.detr_head import MLP from ..initializer import (linear_init_, constant_, xavier_uniform_, bias_init_with_prob) from ..layers import MultiHeadAttention __all__ = ['RTDETRTransformerv2'] class MSDeformableAttention(nn.Layer): def __init__(self, embed_dim=256, num_heads=8, num_levels=4, num_points=4, sampling_method='default', offset_scale=0.5, lr_mult=0.1): """ Multi-Scale Deformable Attention Module """ super(MSDeformableAttention, self).__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.num_levels = num_levels if isinstance(num_points, list): assert len(num_points) == num_levels, ValueError num_points_list = num_points else: num_points_list = [num_points for _ in range(num_levels)] self.num_points_list = num_points_list self.total_points = num_heads * sum(num_points_list) num_points_scale = [1 / n for n in num_points_list for _ in range(n)] self.register_buffer('num_points_scale', paddle.to_tensor(num_points_scale, dtype=paddle.float32)) self.sampling_method = sampling_method self.offset_scale = offset_scale self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.sampling_offsets = nn.Linear( embed_dim, self.total_points * 2, weight_attr=ParamAttr(learning_rate=lr_mult), bias_attr=ParamAttr(learning_rate=lr_mult)) self.attention_weights = nn.Linear(embed_dim, self.total_points) self.value_proj = nn.Linear(embed_dim, embed_dim) self.output_proj = nn.Linear(embed_dim, embed_dim) self.ms_deformable_attn_core = functools.partial( deformable_attention_core_func_v2, num_points_list=self.num_points_list, sampling_method=self.sampling_method) self._reset_parameters() if self.sampling_method == 'discrete': for p in self.sampling_offsets.parameters(): p.stop_gradient = True def _reset_parameters(self): # sampling_offsets constant_(self.sampling_offsets.weight) thetas = paddle.arange( self.num_heads, dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) grid_init = grid_init.reshape([self.num_heads, 1, 2]).tile( [1, sum(self.num_points_list), 1]) scaling = paddle.concat( [paddle.arange(1, n + 1, dtype=paddle.float32) for n in self.num_points_list]).reshape([1, -1, 1]) grid_init *= scaling self.sampling_offsets.bias.set_value(grid_init.flatten()) # 
attention_weights constant_(self.attention_weights.weight) constant_(self.attention_weights.bias) # proj xavier_uniform_(self.value_proj.weight) constant_(self.value_proj.bias) xavier_uniform_(self.output_proj.weight) constant_(self.output_proj.bias) def forward(self, query, reference_points, value, value_spatial_shapes, value_mask=None): """ Args: query (Tensor): [batch_num, query_len, num_heads * head_dim] reference_points (Tensor): [batch_num, query_length, n_levels, 2], range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area value (Tensor): [batch_num, value_len, num_heads * head_dim] value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] value_mask (Tensor): [batch_num, value_len], True for non-padding elements, False for padding elements Returns: output (Tensor): [bs, Length_{query}, C] """ batch_num, query_len = query.shape[:2] value_len = value.shape[1] value = self.value_proj(value) if value_mask is not None: value_mask = value_mask.astype(value.dtype).unsqueeze(-1) value *= value_mask value = value.reshape([batch_num, value_len, self.num_heads, self.head_dim]) sampling_offsets = self.sampling_offsets(query).reshape( [batch_num, query_len, self.num_heads, sum(self.num_points_list), 2]) attention_weights = self.attention_weights(query).reshape( [batch_num, query_len, self.num_heads, sum(self.num_points_list)]) attention_weights = F.softmax(attention_weights, axis=-1) if reference_points.shape[-1] == 2: offset_normalizer = value_spatial_shapes.flip([1]).reshape( [1, 1, 1, self.num_levels, 1, 2]) sampling_locations = reference_points.reshape([ batch_num, query_len, 1, self.num_levels, 1, 2 ]) + sampling_offsets / offset_normalizer.astype(sampling_offsets.dtype) elif reference_points.shape[-1] == 4: offset = sampling_offsets * reference_points[:, :, None, :, 2:] num_points_scale = self.num_points_scale.astype(query.dtype).unsqueeze(-1) offset = offset * num_points_scale * self.offset_scale sampling_locations = reference_points[:, :, None, :, :2] + offset else: raise ValueError( "Last dim of reference_points must be 2 or 4, but get {} instead.". 
format(reference_points.shape[-1])) output = self.ms_deformable_attn_core(value, value_spatial_shapes, sampling_locations, attention_weights) output = self.output_proj(output) return output class TransformerDecoderLayer(nn.Layer): def __init__(self, d_model=256, n_head=8, dim_feedforward=1024, dropout=0., activation="relu", n_levels=4, n_points=4, sampling_method='default', weight_attr=None, bias_attr=None): super(TransformerDecoderLayer, self).__init__() # self attention self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # cross attention self.cross_attn = MSDeformableAttention( d_model, n_head, n_levels, n_points, sampling_method=sampling_method, lr_mult=1.0) self.dropout2 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # ffn self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, bias_attr) self.activation = getattr(F, activation) self.dropout3 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, bias_attr) self.dropout4 = nn.Dropout(dropout) self.norm3 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) xavier_uniform_(self.linear1.weight) xavier_uniform_(self.linear2.weight) def with_pos_embed(self, tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, tgt): return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, attn_mask=None, memory_mask=None, query_pos_embed=None): # self attention q = k = self.with_pos_embed(tgt, query_pos_embed) if attn_mask is not None: attn_mask = paddle.where( attn_mask.astype('bool'), paddle.zeros(attn_mask.shape, tgt.dtype), paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)) tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask) tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) # cross attention tgt2 = self.cross_attn( self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, memory_spatial_shapes, memory_mask) tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) # ffn tgt2 = self.forward_ffn(tgt) tgt = tgt + self.dropout4(tgt2) tgt = self.norm3(tgt) return tgt @register class RTDETRTransformerv2(nn.Layer): __shared__ = ['num_classes', 'hidden_dim', 'eval_size'] def __init__(self, num_classes=80, hidden_dim=256, num_queries=300, position_embed_type='sine', backbone_feat_channels=[512, 1024, 2048], feat_strides=[8, 16, 32], num_levels=3, num_decoder_points=4, nhead=8, num_decoder_layers=6, dim_feedforward=1024, dropout=0., activation="relu", num_denoising=100, label_noise_ratio=0.5, box_noise_scale=1.0, learnt_init_query=True, query_pos_head_inv_sig=False, eval_size=None, eval_idx=-1, eps=1e-2, cross_attn_sampling_method='default'): super(RTDETRTransformerv2, self).__init__() assert position_embed_type in ['sine', 'learned'], \ f'ValueError: position_embed_type not supported {position_embed_type}!' 
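# If num_levels exceeds the number of backbone feature maps, the missing
# strides are synthesized below by doubling the last one, and
# _build_input_proj_layer adds matching stride-2 conv projections.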
assert len(backbone_feat_channels) <= num_levels assert len(feat_strides) == len(backbone_feat_channels) for _ in range(num_levels - len(feat_strides)): feat_strides.append(feat_strides[-1] * 2) self.hidden_dim = hidden_dim self.nhead = nhead self.feat_strides = feat_strides self.num_levels = num_levels self.num_classes = num_classes self.num_queries = num_queries self.eps = eps self.num_decoder_layers = num_decoder_layers self.eval_size = eval_size assert cross_attn_sampling_method in ['default', 'discrete'], NotImplementedError self.cross_attn_sampling_method = cross_attn_sampling_method # backbone feature projection self._build_input_proj_layer(backbone_feat_channels) # Transformer module decoder_layer = TransformerDecoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_decoder_points, sampling_method=cross_attn_sampling_method) self.decoder = TransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, eval_idx) # denoising part self.denoising_class_embed = nn.Embedding( num_classes, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) self.num_denoising = num_denoising self.label_noise_ratio = label_noise_ratio self.box_noise_scale = box_noise_scale # decoder embedding self.learnt_init_query = learnt_init_query if learnt_init_query: self.tgt_embed = nn.Embedding(num_queries, hidden_dim) self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2) self.query_pos_head_inv_sig = query_pos_head_inv_sig # encoder head self.enc_output = nn.Sequential( nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) self.enc_score_head = nn.Linear(hidden_dim, num_classes) self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) # decoder head self.dec_score_head = nn.LayerList([ nn.Linear(hidden_dim, num_classes) for _ in range(num_decoder_layers) ]) self.dec_bbox_head = nn.LayerList([ MLP(hidden_dim, hidden_dim, 4, num_layers=3) for _ in range(num_decoder_layers) ]) self._reset_parameters() def _reset_parameters(self): # class and bbox head init bias_cls = bias_init_with_prob(0.01) linear_init_(self.enc_score_head) constant_(self.enc_score_head.bias, bias_cls) constant_(self.enc_bbox_head.layers[-1].weight) constant_(self.enc_bbox_head.layers[-1].bias) for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): linear_init_(cls_) constant_(cls_.bias, bias_cls) constant_(reg_.layers[-1].weight) constant_(reg_.layers[-1].bias) linear_init_(self.enc_output[0]) xavier_uniform_(self.enc_output[0].weight) if self.learnt_init_query: xavier_uniform_(self.tgt_embed.weight) xavier_uniform_(self.query_pos_head.layers[0].weight) xavier_uniform_(self.query_pos_head.layers[1].weight) for l in self.input_proj: xavier_uniform_(l[0].weight) # init encoder output anchors and valid_mask if self.eval_size: self.anchors, self.valid_mask = self._generate_anchors() @classmethod def from_config(cls, cfg, input_shape): return {'backbone_feat_channels': [i.channels for i in input_shape]} def _build_input_proj_layer(self, backbone_feat_channels): self.input_proj = nn.LayerList() for in_channels in backbone_feat_channels: self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=1, bias_attr=False)), ('norm', nn.BatchNorm2D( self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = backbone_feat_channels[-1] for _ in 
range(self.num_levels - len(backbone_feat_channels)): self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1, bias_attr=False)), ('norm', nn.BatchNorm2D( self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = self.hidden_dim def _get_encoder_input(self, feats): # get projection features proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] if self.num_levels > len(proj_feats): len_srcs = len(proj_feats) for i in range(len_srcs, self.num_levels): if i == len_srcs: proj_feats.append(self.input_proj[i](feats[-1])) else: proj_feats.append(self.input_proj[i](proj_feats[-1])) # get encoder inputs feat_flatten = [] spatial_shapes = [] level_start_index = [0, ] for i, feat in enumerate(proj_feats): _, _, h, w = feat.shape # [b, c, h, w] -> [b, h*w, c] feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) # [num_levels, 2] spatial_shapes.append([h, w]) # [l], start index of each level level_start_index.append(h * w + level_start_index[-1]) # [b, l, c] feat_flatten = paddle.concat(feat_flatten, 1) level_start_index.pop() return (feat_flatten, spatial_shapes, level_start_index) def forward(self, feats, pad_mask=None, gt_meta=None, is_teacher=False): # input projection and embedding (memory, spatial_shapes, level_start_index) = self._get_encoder_input(feats) # prepare denoising training if self.training: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ get_contrastive_denoising_training_group(gt_meta, self.num_classes, self.num_queries, self.denoising_class_embed.weight, self.num_denoising, self.label_noise_ratio, self.box_noise_scale) else: denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ self._get_decoder_input( memory, spatial_shapes, denoising_class, denoising_bbox_unact,is_teacher) # decoder out_bboxes, out_logits = self.decoder( target, init_ref_points_unact, memory, spatial_shapes, level_start_index, self.dec_bbox_head, self.dec_score_head, self.query_pos_head, attn_mask=attn_mask, memory_mask=None, query_pos_head_inv_sig=self.query_pos_head_inv_sig) return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, dn_meta) def _generate_anchors(self, spatial_shapes=None, grid_size=0.05, dtype="float32"): if spatial_shapes is None: spatial_shapes = [ [int(self.eval_size[0] / s), int(self.eval_size[1] / s)] for s in self.feat_strides ] anchors = [] for lvl, (h, w) in enumerate(spatial_shapes): grid_y, grid_x = paddle.meshgrid( paddle.arange( end=h, dtype=dtype), paddle.arange( end=w, dtype=dtype)) grid_xy = paddle.stack([grid_x, grid_y], -1) valid_WH = paddle.to_tensor([h, w]).astype(dtype) grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) anchors.append( paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) anchors = paddle.concat(anchors, 1) valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all(-1, keepdim=True) anchors = paddle.log(anchors / (1 - anchors)) anchors = paddle.where(valid_mask, anchors, paddle.to_tensor(float("inf"))) return anchors, valid_mask def _get_decoder_input(self, memory, spatial_shapes, denoising_class=None, denoising_bbox_unact=None, is_teacher=False): bs, _, _ = memory.shape # prepare input for decoder if self.training or self.eval_size is None or is_teacher: anchors, valid_mask = self._generate_anchors(spatial_shapes) 
else: anchors, valid_mask = self.anchors, self.valid_mask memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) output_memory = self.enc_output(memory) enc_outputs_class = self.enc_score_head(output_memory) enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors _, topk_ind = paddle.topk( enc_outputs_class.max(-1), self.num_queries, axis=1) # extract region proposal boxes batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact, topk_ind) # unsigmoided. enc_topk_bboxes = F.sigmoid(reference_points_unact) if denoising_bbox_unact is not None: reference_points_unact = paddle.concat( [denoising_bbox_unact, reference_points_unact], 1) if self.training: reference_points_unact = reference_points_unact.detach() enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind) # extract region features if self.learnt_init_query: target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) else: target = paddle.gather_nd(output_memory, topk_ind) if self.training: target = target.detach() if denoising_class is not None: target = paddle.concat([denoising_class, target], 1) return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits ================================================ FILE: ppdet/modeling/transformers/rtdetr_transformerv3.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Modified from detrex (https://github.com/IDEA-Research/detrex) # Copyright 2022 The IDEA Authors. All rights reserved. 
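The v2 attention above can switch its sampling from bilinear grid_sample to a nearest-pixel lookup via cross_attn_sampling_method='discrete'. The following toy snippet sketches that rounding-and-gather step in isolation; it mirrors discrete_sample in ppdet/modeling/transformers/utils.py but is illustrative only, not repository code:

# Illustrative only: nearest-pixel ("discrete") sampling on toy tensors.
import paddle

x = paddle.arange(2 * 3 * 4 * 4, dtype='float32').reshape([2, 3, 4, 4])
grid = paddle.rand([2, 5, 1, 2])                    # [N, gH, gW, 2], locations in [0, 1]
N, C, H, W = x.shape
idx = (grid * paddle.to_tensor([[W, H]], dtype='float32') + 0.5)
idx = idx.astype('int64').flatten(1, 2)             # round to nearest pixel, [N, gH*gW, 2]
h = idx[:, :, 1].clip(0, H - 1)
w = idx[:, :, 0].clip(0, W - 1)
b = paddle.arange(N).unsqueeze(-1).tile([1, 5])     # [N, gH*gW] batch indices
out = x[b, :, h, w]                                 # gather: [N, gH*gW, C]
print(out.shape)                                    # [2, 5, 3]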
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from paddle.regularizer import L2Decay from ppdet.core.workspace import register from ..layers import MultiHeadAttention from ..heads.detr_head import MLP from .deformable_transformer import MSDeformableAttention from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_, bias_init_with_prob) from .utils import (_get_clones, get_sine_pos_embed, get_contrastive_denoising_training_group, inverse_sigmoid) __all__ = ['RTDETRTransformerv3'] class PPMSDeformableAttention(MSDeformableAttention): def forward(self, query, reference_points, value, value_spatial_shapes, value_level_start_index, value_mask=None): """ Args: query (Tensor): [bs, query_length, C] reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area value (Tensor): [bs, value_length, C] value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements Returns: output (Tensor): [bs, Length_{query}, C] """ bs, Len_q = query.shape[:2] Len_v = value.shape[1] value = self.value_proj(value) if value_mask is not None: value_mask = value_mask.astype(value.dtype).unsqueeze(-1) value *= value_mask value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) sampling_offsets = self.sampling_offsets(query).reshape( [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) attention_weights = self.attention_weights(query).reshape( [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) attention_weights = F.softmax(attention_weights).reshape( [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) if reference_points.shape[-1] == 2: offset_normalizer = paddle.to_tensor(value_spatial_shapes) offset_normalizer = offset_normalizer.flip([1]).reshape( [1, 1, 1, self.num_levels, 1, 2]) sampling_locations = reference_points.reshape([ bs, Len_q, 1, self.num_levels, 1, 2 ]) + sampling_offsets / offset_normalizer elif reference_points.shape[-1] == 4: sampling_locations = ( reference_points[:, :, None, :, None, :2] + sampling_offsets / self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5) else: raise ValueError( "Last dim of reference_points must be 2 or 4, but get {} instead.". 
format(reference_points.shape[-1])) if not isinstance(query, paddle.Tensor): from ppdet.modeling.transformers.utils import deformable_attention_core_func output = deformable_attention_core_func( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) else: value_spatial_shapes = paddle.to_tensor(value_spatial_shapes) value_level_start_index = paddle.to_tensor(value_level_start_index) output = self.ms_deformable_attn_core( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) output = self.output_proj(output) return output class TransformerDecoderLayer(nn.Layer): def __init__(self, d_model=256, n_head=8, dim_feedforward=1024, dropout=0., activation="relu", n_levels=4, n_points=4, weight_attr=None, bias_attr=None): super(TransformerDecoderLayer, self).__init__() # self attention self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # cross attention self.cross_attn = PPMSDeformableAttention(d_model, n_head, n_levels, n_points, 1.0) self.dropout2 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) # ffn self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, bias_attr) self.activation = getattr(F, activation) self.dropout3 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, bias_attr) self.dropout4 = nn.Dropout(dropout) self.norm3 = nn.LayerNorm( d_model, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) self._reset_parameters() def _reset_parameters(self): linear_init_(self.linear1) linear_init_(self.linear2) xavier_uniform_(self.linear1.weight) xavier_uniform_(self.linear2.weight) def with_pos_embed(self, tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, tgt): return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) def forward(self, tgt, reference_points, memory, memory_spatial_shapes, memory_level_start_index, attn_mask=None, memory_mask=None, query_pos_embed=None): # self attention q = k = self.with_pos_embed(tgt, query_pos_embed) if attn_mask is not None: attn_mask = paddle.where( attn_mask.astype('bool'), paddle.zeros(attn_mask.shape, tgt.dtype), paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)) tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask) tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) # cross attention tgt2 = self.cross_attn( self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, memory_spatial_shapes, memory_level_start_index, memory_mask) tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) # ffn tgt2 = self.forward_ffn(tgt) tgt = tgt + self.dropout4(tgt2) tgt = self.norm3(tgt) return tgt class TransformerDecoder(nn.Layer): def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1): super(TransformerDecoder, self).__init__() self.layers = _get_clones(decoder_layer, num_layers) self.hidden_dim = hidden_dim self.num_layers = num_layers self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx def forward(self, tgt, ref_points_unact, memory, memory_spatial_shapes, memory_level_start_index, bbox_head, score_head, query_pos_head, attn_mask=None, memory_mask=None, query_pos_head_inv_sig=False): output = tgt dec_out_bboxes = [] 
dec_out_logits = [] ref_points_detach = F.sigmoid(ref_points_unact) for i, layer in enumerate(self.layers): ref_points_input = ref_points_detach.unsqueeze(2) if not query_pos_head_inv_sig: query_pos_embed = query_pos_head(ref_points_detach) else: query_pos_embed = query_pos_head( inverse_sigmoid(ref_points_detach)) output = layer(output, ref_points_input, memory, memory_spatial_shapes, memory_level_start_index, attn_mask, memory_mask, query_pos_embed) inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( ref_points_detach)) if self.training: dec_out_logits.append(score_head[i](output)) if i == 0: dec_out_bboxes.append(inter_ref_bbox) else: dec_out_bboxes.append( F.sigmoid(bbox_head[i](output) + inverse_sigmoid( ref_points))) elif i == self.eval_idx: dec_out_logits.append(score_head[i](output)) dec_out_bboxes.append(inter_ref_bbox) break ref_points = inter_ref_bbox ref_points_detach = inter_ref_bbox.detach( ) if self.training else inter_ref_bbox return paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits) @register class RTDETRTransformerv3(nn.Layer): __shared__ = ['num_classes', 'hidden_dim', 'eval_size', 'o2m_branch', 'num_queries_o2m'] def __init__(self, num_classes=80, hidden_dim=256, num_queries=300, position_embed_type='sine', backbone_feat_channels=[512, 1024, 2048], feat_strides=[8, 16, 32], num_levels=3, num_decoder_points=4, nhead=8, num_decoder_layers=6, dim_feedforward=1024, dropout=0., activation="relu", num_denoising=100, label_noise_ratio=0.5, box_noise_scale=1.0, learnt_init_query=True, query_pos_head_inv_sig=False, eval_size=None, eval_idx=-1, num_noises=0, num_noise_queries=[], num_noise_denoising=100, o2m_branch=False, num_queries_o2m=450, eps=1e-2): super(RTDETRTransformerv3, self).__init__() assert position_embed_type in ['sine', 'learned'], \ f'ValueError: position_embed_type not supported {position_embed_type}!' 
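# v3 keeps one query budget per decoding group: self.num_queries[0] is
# the standard one-to-one group, followed by num_noises auxiliary
# noise-query groups and, when o2m_branch is enabled, a final
# one-to-many group of num_queries_o2m queries.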
assert len(backbone_feat_channels) <= num_levels assert len(feat_strides) == len(backbone_feat_channels) assert len(num_noise_queries) == num_noises for _ in range(num_levels - len(feat_strides)): feat_strides.append(feat_strides[-1] * 2) self.hidden_dim = hidden_dim self.nhead = nhead self.feat_strides = feat_strides self.num_levels = num_levels self.num_classes = num_classes self.num_queries = [num_queries] self.eps = eps self.num_decoder_layers = num_decoder_layers self.eval_size = eval_size self.num_noises = num_noises self.num_noise_denoising = num_noise_denoising self.num_groups = 1 if num_noises > 0: self.num_queries.extend(num_noise_queries) self.num_groups += num_noises self.o2m_branch = o2m_branch self.num_queries_o2m = num_queries_o2m if o2m_branch: self.num_queries.append(num_queries_o2m) self.num_groups += 1 # backbone feature projection self._build_input_proj_layer(backbone_feat_channels) # Transformer module decoder_layer = TransformerDecoderLayer( hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_decoder_points) self.decoder = TransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, eval_idx) # denoising part self.denoising_class_embed = nn.Embedding( num_classes, hidden_dim, weight_attr=ParamAttr(initializer=nn.initializer.Normal())) self.num_denoising = num_denoising self.label_noise_ratio = label_noise_ratio self.box_noise_scale = box_noise_scale # decoder embedding self.learnt_init_query = learnt_init_query if learnt_init_query: self.tgt_embed = nn.Embedding(num_queries, hidden_dim) self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2) self.query_pos_head_inv_sig = query_pos_head_inv_sig # encoder head self.enc_output = nn.LayerList([ nn.Sequential( nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) for _ in range(self.num_groups) ]) self.enc_score_head = nn.LayerList([ nn.Linear(hidden_dim, num_classes) for _ in range(self.num_groups) ]) self.enc_bbox_head = nn.LayerList([ MLP(hidden_dim, hidden_dim, 4, num_layers=3) for _ in range(self.num_groups) ]) self.map_memory = nn.Sequential( nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm( hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0))) ) # decoder head self.dec_score_head = nn.LayerList([ nn.Linear(hidden_dim, num_classes) for _ in range(num_decoder_layers) ]) self.dec_bbox_head = nn.LayerList([ MLP(hidden_dim, hidden_dim, 4, num_layers=3) for _ in range(num_decoder_layers) ]) self._reset_parameters() def _reset_parameters(self): # class and bbox head init bias_cls = bias_init_with_prob(0.01) for enc_score_head in self.enc_score_head: linear_init_(enc_score_head) constant_(enc_score_head.bias, bias_cls) for enc_bbox_head in self.enc_bbox_head: constant_(enc_bbox_head.layers[-1].weight) constant_(enc_bbox_head.layers[-1].bias) for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): linear_init_(cls_) constant_(cls_.bias, bias_cls) constant_(reg_.layers[-1].weight) constant_(reg_.layers[-1].bias) for enc_output in self.enc_output: linear_init_(enc_output[0]) xavier_uniform_(enc_output[0].weight) linear_init_(self.map_memory[0]) xavier_uniform_(self.map_memory[0].weight) if self.learnt_init_query: xavier_uniform_(self.tgt_embed.weight) xavier_uniform_(self.query_pos_head.layers[0].weight) xavier_uniform_(self.query_pos_head.layers[1].weight) for l in self.input_proj: xavier_uniform_(l[0].weight) # init 
encoder output anchors and valid_mask if self.eval_size: self.anchors, self.valid_mask = self._generate_anchors() @classmethod def from_config(cls, cfg, input_shape): return {'backbone_feat_channels': [i.channels for i in input_shape]} def _build_input_proj_layer(self, backbone_feat_channels): self.input_proj = nn.LayerList() for in_channels in backbone_feat_channels: self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=1, bias_attr=False)), ('norm', nn.BatchNorm2D( self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = backbone_feat_channels[-1] for _ in range(self.num_levels - len(backbone_feat_channels)): self.input_proj.append( nn.Sequential( ('conv', nn.Conv2D( in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1, bias_attr=False)), ('norm', nn.BatchNorm2D( self.hidden_dim, weight_attr=ParamAttr(regularizer=L2Decay(0.0)), bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) in_channels = self.hidden_dim def _get_encoder_input(self, feats): # get projection features proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] if self.num_levels > len(proj_feats): len_srcs = len(proj_feats) for i in range(len_srcs, self.num_levels): if i == len_srcs: proj_feats.append(self.input_proj[i](feats[-1])) else: proj_feats.append(self.input_proj[i](proj_feats[-1])) # get encoder inputs feat_flatten = [] spatial_shapes = [] level_start_index = [0, ] for i, feat in enumerate(proj_feats): _, _, h, w = feat.shape # [b, c, h, w] -> [b, h*w, c] feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) # [num_levels, 2] spatial_shapes.append([h, w]) # [l], start index of each level level_start_index.append(h * w + level_start_index[-1]) # [b, l, c] feat_flatten = paddle.concat(feat_flatten, 1) level_start_index.pop() return (feat_flatten, spatial_shapes, level_start_index) def forward(self, feats, pad_mask=None, gt_meta=None, is_teacher=False): # input projection and embedding (memory, spatial_shapes, level_start_index) = self._get_encoder_input(feats) # prepare denoising training if self.training: denoising_classes, denoising_bbox_unacts, attn_masks, dn_metas = [], [], [], [] for g_id in range(self.num_noises + 1): if g_id == 0: num_denoising = self.num_denoising else: num_denoising = self.num_noise_denoising denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ get_contrastive_denoising_training_group(gt_meta, self.num_classes, self.num_queries[g_id], self.denoising_class_embed.weight, num_denoising, self.label_noise_ratio, self.box_noise_scale) denoising_classes.append(denoising_class) denoising_bbox_unacts.append(denoising_bbox_unact) attn_masks.append(attn_mask) dn_metas.append(dn_meta) else: denoising_classes, denoising_bbox_unacts, attn_masks, dn_metas = None, None, None, None target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ self._get_decoder_input( memory, spatial_shapes, denoising_classes, denoising_bbox_unacts, is_teacher) # multi group noise attention if self.training: new_size = target.shape[1] new_attn_mask = paddle.ones([new_size, new_size]) < 0 begin, end = 0, 0 mask = None for g_id in range(self.num_groups): new_mask = paddle.rand([self.num_queries[g_id], self.num_queries[g_id]]) if self.o2m_branch and g_id == self.num_groups - 1: end = end + self.num_queries_o2m new_mask = new_mask >= 0.0 new_attn_mask[begin: end, begin: end] = new_mask else: end = end + attn_masks[g_id].shape[1] dn_size, q_size = 
dn_metas[g_id]['dn_num_split'] if g_id > 0: new_mask = new_mask > 0.1 else: new_mask = new_mask >= 0.0 attn_masks[g_id][dn_size: dn_size + q_size, dn_size: dn_size + q_size] = new_mask new_attn_mask[begin: end, begin: end] = attn_masks[g_id] begin = end attn_masks = new_attn_mask # decoder out_bboxes, out_logits = self.decoder( target, init_ref_points_unact, memory, spatial_shapes, level_start_index, self.dec_bbox_head, self.dec_score_head, self.query_pos_head, attn_mask=attn_masks, memory_mask=None, query_pos_head_inv_sig=self.query_pos_head_inv_sig) return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, dn_metas) def _generate_anchors(self, spatial_shapes=None, grid_size=0.05, dtype="float32"): if spatial_shapes is None: spatial_shapes = [ [int(self.eval_size[0] / s), int(self.eval_size[1] / s)] for s in self.feat_strides ] anchors = [] for lvl, (h, w) in enumerate(spatial_shapes): grid_y, grid_x = paddle.meshgrid( paddle.arange( end=h, dtype=dtype), paddle.arange( end=w, dtype=dtype)) grid_xy = paddle.stack([grid_x, grid_y], -1) valid_WH = paddle.to_tensor([h, w]).astype(dtype) grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) anchors.append( paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) anchors = paddle.concat(anchors, 1) valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all(-1, keepdim=True) anchors = paddle.log(anchors / (1 - anchors)) anchors = paddle.where(valid_mask, anchors, paddle.to_tensor(float("inf"))) return anchors, valid_mask def _get_decoder_input(self, memory, spatial_shapes, denoising_classes=None, denoising_bbox_unacts=None, is_teacher=False): bs, _, _ = memory.shape # prepare input for decoder if self.training or self.eval_size is None or is_teacher: anchors, valid_mask = self._generate_anchors(spatial_shapes) else: anchors, valid_mask = self.anchors, self.valid_mask memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) map_memory = self.map_memory(memory.detach()) targets, reference_points_unacts, enc_topk_bboxes, enc_topk_logits = [], [], [], [] for g_id in range(self.num_groups): output_memory = self.enc_output[g_id](memory) enc_outputs_class = self.enc_score_head[g_id](output_memory) enc_outputs_coord_unact = self.enc_bbox_head[g_id](output_memory) + anchors _, topk_ind = paddle.topk( enc_outputs_class.max(-1), self.num_queries[g_id], axis=1) # extract region proposal boxes batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype) batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries[g_id]]) topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact, topk_ind) # unsigmoided. 
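# Each group g_id selects its own top num_queries[g_id] encoder proposals
# from a group-specific score/bbox head; denoising queries are then
# prepended for every group except the optional one-to-many branch.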
enc_topk_bbox = F.sigmoid(reference_points_unact) enc_topk_logit = paddle.gather_nd(enc_outputs_class, topk_ind) if denoising_bbox_unacts is not None and not (self.o2m_branch and g_id == self.num_groups - 1): reference_points_unact = paddle.concat( [denoising_bbox_unacts[g_id], reference_points_unact], 1) if self.training: reference_points_unact = reference_points_unact.detach() # extract region features if self.learnt_init_query: target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) else: if g_id == 0: target = paddle.gather_nd(output_memory, topk_ind) if self.training: target = target.detach() else: target = paddle.gather_nd(map_memory, topk_ind) if denoising_classes is not None and not (self.o2m_branch and g_id == self.num_groups - 1): target = paddle.concat([denoising_classes[g_id], target], 1) if not self.training: return target, reference_points_unact, enc_topk_bbox, enc_topk_logit targets.append(target) reference_points_unacts.append(reference_points_unact) enc_topk_bboxes.append(enc_topk_bbox) enc_topk_logits.append(enc_topk_logit) targets = paddle.concat(targets, 1) reference_points_unacts = paddle.concat(reference_points_unacts, 1) enc_topk_bboxes = paddle.concat(enc_topk_bboxes, 1) enc_topk_logits = paddle.concat(enc_topk_logits, 1) return targets, reference_points_unacts, enc_topk_bboxes, enc_topk_logits ================================================ FILE: ppdet/modeling/transformers/utils.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Modified from DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved # Modified from detrex (https://github.com/IDEA-Research/detrex) # Copyright 2022 The IDEA Authors. All rights reserved. 
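The helpers in this file underpin the logit-space box parameterization used by every decoder above. A minimal sketch of that round trip, inlining inverse_sigmoid from below so the snippet is self-contained (illustrative only):

# Illustrative only: logit-space box refinement as used by the decoders.
import paddle
import paddle.nn.functional as F

def inverse_sigmoid(x, eps=1e-5):
    # same definition as later in this file
    x = x.clip(min=0., max=1.)
    return paddle.log(x.clip(min=eps) / (1 - x).clip(min=eps))

ref = paddle.to_tensor([[0.25, 0.50, 0.20, 0.30]])     # cxcywh box in (0, 1)
delta = paddle.to_tensor([[0.10, -0.20, 0.00, 0.05]])  # a head's raw (logit) output
new_ref = F.sigmoid(delta + inverse_sigmoid(ref))      # refined box, still in (0, 1)
print(new_ref)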
from __future__ import absolute_import from __future__ import division from __future__ import print_function import copy import math import paddle import paddle.nn as nn import paddle.nn.functional as F from ..bbox_utils import bbox_overlaps __all__ = [ '_get_clones', 'bbox_overlaps', 'bbox_cxcywh_to_xyxy', 'bbox_xyxy_to_cxcywh', 'sigmoid_focal_loss', 'inverse_sigmoid', 'deformable_attention_core_func', 'varifocal_loss_with_logits' ] def _get_clones(module, N): return nn.LayerList([copy.deepcopy(module) for _ in range(N)]) def bbox_cxcywh_to_xyxy(x): cxcy, wh = paddle.split(x, 2, axis=-1) return paddle.concat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1) def bbox_xyxy_to_cxcywh(x): x1, y1, x2, y2 = x.split(4, axis=-1) return paddle.concat([(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1), (y2 - y1)], axis=-1) def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0): prob = F.sigmoid(logit) ce_loss = F.binary_cross_entropy_with_logits(logit, label, reduction="none") p_t = prob * label + (1 - prob) * (1 - label) loss = ce_loss * ((1 - p_t)**gamma) if alpha >= 0: alpha_t = alpha * label + (1 - alpha) * (1 - label) loss = alpha_t * loss return loss.mean(1).sum() / normalizer def inverse_sigmoid(x, eps=1e-5): x = x.clip(min=0., max=1.) return paddle.log(x.clip(min=eps) / (1 - x).clip(min=eps)) def deformable_attention_core_func(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights): """ Args: value (Tensor): [bs, value_length, n_head, c] value_spatial_shapes (Tensor|List): [n_levels, 2] value_level_start_index (Tensor|List): [n_levels] sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2] attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points] Returns: output (Tensor): [bs, Length_{query}, C] """ bs, _, n_head, c = value.shape _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape split_shape = [h * w for h, w in value_spatial_shapes] value_list = value.split(split_shape, axis=1) sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for level, (h, w) in enumerate(value_spatial_shapes): # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ value_l_ = value_list[level].flatten(2).transpose([0, 2, 1]).reshape( [bs * n_head, c, h, w]) # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 sampling_grid_l_ = sampling_grids[:, :, :, level].transpose([0, 2, 1, 3, 4]).flatten(0, 1) # N_*M_, D_, Lq_, P_ sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, mode='bilinear', padding_mode='zeros', align_corners=False) sampling_value_list.append(sampling_value_l_) # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_) attention_weights = attention_weights.transpose([0, 2, 1, 3, 4]).reshape( [bs * n_head, 1, Len_q, n_levels * n_points]) output = (paddle.stack(sampling_value_list, axis=-2).flatten(-2) * attention_weights).sum(-1).reshape([bs, n_head * c, Len_q]) return output.transpose([0, 2, 1]) def discrete_sample(x, grid): """ Args: x (Tensor): [N, C, H, W] grid (Tensor): [N, grid_H, grid_W, 2] Returns: output (Tensor): [N, C, grid_H, grid_W] """ N, C, H, W = x.shape _, grid_H, grid_W, _ = grid.shape spatial_shape = paddle.to_tensor([[W, H]], dtype=paddle.float32) index = (grid * spatial_shape + 0.5).astype(paddle.int64).flatten(1, 2) h_index = index[:, :, 1].clip(0, H - 1) w_index = index[:, :, 0].clip(0, W - 1) batch_index = paddle.arange(N).unsqueeze(-1).tile([1, grid_H * grid_W]) output = x[batch_index, :, h_index, w_index] 
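# (editor's note) the advanced indexing above gathers, per batch element, the C-channel feature vector at every rounded (h, w) sample location, yielding a [N, grid_H * grid_W, C] tensor that the transpose/reshape below turns back into [N, C, grid_H, grid_W].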
output = output.transpose([0, 2, 1]).reshape([N, C, grid_H, grid_W]) return output def deformable_attention_core_func_v2(value, value_spatial_shapes, sampling_locations, attention_weights, num_points_list, sampling_method='default'): """ Args: value (Tensor): [batch_num, value_len, num_heads, head_dim] value_spatial_shapes (Tensor|List): [n_levels, 2] sampling_locations (Tensor): [batch_num, query_len, num_heads, total_num_points, 2] attention_weights (Tensor): [batch_num, query_len, num_heads, total_num_points] num_points_list (List): The number of sampling point corresponding to each level sampling_method (str): default(grid_sample) or discrete(discrete_sample) Returns: output (Tensor): [batch_num, query_len, num_heads * head_dim] """ assert sampling_method in ['default', 'discrete'], NotImplementedError batch_num, _, num_heads, head_dim = value.shape query_len = sampling_locations.shape[1] num_levels = len(num_points_list) value = value.transpose([0, 2, 3, 1]).flatten(0, 1) split_shape = [h * w for h, w in value_spatial_shapes] value_list = value.split(split_shape, axis=-1) value_list = [ value.reshape([batch_num * num_heads, head_dim, h, w]) for value, (h, w) in zip(value_list, value_spatial_shapes) ] if sampling_method == 'default': sampling_grids = 2 * sampling_locations - 1 else: sampling_grids = sampling_locations sampling_grids = sampling_grids.transpose([0, 2, 1, 3, 4]).flatten(0, 1) sampling_grids_list = sampling_grids.split(num_points_list, axis=-2) sampling_value_list = [] for idx in range(num_levels): # value_list[idx]: [batch_num * num_heads, head_dim, h, w] # sampling_grids_list[idx]: [batch_num * num_heads, query_len, num_points, 2] # _sampling_value: [batch_num * num_heads, head_dim, query_len, num_points] if sampling_method == 'default': _sampling_value = F.grid_sample(value_list[idx], sampling_grids_list[idx], mode='bilinear', padding_mode='zeros', align_corners=False) else: _sampling_value = discrete_sample(value_list[idx], sampling_grids_list[idx]) sampling_value_list.append(_sampling_value) attn_weights = attention_weights.transpose([0, 2, 1, 3]) attn_weights = attn_weights.flatten(0, 1).unsqueeze(1) sampling_value = paddle.concat(sampling_value_list, axis=-1) # attn_weights: [batch_num * num_heads, 1, query_len, total_num_points] # sampling_value: [batch_num * num_heads, head_dim, query_len, total_num_points] # output: [batch_num * num_heads, head_dim, query_len] output = (sampling_value * attn_weights).sum(-1) output = output.reshape([batch_num, num_heads * head_dim, query_len]) return output.transpose([0, 2, 1]) def get_valid_ratio(mask): _, H, W = mask.shape valid_ratio_h = paddle.sum(mask[:, :, 0], 1) / H valid_ratio_w = paddle.sum(mask[:, 0, :], 1) / W # [b, 2] return paddle.stack([valid_ratio_w, valid_ratio_h], -1) def get_denoising_training_group(targets, num_classes, num_queries, class_embed, num_denoising=100, label_noise_ratio=0.5, box_noise_scale=1.0): if num_denoising <= 0: return None, None, None, None num_gts = [len(t) for t in targets["gt_class"]] max_gt_num = max(num_gts) if max_gt_num == 0: return None, None, None, None num_group = num_denoising // max_gt_num num_group = 1 if num_group == 0 else num_group # pad gt to max_num of a batch bs = len(targets["gt_class"]) input_query_class = paddle.full([bs, max_gt_num], num_classes, dtype='int32') input_query_bbox = paddle.zeros([bs, max_gt_num, 4]) pad_gt_mask = paddle.zeros([bs, max_gt_num]) for i in range(bs): num_gt = num_gts[i] if num_gt > 0: input_query_class[i, :num_gt] = 
targets["gt_class"][i].squeeze(-1) input_query_bbox[i, :num_gt] = targets["gt_bbox"][i] pad_gt_mask[i, :num_gt] = 1 input_query_class = input_query_class.tile([1, num_group]) input_query_bbox = input_query_bbox.tile([1, num_group, 1]) pad_gt_mask = pad_gt_mask.tile([1, num_group]) dn_positive_idx = paddle.nonzero(pad_gt_mask)[:, 1] dn_positive_idx = paddle.split(dn_positive_idx, [n * num_group for n in num_gts]) # total denoising queries num_denoising = int(max_gt_num * num_group) if label_noise_ratio > 0: input_query_class = input_query_class.flatten() pad_gt_mask = pad_gt_mask.flatten() # half of bbox prob, cast mask from bool to float bacause dtype promotaion # between bool and float is not supported in static mode. mask = paddle.cast( paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5), paddle.float32) chosen_idx = paddle.nonzero(mask * pad_gt_mask).squeeze(-1) # randomly put a new one here new_label = paddle.randint_like(chosen_idx, 0, num_classes, dtype=input_query_class.dtype) input_query_class.scatter_(chosen_idx, new_label) input_query_class.reshape_([bs, num_denoising]) pad_gt_mask.reshape_([bs, num_denoising]) if box_noise_scale > 0: diff = paddle.concat( [input_query_bbox[..., 2:] * 0.5, input_query_bbox[..., 2:]], axis=-1) * box_noise_scale diff *= (paddle.rand(input_query_bbox.shape) * 2.0 - 1.0) input_query_bbox += diff input_query_bbox = inverse_sigmoid(input_query_bbox) class_embed = paddle.concat( [class_embed, paddle.zeros([1, class_embed.shape[-1]])]) input_query_class = paddle.gather(class_embed, input_query_class.flatten(), axis=0).reshape([bs, num_denoising, -1]) tgt_size = num_denoising + num_queries attn_mask = paddle.ones([tgt_size, tgt_size]) < 0 # match query cannot see the reconstruction attn_mask[num_denoising:, :num_denoising] = True # reconstruct cannot see each other for i in range(num_group): if i == 0: attn_mask[max_gt_num * i:max_gt_num * (i + 1), max_gt_num * (i + 1):num_denoising] = True if i == num_group - 1: attn_mask[max_gt_num * i:max_gt_num * (i + 1), :max_gt_num * i] = True else: attn_mask[max_gt_num * i:max_gt_num * (i + 1), max_gt_num * (i + 1):num_denoising] = True attn_mask[max_gt_num * i:max_gt_num * (i + 1), :max_gt_num * i] = True attn_mask = ~attn_mask dn_meta = { "dn_positive_idx": dn_positive_idx, "dn_num_group": num_group, "dn_num_split": [num_denoising, num_queries] } return input_query_class, input_query_bbox, attn_mask, dn_meta def get_contrastive_denoising_training_group(targets, num_classes, num_queries, class_embed, num_denoising=100, label_noise_ratio=0.5, box_noise_scale=1.0): if num_denoising <= 0: return None, None, None, None # listcomp is not well-supported in SOT mode for now. num_gts = [] for t in targets["gt_class"]: num_gts.append(len(t)) max_gt_num = max(num_gts) if max_gt_num == 0: return None, None, None, None num_group = num_denoising // max_gt_num num_group = 1 if num_group == 0 else num_group # pad gt to max_num of a batch bs = len(targets["gt_class"]) input_query_class = paddle.full([bs, max_gt_num], num_classes, dtype='int32') input_query_bbox = paddle.zeros([bs, max_gt_num, 4]) pad_gt_mask = paddle.zeros([bs, max_gt_num]) for i in range(bs): num_gt = num_gts[i] if num_gt > 0: input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1) input_query_bbox[i, :num_gt] = targets["gt_bbox"][i] pad_gt_mask[i, :num_gt] = 1 # each group has positive and negative queries. 
input_query_class = input_query_class.tile([1, 2 * num_group]) input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1]) pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group]) # positive and negative mask negative_gt_mask = paddle.zeros([bs, max_gt_num * 2, 1]) negative_gt_mask[:, max_gt_num:] = 1 negative_gt_mask = negative_gt_mask.tile([1, num_group, 1]) positive_gt_mask = 1 - negative_gt_mask # contrastive denoising training positive index positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask dn_positive_idx = paddle.nonzero(positive_gt_mask)[:, 1] dn_positive_idx = paddle.split(dn_positive_idx, [n * num_group for n in num_gts]) # total denoising queries num_denoising = int(max_gt_num * 2 * num_group) if label_noise_ratio > 0: input_query_class = input_query_class.flatten() pad_gt_mask = pad_gt_mask.flatten() # half of bbox prob mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5) chosen_idx = paddle.nonzero(mask.cast(pad_gt_mask.dtype) * pad_gt_mask).squeeze(-1) # randomly put a new one here new_label = paddle.randint_like(chosen_idx, 0, num_classes, dtype=input_query_class.dtype) input_query_class.scatter_(chosen_idx, new_label) input_query_class.reshape_([bs, num_denoising]) pad_gt_mask.reshape_([bs, num_denoising]) if box_noise_scale > 0: known_bbox = bbox_cxcywh_to_xyxy(input_query_bbox) diff = paddle.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale rand_sign = paddle.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0 rand_part = paddle.rand(input_query_bbox.shape) rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * ( 1 - negative_gt_mask) rand_part *= rand_sign known_bbox += rand_part * diff known_bbox.clip_(min=0.0, max=1.0) input_query_bbox = bbox_xyxy_to_cxcywh(known_bbox) input_query_bbox = inverse_sigmoid(input_query_bbox) class_embed = paddle.concat( [class_embed, paddle.zeros([1, class_embed.shape[-1]])]) input_query_class = paddle.gather(class_embed, input_query_class.flatten(), axis=0).reshape([bs, num_denoising, -1]) tgt_size = num_denoising + num_queries attn_mask = paddle.ones([tgt_size, tgt_size]) < 0 # match query cannot see the reconstruction attn_mask[num_denoising:, :num_denoising] = True # reconstruct cannot see each other for i in range(num_group): if i == 0: attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1):num_denoising] = True if i == num_group - 1: attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num * i * 2] = True else: attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1):num_denoising] = True attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num * 2 * i] = True attn_mask = ~attn_mask dn_meta = { "dn_positive_idx": dn_positive_idx, "dn_num_group": num_group, "dn_num_split": [num_denoising, num_queries] } return input_query_class, input_query_bbox, attn_mask, dn_meta def get_sine_pos_embed(pos_tensor, num_pos_feats=128, temperature=10000, exchange_xy=True): """generate sine position embedding from a position tensor Args: pos_tensor (Tensor): Shape as `(None, n)`. num_pos_feats (int): projected shape for each float in the tensor. Default: 128 temperature (int): The temperature used for scaling the position embedding. Default: 10000. exchange_xy (bool, optional): exchange pos x and pos y. \ For example, input tensor is `[x, y]`, the results will # noqa be `[pos(y), pos(x)]`. Defaults: True. Returns: Tensor: Returned position embedding # noqa with shape `(None, n * num_pos_feats)`. """ scale = 2. * math.pi dim_t = 2. 
* paddle.floor_divide(paddle.arange(num_pos_feats), paddle.to_tensor(2)) dim_t = scale / temperature**(dim_t / num_pos_feats) def sine_func(x): x *= dim_t return paddle.stack((x[:, :, 0::2].sin(), x[:, :, 1::2].cos()), axis=3).flatten(2) pos_res = [sine_func(x) for x in pos_tensor.split(pos_tensor.shape[-1], -1)] if exchange_xy: pos_res[0], pos_res[1] = pos_res[1], pos_res[0] pos_res = paddle.concat(pos_res, axis=2) return pos_res def mask_to_box_coordinate(mask, normalize=False, format="xyxy", dtype="float32"): """ Compute the bounding boxes around the provided mask. Args: mask (Tensor:bool): [b, c, h, w] Returns: bbox (Tensor): [b, c, 4] """ assert mask.ndim == 4 assert format in ["xyxy", "xywh"] h, w = mask.shape[-2:] y, x = paddle.meshgrid(paddle.arange(end=h, dtype=dtype), paddle.arange(end=w, dtype=dtype)) x_mask = x * mask.astype(x.dtype) x_max = x_mask.flatten(-2).max(-1) + 1 x_min = paddle.where(mask.astype(bool), x_mask, paddle.to_tensor(1e8)).flatten(-2).min(-1) y_mask = y * mask.astype(y.dtype) y_max = y_mask.flatten(-2).max(-1) + 1 y_min = paddle.where(mask.astype(bool), y_mask, paddle.to_tensor(1e8)).flatten(-2).min(-1) out_bbox = paddle.stack([x_min, y_min, x_max, y_max], axis=-1) mask = mask.any(axis=[2, 3]).unsqueeze(2) out_bbox = out_bbox * mask.astype(out_bbox.dtype) if normalize: out_bbox /= paddle.to_tensor([w, h, w, h]).astype(dtype) return out_bbox if format == "xyxy" else bbox_xyxy_to_cxcywh(out_bbox) def varifocal_loss_with_logits(pred_logits, gt_score, label, normalizer=1.0, alpha=0.75, gamma=2.0): pred_score = F.sigmoid(pred_logits) weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label loss = F.binary_cross_entropy_with_logits(pred_logits, gt_score, weight=weight, reduction='none') return loss.mean(1).sum() / normalizer ================================================ FILE: ppdet/optimizer/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import optimizer from . import ema from .optimizer import * from .ema import * ================================================ FILE: ppdet/optimizer/adamw.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
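# (Editor's sketch, not part of the original file.) layerwise_lr_decay below
# scales each transformer layer's learning rate as
#     lr_m = base_lr * decay_rate ** (n_layers - m),
# so shallow layers take smaller steps than deep ones, and embedding-like
# parameters (cls_token / patch_embed / pos_embed) use the smallest ratio,
# decay_rate ** (n_layers + 1). For decay_rate = 0.65 and n_layers = 12:
#
#     ratios = [0.65 ** (12 - m) for m in range(12)]
#     # ratios[0] ~= 0.0057 for the first block, ratios[11] = 0.65 for the last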
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from paddle.optimizer import AdamW from functools import partial import re # major versions above 2 (and dev builds, which report major == 0) also count as later than 2.4 IS_PADDLE_LATER_2_4 = int(paddle.version.major) > 2 or ( int(paddle.version.major) == 2 and int(paddle.version.minor) >= 4) or int(paddle.version.major) == 0 def layerwise_lr_decay(decay_rate, name_dict, n_layers, param): """ Args: decay_rate (float): The layer-wise decay ratio. name_dict (dict): The keys of name_dict are the dynamic names of the model, while the values are the static names. Use model.named_parameters() to get name_dict. n_layers (int): Total number of layers in the transformer encoder. """ ratio = 1.0 static_name = name_dict[param.name] if 'blocks.' in static_name or 'layers.' in static_name: idx_1 = static_name.find('blocks.') idx_2 = static_name.find('layers.') assert any([x >= 0 for x in [idx_1, idx_2]]), '' idx = idx_1 if idx_1 >= 0 else idx_2 # idx = re.findall('[blocks|layers]\.(\d+)\.', static_name)[0] layer = int(static_name[idx:].split('.')[1]) ratio = decay_rate**(n_layers - layer) elif 'cls_token' in static_name or 'patch_embed' in static_name or 'pos_embed' in static_name: ratio = decay_rate**(n_layers + 1) if IS_PADDLE_LATER_2_4: return ratio else: param.optimize_attr['learning_rate'] *= ratio class AdamWDL(AdamW): r""" The AdamWDL optimizer is implemented based on the AdamW Optimization with dynamic lr setting. Generally it's used for transformer models. We use "layerwise_lr_decay" as the default dynamic lr setting method of AdamWDL. “Layer-wise decay” means exponentially decaying the learning rates of individual layers in a top-down manner. For example, suppose the 24-th layer uses a learning rate l, and the layer-wise decay rate is α, then the learning rate of layer m is lα^(24-m). See more details on: https://arxiv.org/abs/1906.08237. .. math:: & t = t + 1 & moment\_1\_out = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad & moment\_2\_out = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad & learning\_rate = learning\_rate * \frac{\sqrt{1 - {\beta}_2^t}}{1 - {\beta}_1^t} & param\_out = param - learning\_rate * (\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) Args: learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``. It can be a float value or a LRScheduler. The default value is 0.001. beta1 (float, optional): The exponential decay rate for the 1st moment estimates. It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.9. beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.999. epsilon (float, optional): A small float value for numerical stability. It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 1e-08. parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. This parameter is required in dygraph mode. The default value is None in static mode, at this time all parameters will be updated. weight_decay (float, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01. apply_decay_param_fun (function|None, optional): If it is not None, only tensors that make apply_decay_param_fun(Tensor.name)==True will be updated. It only works when we want to specify tensors. Default: None.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of some derived class of ``GradientClipBase`` . There are three clipping strategies ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators. The accumulators are updated at every step. Every element of the two moving-averages is updated in both dense mode and sparse mode. If the size of parameter is very large, then the update may be very slow. The lazy mode only updates the elements that have gradients in the current mini-batch, so it will be much faster. But this mode has different semantics from the original Adam algorithm and may lead to different results. The default value is False. multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is False. layerwise_decay (float, optional): The layer-wise decay ratio. Defaults to 1.0. n_layers (int, optional): The total number of encoder layers. Defaults to 12. set_param_lr_fun (function|None, optional): If it's not None, set_param_lr_fun() will set the parameter learning rate before it executes the Adam operator. Defaults to :ref:`layerwise_lr_decay`. name_dict (dict, optional): The keys of name_dict are the dynamic names of the model, while the values are the static names. Use model.named_parameters() to get name_dict. name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. The default value is None. Examples: .. code-block:: python import paddle from paddlenlp.ops.optimizer import AdamWDL def simple_lr_setting(decay_rate, name_dict, n_layers, param): ratio = 1.0 static_name = name_dict[param.name] if "weight" in static_name: ratio = decay_rate**0.5 param.optimize_attr["learning_rate"] *= ratio linear = paddle.nn.Linear(10, 10) name_dict = dict() for n, p in linear.named_parameters(): name_dict[p.name] = n inp = paddle.rand([10,10], dtype="float32") out = linear(inp) loss = paddle.mean(out) adamwdl = AdamWDL( learning_rate=1e-4, parameters=linear.parameters(), set_param_lr_fun=simple_lr_setting, layerwise_decay=0.8, name_dict=name_dict) loss.backward() adamwdl.step() adamwdl.clear_grad() """ def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, parameters=None, weight_decay=0.01, apply_decay_param_fun=None, grad_clip=None, lazy_mode=False, multi_precision=False, layerwise_decay=1.0, n_layers=12, set_param_lr_func=None, name_dict=None, name=None): if not isinstance(layerwise_decay, float): raise TypeError("coeff should be float or Tensor.") self.layerwise_decay = layerwise_decay self.n_layers = n_layers self.set_param_lr_func = partial( set_param_lr_func, layerwise_decay, name_dict, n_layers) if set_param_lr_func is not None else set_param_lr_func if IS_PADDLE_LATER_2_4: super(AdamWDL, self).__init__( learning_rate=learning_rate, parameters=parameters, beta1=beta1, beta2=beta2, epsilon=epsilon, grad_clip=grad_clip, name=name, apply_decay_param_fun=apply_decay_param_fun, weight_decay=weight_decay, lazy_mode=lazy_mode, multi_precision=multi_precision, lr_ratio=self.set_param_lr_func) else: super(AdamWDL, self).__init__( learning_rate=learning_rate, parameters=parameters, beta1=beta1, beta2=beta2, epsilon=epsilon, grad_clip=grad_clip, name=name, apply_decay_param_fun=apply_decay_param_fun,
weight_decay=weight_decay, lazy_mode=lazy_mode, multi_precision=multi_precision) def _append_optimize_op(self, block, param_and_grad): if self.set_param_lr_func is None: return super(AdamWDL, self)._append_optimize_op(block, param_and_grad) self._append_decoupled_weight_decay(block, param_and_grad) prev_lr = param_and_grad[0].optimize_attr["learning_rate"] self.set_param_lr_func(param_and_grad[0]) # execute Adam op res = super(AdamW, self)._append_optimize_op(block, param_and_grad) param_and_grad[0].optimize_attr["learning_rate"] = prev_lr return res if not IS_PADDLE_LATER_2_4: AdamWDL._append_optimize_op = _append_optimize_op def build_adamwdl(model, lr=1e-4, weight_decay=0.05, betas=(0.9, 0.999), layer_decay=0.65, num_layers=None, filter_bias_and_bn=True, skip_decay_names=None, set_param_lr_func='layerwise_lr_decay'): if skip_decay_names and filter_bias_and_bn: decay_dict = { param.name: not (len(param.shape) == 1 or name.endswith('.bias') or any([_n in name for _n in skip_decay_names])) for name, param in model.named_parameters() } parameters = [p for p in model.parameters()] else: parameters = model.parameters() # no skip list given: no per-parameter weight-decay filtering decay_dict = None opt_args = dict( parameters=parameters, learning_rate=lr, weight_decay=weight_decay) if decay_dict is not None: opt_args['apply_decay_param_fun'] = lambda n: decay_dict[n] if isinstance(set_param_lr_func, str): set_param_lr_func = eval(set_param_lr_func) opt_args['set_param_lr_func'] = set_param_lr_func opt_args['beta1'] = betas[0] opt_args['beta2'] = betas[1] opt_args['layerwise_decay'] = layer_decay name_dict = {p.name: n for n, p in model.named_parameters()} opt_args['name_dict'] = name_dict opt_args['n_layers'] = num_layers optimizer = AdamWDL(**opt_args) return optimizer ================================================ FILE: ppdet/optimizer/ema.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import paddle import weakref from copy import deepcopy from .utils import get_bn_running_state_names __all__ = ['ModelEMA', 'SimpleModelEMA'] class ModelEMA(object): """ Exponential Weighted Average for Deep Neural Networks Args: model (nn.Layer): The detector model. decay (float): The decay used for updating the ema parameters. EMA parameters are updated with the formula: `ema_param = decay * ema_param + (1 - decay) * cur_param`. Default is 0.9998. ema_decay_type (str): type in ['threshold', 'normal', 'exponential'], 'threshold' as default. cycle_epoch (int): The epoch interval to reset ema_param and step. Default is -1, which means no reset. It adds a regularizing effect to EMA; the value is set empirically and is effective when the total number of training epochs is large. ema_black_list (set|list|tuple, optional): The custom EMA black_list. Blacklist of weight names that will not participate in EMA calculation. Default: None.
""" def __init__(self, model, decay=0.9998, ema_decay_type='threshold', cycle_epoch=-1, ema_black_list=None, ema_filter_no_grad=False): self.step = 0 self.epoch = 0 self.decay = decay self.ema_decay_type = ema_decay_type self.cycle_epoch = cycle_epoch self.ema_black_list = self._match_ema_black_list( model.state_dict().keys(), ema_black_list) bn_states_names = get_bn_running_state_names(model) if ema_filter_no_grad: for n, p in model.named_parameters(): if p.stop_gradient and n not in bn_states_names: self.ema_black_list.add(n) self.state_dict = dict() for k, v in model.state_dict().items(): if k in self.ema_black_list: self.state_dict[k] = v else: self.state_dict[k] = paddle.zeros_like(v, dtype='float32') self._model_state = { k: weakref.ref(p) for k, p in model.state_dict().items() } def reset(self): self.step = 0 self.epoch = 0 for k, v in self.state_dict.items(): if k in self.ema_black_list: self.state_dict[k] = v else: self.state_dict[k] = paddle.zeros_like(v) def resume(self, state_dict, step=0): for k, v in state_dict.items(): if k in self.state_dict: if self.state_dict[k].dtype == v.dtype: self.state_dict[k] = v else: self.state_dict[k] = v.astype(self.state_dict[k].dtype) self.step = step def update(self, model=None): if self.ema_decay_type == 'threshold': decay = min(self.decay, (1 + self.step) / (10 + self.step)) elif self.ema_decay_type == 'exponential': decay = self.decay * (1 - math.exp(-(self.step + 1) / 2000)) else: decay = self.decay self._decay = decay if model is not None: model_dict = model.state_dict() else: model_dict = {k: p() for k, p in self._model_state.items()} assert all( [v is not None for _, v in model_dict.items()]), 'python gc.' for k, v in self.state_dict.items(): if k not in self.ema_black_list: v = decay * v + (1 - decay) * model_dict[k].astype('float32') v.stop_gradient = True self.state_dict[k] = v self.step += 1 def apply(self): if self.step == 0: return self.state_dict state_dict = dict() model_dict = {k: p() for k, p in self._model_state.items()} for k, v in self.state_dict.items(): if k in self.ema_black_list: v.stop_gradient = True state_dict[k] = v else: if self.ema_decay_type != 'exponential': v = v / (1 - self._decay**self.step) v = v.astype(model_dict[k].dtype) v.stop_gradient = True state_dict[k] = v self.epoch += 1 if self.cycle_epoch > 0 and self.epoch == self.cycle_epoch: self.reset() return state_dict def _match_ema_black_list(self, weight_name, ema_black_list=None): out_list = set() if ema_black_list: for name in weight_name: for key in ema_black_list: if key in name: out_list.add(name) return out_list class SimpleModelEMA(object): """ Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models Keep a moving average of everything in the model state_dict (parameters and buffers). This is intended to allow functionality like https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage A smoothed version of the weights is necessary for some training schemes to perform well. This class is sensitive where it is initialized in the sequence of model init, GPU assignment and distributed training wrappers. """ def __init__(self, model=None, decay=0.9996): """ Args: model (nn.Module): model to apply EMA. decay (float): ema decay reate. 
""" self.model = deepcopy(model) self.decay = decay def update(self, model, decay=None): if decay is None: decay = self.decay with paddle.no_grad(): state = {} msd = model.state_dict() for k, v in self.model.state_dict().items(): if paddle.is_floating_point(v): v *= decay v += (1.0 - decay) * msd[k].detach() state[k] = v self.model.set_state_dict(state) def resume(self, state_dict, step=0): state = {} msd = state_dict for k, v in self.model.state_dict().items(): if paddle.is_floating_point(v): v = msd[k].detach() state[k] = v self.model.set_state_dict(state) self.step = step ================================================ FILE: ppdet/optimizer/optimizer.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import re import sys import math import paddle import paddle.nn as nn import paddle.optimizer as optimizer import paddle.regularizer as regularizer from ppdet.core.workspace import register, serializable import copy from .adamw import AdamWDL, build_adamwdl __all__ = ['LearningRate', 'OptimizerBuilder'] from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @serializable class CosineDecay(object): """ Cosine learning rate decay Args: max_epochs (int): max epochs for the training process. if you commbine cosine decay with warmup, it is recommended that the max_iters is much larger than the warmup iter use_warmup (bool): whether to use warmup. Default: True. min_lr_ratio (float): minimum learning rate ratio. Default: 0. last_plateau_epochs (int): use minimum learning rate in the last few epochs. Default: 0. 
""" def __init__(self, max_epochs=1000, use_warmup=True, min_lr_ratio=0., last_plateau_epochs=0): self.max_epochs = max_epochs self.use_warmup = use_warmup self.min_lr_ratio = min_lr_ratio self.last_plateau_epochs = last_plateau_epochs def __call__(self, base_lr=None, boundary=None, value=None, step_per_epoch=None): assert base_lr is not None, "either base LR or values should be provided" max_iters = self.max_epochs * int(step_per_epoch) last_plateau_iters = self.last_plateau_epochs * int(step_per_epoch) min_lr = base_lr * self.min_lr_ratio if boundary is not None and value is not None and self.use_warmup: # use warmup warmup_iters = len(boundary) for i in range(int(boundary[-1]), max_iters): boundary.append(i) if i < max_iters - last_plateau_iters: decayed_lr = min_lr + (base_lr - min_lr) * 0.5 * (math.cos( (i - warmup_iters) * math.pi / (max_iters - warmup_iters - last_plateau_iters)) + 1) value.append(decayed_lr) else: value.append(min_lr) return optimizer.lr.PiecewiseDecay(boundary, value) elif last_plateau_iters > 0: # not use warmup, but set `last_plateau_epochs` > 0 boundary = [] value = [] for i in range(max_iters): if i < max_iters - last_plateau_iters: decayed_lr = min_lr + (base_lr - min_lr) * 0.5 * (math.cos( i * math.pi / (max_iters - last_plateau_iters)) + 1) value.append(decayed_lr) else: value.append(min_lr) if i > 0: boundary.append(i) return optimizer.lr.PiecewiseDecay(boundary, value) return optimizer.lr.CosineAnnealingDecay( base_lr, T_max=max_iters, eta_min=min_lr) @serializable class PiecewiseDecay(object): """ Multi step learning rate decay Args: gamma (float | list): decay factor milestones (list): steps at which to decay learning rate """ def __init__(self, gamma=[0.1, 0.01], milestones=[8, 11], values=None, use_warmup=True): super(PiecewiseDecay, self).__init__() if type(gamma) is not list: self.gamma = [] for i in range(len(milestones)): self.gamma.append(gamma / 10**i) else: self.gamma = gamma self.milestones = milestones self.values = values self.use_warmup = use_warmup def __call__(self, base_lr=None, boundary=None, value=None, step_per_epoch=None): if boundary is not None and self.use_warmup: boundary.extend([int(step_per_epoch) * i for i in self.milestones]) else: # do not use LinearWarmup boundary = [int(step_per_epoch) * i for i in self.milestones] value = [base_lr] # during step[0, boundary[0]] is base_lr # self.values is setted directly in config if self.values is not None: assert len(self.milestones) + 1 == len(self.values) return optimizer.lr.PiecewiseDecay(boundary, self.values) # value is computed by self.gamma value = value if value is not None else [base_lr] for i in self.gamma: value.append(base_lr * i) return optimizer.lr.PiecewiseDecay(boundary, value) @serializable class LinearWarmup(object): """ Warm up learning rate linearly Args: steps (int): warm up steps start_factor (float): initial learning rate factor epochs (int|None): use epochs as warm up steps, the priority of `epochs` is higher than `steps`. Default: None. """ def __init__(self, steps=500, start_factor=1. 
/ 3, epochs=None, epochs_first=True): super(LinearWarmup, self).__init__() self.steps = steps self.start_factor = start_factor self.epochs = epochs self.epochs_first = epochs_first def __call__(self, base_lr, step_per_epoch): boundary = [] value = [] if self.epochs_first and self.epochs is not None: warmup_steps = self.epochs * step_per_epoch else: warmup_steps = self.steps warmup_steps = max(warmup_steps, 1) for i in range(warmup_steps + 1): if warmup_steps > 0: alpha = i / warmup_steps factor = self.start_factor * (1 - alpha) + alpha lr = base_lr * factor value.append(lr) if i > 0: boundary.append(i) return boundary, value @serializable class ExpWarmup(object): """ Warm up learning rate in exponential mode Args: steps (int): warm up steps. epochs (int|None): use epochs as warm up steps, the priority of `epochs` is higher than `steps`. Default: None. power (int): Exponential coefficient. Default: 2. """ def __init__(self, steps=1000, epochs=None, power=2): super(ExpWarmup, self).__init__() self.steps = steps self.epochs = epochs self.power = power def __call__(self, base_lr, step_per_epoch): boundary = [] value = [] warmup_steps = self.epochs * step_per_epoch if self.epochs is not None else self.steps warmup_steps = max(warmup_steps, 1) for i in range(warmup_steps + 1): factor = (i / float(warmup_steps))**self.power value.append(base_lr * factor) if i > 0: boundary.append(i) return boundary, value @register class LearningRate(object): """ Learning Rate configuration Args: base_lr (float): base learning rate schedulers (list): learning rate schedulers """ __category__ = 'optim' def __init__(self, base_lr=0.01, schedulers=[PiecewiseDecay(), LinearWarmup()]): super(LearningRate, self).__init__() self.base_lr = base_lr self.schedulers = [] schedulers = copy.deepcopy(schedulers) for sched in schedulers: if isinstance(sched, dict): # support dict sched instantiate module = sys.modules[__name__] type = sched.pop("name") scheduler = getattr(module, type)(**sched) self.schedulers.append(scheduler) else: self.schedulers.append(sched) def __call__(self, step_per_epoch): assert len(self.schedulers) >= 1 if not self.schedulers[0].use_warmup: return self.schedulers[0](base_lr=self.base_lr, step_per_epoch=step_per_epoch) # TODO: split warmup & decay # warmup boundary, value = self.schedulers[1](self.base_lr, step_per_epoch) # decay decay_lr = self.schedulers[0](self.base_lr, boundary, value, step_per_epoch) return decay_lr @register class OptimizerBuilder(): """ Build optimizer handles Args: regularizer (object): an `Regularizer` instance optimizer (object): an `Optimizer` instance """ __category__ = 'optim' def __init__(self, clip_grad_by_norm=None, clip_grad_by_value=None, regularizer={'type': 'L2', 'factor': .0001}, optimizer={'type': 'Momentum', 'momentum': .9}): self.clip_grad_by_norm = clip_grad_by_norm self.clip_grad_by_value = clip_grad_by_value self.regularizer = regularizer self.optimizer = optimizer def __call__(self, learning_rate, model=None): if self.clip_grad_by_norm is not None: grad_clip = nn.ClipGradByGlobalNorm( clip_norm=self.clip_grad_by_norm) elif self.clip_grad_by_value is not None: var = abs(self.clip_grad_by_value) grad_clip = nn.ClipGradByValue(min=-var, max=var) else: grad_clip = None if self.regularizer and self.regularizer != 'None': reg_type = self.regularizer['type'] + 'Decay' reg_factor = self.regularizer['factor'] regularization = getattr(regularizer, reg_type)(reg_factor) else: regularization = None optim_args = self.optimizer.copy() optim_type = optim_args['type'] del 
optim_args['type'] if optim_type == 'AdamWDL': return build_adamwdl(model, lr=learning_rate, **optim_args) if optim_type != 'AdamW': optim_args['weight_decay'] = regularization op = getattr(optimizer, optim_type) if 'param_groups' in optim_args: assert isinstance(optim_args['param_groups'], list), '' param_groups = optim_args.pop('param_groups') params, visited = [], [] for group in param_groups: assert isinstance(group, dict) and 'params' in group and isinstance( group['params'], list), '' _params = {} for n, p in model.named_parameters(): if not p.trainable: continue for k in group['params']: if re.search(k, n): _params.update({n: p}) break _group = group.copy() _group.update({'params': list(_params.values())}) params.append(_group) visited.extend(list(_params.keys())) ext_params = [ p for n, p in model.named_parameters() if n not in visited and p.trainable is True ] if len(ext_params) < len(model.parameters()): params.append({'params': ext_params}) elif len(ext_params) > len(model.parameters()): raise RuntimeError else: _params = model.parameters() params = [param for param in _params if param.trainable is True] return op(learning_rate=learning_rate, parameters=params, grad_clip=grad_clip, **optim_args) ================================================ FILE: ppdet/optimizer/utils.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn from typing import List def get_bn_running_state_names(model: nn.Layer) -> List[str]: """Get all bn state full names including running mean and variance """ names = [] for n, m in model.named_sublayers(): if isinstance(m, (nn.BatchNorm2D, nn.SyncBatchNorm)): assert hasattr(m, '_mean'), f'assert {m} has _mean' assert hasattr(m, '_variance'), f'assert {m} has _variance' running_mean = f'{n}._mean' running_var = f'{n}._variance' names.extend([running_mean, running_var]) return names ================================================ FILE: ppdet/slim/__init__.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . import distill_loss from . import distill_model from . import ofa from . import prune from . import quant from . 
import unstructured_prune from .distill_loss import * from .distill_model import * from .ofa import * from .prune import * from .quant import * from .unstructured_prune import * import yaml from ppdet.core.workspace import load_config from ppdet.utils.checkpoint import load_pretrain_weight def build_slim_model(cfg, slim_cfg, mode='train'): with open(slim_cfg) as f: slim_load_cfg = yaml.load(f, Loader=yaml.Loader) if mode != 'train' and slim_load_cfg['slim'] == 'Distill': return cfg if slim_load_cfg['slim'] == 'Distill': if "slim_method" in slim_load_cfg and slim_load_cfg[ 'slim_method'] == "FGD": model = FGDDistillModel(cfg, slim_cfg) elif "slim_method" in slim_load_cfg and slim_load_cfg[ 'slim_method'] == "LD": model = LDDistillModel(cfg, slim_cfg) elif "slim_method" in slim_load_cfg and slim_load_cfg[ 'slim_method'] == "CWD": model = CWDDistillModel(cfg, slim_cfg) elif "slim_method" in slim_load_cfg and slim_load_cfg[ 'slim_method'] == "PPYOLOEDistill": model = PPYOLOEDistillModel(cfg, slim_cfg) else: # common distillation model model = DistillModel(cfg, slim_cfg) cfg['model'] = model cfg['slim_type'] = cfg.slim elif slim_load_cfg['slim'] == 'OFA': load_config(slim_cfg) model = create(cfg.architecture) load_pretrain_weight(model, cfg.weights) slim = create(cfg.slim) cfg['slim'] = slim cfg['model'] = slim(model, model.state_dict()) cfg['slim_type'] = cfg.slim elif slim_load_cfg['slim'] == 'DistillPrune': if mode == 'train': model = DistillModel(cfg, slim_cfg) pruner = create(cfg.pruner) pruner(model.student_model) else: model = create(cfg.architecture) weights = cfg.weights load_config(slim_cfg) pruner = create(cfg.pruner) model = pruner(model) load_pretrain_weight(model, weights) cfg['model'] = model cfg['slim_type'] = cfg.slim elif slim_load_cfg['slim'] == 'PTQ': model = create(cfg.architecture) load_config(slim_cfg) load_pretrain_weight(model, cfg.weights) slim = create(cfg.slim) cfg['slim_type'] = cfg.slim cfg['slim'] = slim cfg['model'] = slim(model) elif slim_load_cfg['slim'] == 'UnstructuredPruner': load_config(slim_cfg) slim = create(cfg.slim) cfg['slim_type'] = cfg.slim cfg['slim'] = slim cfg['unstructured_prune'] = True else: load_config(slim_cfg) model = create(cfg.architecture) if mode == 'train': load_pretrain_weight(model, cfg.pretrain_weights) slim = create(cfg.slim) cfg['slim_type'] = cfg.slim # TODO: fix quant export model in framework. if mode == 'test' and 'QAT' in slim_load_cfg['slim']: slim.quant_config['activation_preprocess_type'] = None cfg['model'] = slim(model) cfg['slim'] = slim if mode != 'train': load_pretrain_weight(cfg['model'], cfg.weights) return cfg ================================================ FILE: ppdet/slim/distill_loss.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
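# (Editor's sketch, not part of the original file.) The KL-divergence
# distillation implemented by KnowledgeDistillationKLDivLoss below softens
# teacher and student logits with a temperature T and rescales by T**2 so the
# gradient magnitude stays comparable across temperatures:
#
#     import paddle.nn.functional as F
#
#     def kd_kl(student_logits, teacher_logits, T=10.):
#         target = F.softmax(teacher_logits / T, axis=1).detach()
#         log_pred = F.log_softmax(student_logits / T, axis=1)
#         return F.kl_div(log_pred, target, reduction='none').mean(1) * T * T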
from __future__ import absolute_import from __future__ import division from __future__ import print_function import math import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import ParamAttr from ppdet.core.workspace import register from ppdet.modeling import ops from ppdet.modeling.losses.iou_loss import GIoULoss from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'DistillYOLOv3Loss', 'KnowledgeDistillationKLDivLoss', 'DistillPPYOLOELoss', 'FGDFeatureLoss', 'CWDFeatureLoss', 'PKDFeatureLoss', 'MGDFeatureLoss', ] def parameter_init(mode="kaiming", value=0.): if mode == "kaiming": weight_attr = paddle.nn.initializer.KaimingUniform() elif mode == "constant": weight_attr = paddle.nn.initializer.Constant(value=value) else: weight_attr = paddle.nn.initializer.KaimingUniform() weight_init = ParamAttr(initializer=weight_attr) return weight_init def feature_norm(feat): # Normalize the feature maps to have zero mean and unit variances. assert len(feat.shape) == 4 N, C, H, W = feat.shape feat = feat.transpose([1, 0, 2, 3]).reshape([C, -1]) mean = feat.mean(axis=-1, keepdim=True) std = feat.std(axis=-1, keepdim=True) feat = (feat - mean) / (std + 1e-6) return feat.reshape([C, N, H, W]).transpose([1, 0, 2, 3]) @register class DistillYOLOv3Loss(nn.Layer): def __init__(self, weight=1000): super(DistillYOLOv3Loss, self).__init__() self.loss_weight = weight def obj_weighted_reg(self, sx, sy, sw, sh, tx, ty, tw, th, tobj): loss_x = ops.sigmoid_cross_entropy_with_logits(sx, F.sigmoid(tx)) loss_y = ops.sigmoid_cross_entropy_with_logits(sy, F.sigmoid(ty)) loss_w = paddle.abs(sw - tw) loss_h = paddle.abs(sh - th) loss = paddle.add_n([loss_x, loss_y, loss_w, loss_h]) weighted_loss = paddle.mean(loss * F.sigmoid(tobj)) return weighted_loss def obj_weighted_cls(self, scls, tcls, tobj): loss = ops.sigmoid_cross_entropy_with_logits(scls, F.sigmoid(tcls)) weighted_loss = paddle.mean(paddle.multiply(loss, F.sigmoid(tobj))) return weighted_loss def obj_loss(self, sobj, tobj): obj_mask = paddle.cast(tobj > 0., dtype="float32") obj_mask.stop_gradient = True loss = paddle.mean( ops.sigmoid_cross_entropy_with_logits(sobj, obj_mask)) return loss def forward(self, teacher_model, student_model): teacher_distill_pairs = teacher_model.yolo_head.loss.distill_pairs student_distill_pairs = student_model.yolo_head.loss.distill_pairs distill_reg_loss, distill_cls_loss, distill_obj_loss = [], [], [] for s_pair, t_pair in zip(student_distill_pairs, teacher_distill_pairs): distill_reg_loss.append( self.obj_weighted_reg(s_pair[0], s_pair[1], s_pair[2], s_pair[ 3], t_pair[0], t_pair[1], t_pair[2], t_pair[3], t_pair[4])) distill_cls_loss.append( self.obj_weighted_cls(s_pair[5], t_pair[5], t_pair[4])) distill_obj_loss.append(self.obj_loss(s_pair[4], t_pair[4])) distill_reg_loss = paddle.add_n(distill_reg_loss) distill_cls_loss = paddle.add_n(distill_cls_loss) distill_obj_loss = paddle.add_n(distill_obj_loss) loss = (distill_reg_loss + distill_cls_loss + distill_obj_loss ) * self.loss_weight return loss @register class KnowledgeDistillationKLDivLoss(nn.Layer): """Loss function for knowledge distilling using KL divergence. Args: reduction (str): Options are `'none'`, `'mean'` and `'sum'`. loss_weight (float): Loss weight of current loss. T (int): Temperature for distillation. 
""" def __init__(self, reduction='mean', loss_weight=1.0, T=10): super(KnowledgeDistillationKLDivLoss, self).__init__() assert reduction in ('none', 'mean', 'sum') assert T >= 1 self.reduction = reduction self.loss_weight = loss_weight self.T = T def knowledge_distillation_kl_div_loss(self, pred, soft_label, T, detach_target=True): r"""Loss function for knowledge distilling using KL divergence. Args: pred (Tensor): Predicted logits with shape (N, n + 1). soft_label (Tensor): Target logits with shape (N, N + 1). T (int): Temperature for distillation. detach_target (bool): Remove soft_label from automatic differentiation """ assert pred.shape == soft_label.shape target = F.softmax(soft_label / T, axis=1) if detach_target: target = target.detach() kd_loss = F.kl_div( F.log_softmax( pred / T, axis=1), target, reduction='none').mean(1) * (T * T) return kd_loss def forward(self, pred, soft_label, weight=None, avg_factor=None, reduction_override=None): """Forward function. Args: pred (Tensor): Predicted logits with shape (N, n + 1). soft_label (Tensor): Target logits with shape (N, N + 1). weight (Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = (reduction_override if reduction_override else self.reduction) loss_kd_out = self.knowledge_distillation_kl_div_loss( pred, soft_label, T=self.T) if weight is not None: loss_kd_out = weight * loss_kd_out if avg_factor is None: if reduction == 'none': loss = loss_kd_out elif reduction == 'mean': loss = loss_kd_out.mean() elif reduction == 'sum': loss = loss_kd_out.sum() else: # if reduction is mean, then average the loss by avg_factor if reduction == 'mean': loss = loss_kd_out.sum() / avg_factor # if reduction is 'none', then do nothing, otherwise raise an error elif reduction != 'none': raise ValueError( 'avg_factor can not be used with reduction="sum"') loss_kd = self.loss_weight * loss return loss_kd @register class DistillPPYOLOELoss(nn.Layer): def __init__( self, loss_weight={'logits': 4.0, 'feat': 1.0}, logits_distill=True, logits_loss_weight={'class': 1.0, 'iou': 2.5, 'dfl': 0.5}, logits_ld_distill=False, logits_ld_params={'weight': 20000, 'T': 10}, feat_distill=True, feat_distiller='fgd', feat_distill_place='neck_feats', teacher_width_mult=1.0, # L student_width_mult=0.75, # M feat_out_channels=[768, 384, 192]): super(DistillPPYOLOELoss, self).__init__() self.loss_weight_logits = loss_weight['logits'] self.loss_weight_feat = loss_weight['feat'] self.logits_distill = logits_distill self.logits_ld_distill = logits_ld_distill self.feat_distill = feat_distill if logits_distill and self.loss_weight_logits > 0: self.bbox_loss_weight = logits_loss_weight['iou'] self.dfl_loss_weight = logits_loss_weight['dfl'] self.qfl_loss_weight = logits_loss_weight['class'] self.loss_bbox = GIoULoss() if logits_ld_distill: self.loss_kd = KnowledgeDistillationKLDivLoss( loss_weight=logits_ld_params['weight'], T=logits_ld_params['T']) if feat_distill and self.loss_weight_feat > 0: assert feat_distiller in ['cwd', 'fgd', 'pkd', 'mgd', 'mimic'] assert feat_distill_place in ['backbone_feats', 'neck_feats'] self.feat_distill_place = feat_distill_place self.t_channel_list = [ int(c * teacher_width_mult) for c in feat_out_channels ] 
self.s_channel_list = [ int(c * student_width_mult) for c in feat_out_channels ] self.distill_feat_loss_modules = [] for i in range(len(feat_out_channels)): if feat_distiller == 'cwd': feat_loss_module = CWDFeatureLoss( student_channels=self.s_channel_list[i], teacher_channels=self.t_channel_list[i], normalize=True) elif feat_distiller == 'fgd': feat_loss_module = FGDFeatureLoss( student_channels=self.s_channel_list[i], teacher_channels=self.t_channel_list[i], normalize=True, alpha_fgd=0.00001, beta_fgd=0.000005, gamma_fgd=0.00001, lambda_fgd=0.00000005) elif feat_distiller == 'pkd': feat_loss_module = PKDFeatureLoss( student_channels=self.s_channel_list[i], teacher_channels=self.t_channel_list[i], normalize=True, resize_stu=True) elif feat_distiller == 'mgd': feat_loss_module = MGDFeatureLoss( student_channels=self.s_channel_list[i], teacher_channels=self.t_channel_list[i], normalize=True, loss_func='ssim') elif feat_distiller == 'mimic': feat_loss_module = MimicFeatureLoss( student_channels=self.s_channel_list[i], teacher_channels=self.t_channel_list[i], normalize=True) else: raise ValueError self.distill_feat_loss_modules.append(feat_loss_module) def quality_focal_loss(self, pred_logits, soft_target_logits, beta=2.0, use_sigmoid=False, num_total_pos=None): if use_sigmoid: func = F.binary_cross_entropy_with_logits soft_target = F.sigmoid(soft_target_logits) pred_sigmoid = F.sigmoid(pred_logits) preds = pred_logits else: func = F.binary_cross_entropy soft_target = soft_target_logits pred_sigmoid = pred_logits preds = pred_sigmoid scale_factor = pred_sigmoid - soft_target loss = func( preds, soft_target, reduction='none') * scale_factor.abs().pow(beta) loss = loss.sum(1) if num_total_pos is not None: loss = loss.sum() / num_total_pos else: loss = loss.mean() return loss def bbox_loss(self, s_bbox, t_bbox, weight_targets=None): # [x,y,w,h] if weight_targets is not None: loss = paddle.sum(self.loss_bbox(s_bbox, t_bbox) * weight_targets) avg_factor = weight_targets.sum() loss = loss / avg_factor else: loss = paddle.mean(self.loss_bbox(s_bbox, t_bbox)) return loss def distribution_focal_loss(self, pred_corners, target_corners, weight_targets=None): target_corners_label = F.softmax(target_corners, axis=-1) loss_dfl = F.cross_entropy( pred_corners, target_corners_label, soft_label=True, reduction='none') loss_dfl = loss_dfl.sum(1) if weight_targets is not None: loss_dfl = loss_dfl * (weight_targets.expand([-1, 4]).reshape([-1])) loss_dfl = loss_dfl.sum(-1) / weight_targets.sum() else: loss_dfl = loss_dfl.mean(-1) return loss_dfl / 4.0 # 4 direction def main_kd(self, mask_positive, pred_scores, soft_cls, num_classes): num_pos = mask_positive.sum() if num_pos > 0: cls_mask = mask_positive.unsqueeze(-1).tile([1, 1, num_classes]) pred_scores_pos = paddle.masked_select( pred_scores, cls_mask).reshape([-1, num_classes]) soft_cls_pos = paddle.masked_select( soft_cls, cls_mask).reshape([-1, num_classes]) loss_kd = self.loss_kd( pred_scores_pos, soft_cls_pos, avg_factor=num_pos) else: loss_kd = paddle.zeros([]) return loss_kd def forward(self, teacher_model, student_model): teacher_distill_pairs = teacher_model.yolo_head.distill_pairs student_distill_pairs = student_model.yolo_head.distill_pairs if self.logits_distill and self.loss_weight_logits > 0: distill_bbox_loss, distill_dfl_loss, distill_cls_loss = [], [], [] distill_cls_loss.append( self.quality_focal_loss( student_distill_pairs['pred_cls_scores'].reshape( (-1, student_distill_pairs['pred_cls_scores'].shape[-1] )), 
teacher_distill_pairs['pred_cls_scores'].detach().reshape( (-1, teacher_distill_pairs['pred_cls_scores'].shape[-1] )), num_total_pos=student_distill_pairs['pos_num'], use_sigmoid=False)) distill_bbox_loss.append( self.bbox_loss(student_distill_pairs['pred_bboxes_pos'], teacher_distill_pairs['pred_bboxes_pos'].detach(), weight_targets=student_distill_pairs['bbox_weight'] ) if 'pred_bboxes_pos' in student_distill_pairs and \ 'pred_bboxes_pos' in teacher_distill_pairs and \ 'bbox_weight' in student_distill_pairs else paddle.zeros([])) distill_dfl_loss.append( self.distribution_focal_loss( student_distill_pairs['pred_dist_pos'].reshape((-1, student_distill_pairs['pred_dist_pos'].shape[-1])), teacher_distill_pairs['pred_dist_pos'].detach().reshape((-1, teacher_distill_pairs['pred_dist_pos'].shape[-1])), \ weight_targets=student_distill_pairs['bbox_weight'] ) if 'pred_dist_pos' in student_distill_pairs and \ 'pred_dist_pos' in teacher_distill_pairs and \ 'bbox_weight' in student_distill_pairs else paddle.zeros([])) distill_cls_loss = paddle.add_n(distill_cls_loss) distill_bbox_loss = paddle.add_n(distill_bbox_loss) distill_dfl_loss = paddle.add_n(distill_dfl_loss) logits_loss = distill_bbox_loss * self.bbox_loss_weight + distill_cls_loss * self.qfl_loss_weight + distill_dfl_loss * self.dfl_loss_weight if self.logits_ld_distill: loss_kd = self.main_kd( student_distill_pairs['mask_positive_select'], student_distill_pairs['pred_cls_scores'], teacher_distill_pairs['pred_cls_scores'], student_model.yolo_head.num_classes, ) logits_loss += loss_kd else: logits_loss = paddle.zeros([]) if self.feat_distill and self.loss_weight_feat > 0: feat_loss_list = [] inputs = student_model.inputs assert 'gt_bbox' in inputs assert self.feat_distill_place in student_distill_pairs assert self.feat_distill_place in teacher_distill_pairs stu_feats = student_distill_pairs[self.feat_distill_place] tea_feats = teacher_distill_pairs[self.feat_distill_place] for i, loss_module in enumerate(self.distill_feat_loss_modules): feat_loss_list.append( loss_module(stu_feats[i], tea_feats[i], inputs)) feat_loss = paddle.add_n(feat_loss_list) else: feat_loss = paddle.zeros([]) student_model.yolo_head.distill_pairs.clear() teacher_model.yolo_head.distill_pairs.clear() return logits_loss * self.loss_weight_logits, feat_loss * self.loss_weight_feat @register class CWDFeatureLoss(nn.Layer): def __init__(self, student_channels, teacher_channels, normalize=False, tau=1.0, weight=1.0): super(CWDFeatureLoss, self).__init__() self.normalize = normalize self.tau = tau self.loss_weight = weight if student_channels != teacher_channels: self.align = nn.Conv2D( student_channels, teacher_channels, kernel_size=1, stride=1, padding=0) else: self.align = None def distill_softmax(self, x, tau): _, _, w, h = x.shape x = paddle.reshape(x, [-1, w * h]) x /= tau return F.softmax(x, axis=1) def forward(self, preds_s, preds_t, inputs=None): assert preds_s.shape[-2:] == preds_t.shape[-2:] N, C, H, W = preds_s.shape eps = 1e-5 if self.align is not None: preds_s = self.align(preds_s) if self.normalize: preds_s = feature_norm(preds_s) preds_t = feature_norm(preds_t) softmax_pred_s = self.distill_softmax(preds_s, self.tau) softmax_pred_t = self.distill_softmax(preds_t, self.tau) loss = paddle.sum(-softmax_pred_t * paddle.log(eps + softmax_pred_s) + softmax_pred_t * paddle.log(eps + softmax_pred_t)) return self.loss_weight * loss / (C * N) @register class FGDFeatureLoss(nn.Layer): """ Focal and Global Knowledge Distillation for Detectors The code is reference from 
https://github.com/yzd-v/FGD/blob/master/mmdet/distillation/losses/fgd.py Args: student_channels (int): The number of channels in the student's FPN feature map. Default to 256. teacher_channels (int): The number of channels in the teacher's FPN feature map. Default to 256. normalize (bool): Whether to normalize the feature maps. temp (float, optional): The temperature coefficient. Defaults to 0.5. alpha_fgd (float, optional): The weight of fg_loss. Defaults to 0.001 beta_fgd (float, optional): The weight of bg_loss. Defaults to 0.0005 gamma_fgd (float, optional): The weight of mask_loss. Defaults to 0.001 lambda_fgd (float, optional): The weight of relation_loss. Defaults to 0.000005 """ def __init__(self, student_channels, teacher_channels, normalize=False, loss_weight=1.0, temp=0.5, alpha_fgd=0.001, beta_fgd=0.0005, gamma_fgd=0.001, lambda_fgd=0.000005): super(FGDFeatureLoss, self).__init__() self.normalize = normalize self.loss_weight = loss_weight self.temp = temp self.alpha_fgd = alpha_fgd self.beta_fgd = beta_fgd self.gamma_fgd = gamma_fgd self.lambda_fgd = lambda_fgd kaiming_init = parameter_init("kaiming") zeros_init = parameter_init("constant", 0.0) if student_channels != teacher_channels: self.align = nn.Conv2D( student_channels, teacher_channels, kernel_size=1, stride=1, padding=0, weight_attr=kaiming_init) student_channels = teacher_channels else: self.align = None self.conv_mask_s = nn.Conv2D( student_channels, 1, kernel_size=1, weight_attr=kaiming_init) self.conv_mask_t = nn.Conv2D( teacher_channels, 1, kernel_size=1, weight_attr=kaiming_init) self.stu_conv_block = nn.Sequential( nn.Conv2D( student_channels, student_channels // 2, kernel_size=1, weight_attr=zeros_init), nn.LayerNorm([student_channels // 2, 1, 1]), nn.ReLU(), nn.Conv2D( student_channels // 2, student_channels, kernel_size=1, weight_attr=zeros_init)) self.tea_conv_block = nn.Sequential( nn.Conv2D( teacher_channels, teacher_channels // 2, kernel_size=1, weight_attr=zeros_init), nn.LayerNorm([teacher_channels // 2, 1, 1]), nn.ReLU(), nn.Conv2D( teacher_channels // 2, teacher_channels, kernel_size=1, weight_attr=zeros_init)) def spatial_channel_attention(self, x, t=0.5): shape = x.shape N, C, H, W = shape _f = paddle.abs(x) spatial_map = paddle.reshape( paddle.mean( _f, axis=1, keepdim=True) / t, [N, -1]) spatial_map = F.softmax(spatial_map, axis=1, dtype="float32") * H * W spatial_att = paddle.reshape(spatial_map, [N, H, W]) channel_map = paddle.mean( paddle.mean( _f, axis=2, keepdim=False), axis=2, keepdim=False) channel_att = F.softmax(channel_map / t, axis=1, dtype="float32") * C return [spatial_att, channel_att] def spatial_pool(self, x, mode="teacher"): batch, channel, width, height = x.shape x_copy = x x_copy = paddle.reshape(x_copy, [batch, channel, height * width]) x_copy = x_copy.unsqueeze(1) if mode.lower() == "student": context_mask = self.conv_mask_s(x) else: context_mask = self.conv_mask_t(x) context_mask = paddle.reshape(context_mask, [batch, 1, height * width]) context_mask = F.softmax(context_mask, axis=2) context_mask = context_mask.unsqueeze(-1) context = paddle.matmul(x_copy, context_mask) context = paddle.reshape(context, [batch, channel, 1, 1]) return context def mask_loss(self, stu_channel_att, tea_channel_att, stu_spatial_att, tea_spatial_att): def _func(a, b): return paddle.sum(paddle.abs(a - b)) / len(a) mask_loss = _func(stu_channel_att, tea_channel_att) + _func( stu_spatial_att, tea_spatial_att) return mask_loss def feature_loss(self, stu_feature, tea_feature, mask_fg, mask_bg, 
                     tea_channel_att, tea_spatial_att):
        mask_fg = mask_fg.unsqueeze(axis=1)
        mask_bg = mask_bg.unsqueeze(axis=1)
        tea_channel_att = tea_channel_att.unsqueeze(axis=-1).unsqueeze(axis=-1)
        tea_spatial_att = tea_spatial_att.unsqueeze(axis=1)

        fea_t = paddle.multiply(tea_feature, paddle.sqrt(tea_spatial_att))
        fea_t = paddle.multiply(fea_t, paddle.sqrt(tea_channel_att))
        fg_fea_t = paddle.multiply(fea_t, paddle.sqrt(mask_fg))
        bg_fea_t = paddle.multiply(fea_t, paddle.sqrt(mask_bg))

        fea_s = paddle.multiply(stu_feature, paddle.sqrt(tea_spatial_att))
        fea_s = paddle.multiply(fea_s, paddle.sqrt(tea_channel_att))
        fg_fea_s = paddle.multiply(fea_s, paddle.sqrt(mask_fg))
        bg_fea_s = paddle.multiply(fea_s, paddle.sqrt(mask_bg))

        fg_loss = F.mse_loss(fg_fea_s, fg_fea_t, reduction="sum") / len(mask_fg)
        bg_loss = F.mse_loss(bg_fea_s, bg_fea_t, reduction="sum") / len(mask_bg)
        return fg_loss, bg_loss

    def relation_loss(self, stu_feature, tea_feature):
        context_s = self.spatial_pool(stu_feature, "student")
        context_t = self.spatial_pool(tea_feature, "teacher")
        out_s = stu_feature + self.stu_conv_block(context_s)
        out_t = tea_feature + self.tea_conv_block(context_t)
        rela_loss = F.mse_loss(out_s, out_t, reduction="sum") / len(out_s)
        return rela_loss

    def mask_value(self, mask, xl, xr, yl, yr, value):
        mask[xl:xr, yl:yr] = paddle.maximum(mask[xl:xr, yl:yr], value)
        return mask

    def forward(self, stu_feature, tea_feature, inputs):
        # student and teacher feature maps must share the same spatial size
        assert stu_feature.shape[-2:] == tea_feature.shape[-2:]
        assert "gt_bbox" in inputs.keys() and "im_shape" in inputs.keys()
        gt_bboxes = inputs['gt_bbox']
        ins_shape = [
            inputs['im_shape'][i] for i in range(inputs['im_shape'].shape[0])
        ]

        index_gt = []
        for i in range(len(gt_bboxes)):
            if gt_bboxes[i].size > 2:
                index_gt.append(i)
        # only distill features for images that have labeled gt boxes
        if len(index_gt) != len(gt_bboxes):
            index_gt_t = paddle.to_tensor(index_gt)
            stu_feature = paddle.index_select(stu_feature, index_gt_t)
            tea_feature = paddle.index_select(tea_feature, index_gt_t)
            ins_shape = [ins_shape[c] for c in index_gt]
            gt_bboxes = [gt_bboxes[c] for c in index_gt]
            assert len(gt_bboxes) == tea_feature.shape[0]

        if self.align is not None:
            stu_feature = self.align(stu_feature)

        if self.normalize:
            stu_feature = feature_norm(stu_feature)
            tea_feature = feature_norm(tea_feature)

        tea_spatial_att, tea_channel_att = self.spatial_channel_attention(
            tea_feature, self.temp)
        stu_spatial_att, stu_channel_att = self.spatial_channel_attention(
            stu_feature, self.temp)

        mask_fg = paddle.zeros(tea_spatial_att.shape)
        mask_bg = paddle.ones_like(tea_spatial_att)
        one_tmp = paddle.ones([*tea_spatial_att.shape[1:]])
        zero_tmp = paddle.zeros([*tea_spatial_att.shape[1:]])
        mask_fg.stop_gradient = True
        mask_bg.stop_gradient = True
        one_tmp.stop_gradient = True
        zero_tmp.stop_gradient = True

        wmin, wmax, hmin, hmax = [], [], [], []

        if len(gt_bboxes) == 0:
            loss = self.relation_loss(stu_feature, tea_feature)
            return self.lambda_fgd * loss

        N, _, H, W = stu_feature.shape
        for i in range(N):
            tmp_box = paddle.ones_like(gt_bboxes[i])
            tmp_box.stop_gradient = True
            tmp_box[:, 0] = gt_bboxes[i][:, 0] / ins_shape[i][1] * W
            tmp_box[:, 2] = gt_bboxes[i][:, 2] / ins_shape[i][1] * W
            tmp_box[:, 1] = gt_bboxes[i][:, 1] / ins_shape[i][0] * H
            tmp_box[:, 3] = gt_bboxes[i][:, 3] / ins_shape[i][0] * H

            zero = paddle.zeros_like(tmp_box[:, 0], dtype="int32")
            ones = paddle.ones_like(tmp_box[:, 2], dtype="int32")
            zero.stop_gradient = True
            ones.stop_gradient = True

            wmin.append(
                paddle.cast(paddle.floor(tmp_box[:, 0]), "int32").maximum(zero))
            wmax.append(paddle.cast(paddle.ceil(tmp_box[:, 2]),
"int32")) hmin.append( paddle.cast(paddle.floor(tmp_box[:, 1]), "int32").maximum(zero)) hmax.append(paddle.cast(paddle.ceil(tmp_box[:, 3]), "int32")) area_recip = 1.0 / ( hmax[i].reshape([1, -1]) + 1 - hmin[i].reshape([1, -1])) / ( wmax[i].reshape([1, -1]) + 1 - wmin[i].reshape([1, -1])) for j in range(len(gt_bboxes[i])): if gt_bboxes[i][j].sum() > 0: mask_fg[i] = self.mask_value( mask_fg[i], hmin[i][j], hmax[i][j] + 1, wmin[i][j], wmax[i][j] + 1, area_recip[0][j]) mask_bg[i] = paddle.where(mask_fg[i] > zero_tmp, zero_tmp, one_tmp) if paddle.sum(mask_bg[i]): mask_bg[i] /= paddle.sum(mask_bg[i]) fg_loss, bg_loss = self.feature_loss(stu_feature, tea_feature, mask_fg, mask_bg, tea_channel_att, tea_spatial_att) mask_loss = self.mask_loss(stu_channel_att, tea_channel_att, stu_spatial_att, tea_spatial_att) rela_loss = self.relation_loss(stu_feature, tea_feature) loss = self.alpha_fgd * fg_loss + self.beta_fgd * bg_loss \ + self.gamma_fgd * mask_loss + self.lambda_fgd * rela_loss return loss * self.loss_weight @register class PKDFeatureLoss(nn.Layer): """ PKD: General Distillation Framework for Object Detectors via Pearson Correlation Coefficient. Args: loss_weight (float): Weight of loss. Defaults to 1.0. resize_stu (bool): If True, we'll down/up sample the features of the student model to the spatial size of those of the teacher model if their spatial sizes are different. And vice versa. Defaults to True. """ def __init__(self, student_channels=256, teacher_channels=256, normalize=True, loss_weight=1.0, resize_stu=True): super(PKDFeatureLoss, self).__init__() self.normalize = normalize self.loss_weight = loss_weight self.resize_stu = resize_stu def forward(self, stu_feature, tea_feature, inputs=None): size_s, size_t = stu_feature.shape[2:], tea_feature.shape[2:] if size_s[0] != size_t[0]: if self.resize_stu: stu_feature = F.interpolate( stu_feature, size_t, mode='bilinear') else: tea_feature = F.interpolate( tea_feature, size_s, mode='bilinear') assert stu_feature.shape == tea_feature.shape if self.normalize: stu_feature = feature_norm(stu_feature) tea_feature = feature_norm(tea_feature) loss = F.mse_loss(stu_feature, tea_feature) / 2 return loss * self.loss_weight @register class MimicFeatureLoss(nn.Layer): def __init__(self, student_channels=256, teacher_channels=256, normalize=True, loss_weight=1.0): super(MimicFeatureLoss, self).__init__() self.normalize = normalize self.loss_weight = loss_weight self.mse_loss = nn.MSELoss() if student_channels != teacher_channels: self.align = nn.Conv2D( student_channels, teacher_channels, kernel_size=1, stride=1, padding=0) else: self.align = None def forward(self, stu_feature, tea_feature, inputs=None): if self.align is not None: stu_feature = self.align(stu_feature) if self.normalize: stu_feature = feature_norm(stu_feature) tea_feature = feature_norm(tea_feature) loss = self.mse_loss(stu_feature, tea_feature) return loss * self.loss_weight @register class MGDFeatureLoss(nn.Layer): def __init__(self, student_channels=256, teacher_channels=256, normalize=True, loss_weight=1.0, loss_func='mse'): super(MGDFeatureLoss, self).__init__() self.normalize = normalize self.loss_weight = loss_weight assert loss_func in ['mse', 'ssim'] self.loss_func = loss_func self.mse_loss = nn.MSELoss(reduction='sum') self.ssim_loss = SSIM(11) kaiming_init = parameter_init("kaiming") if student_channels != teacher_channels: self.align = nn.Conv2D( student_channels, teacher_channels, kernel_size=1, stride=1, padding=0, weight_attr=kaiming_init, bias_attr=False) else: self.align = 
None self.generation = nn.Sequential( nn.Conv2D( teacher_channels, teacher_channels, kernel_size=3, padding=1), nn.ReLU(), nn.Conv2D( teacher_channels, teacher_channels, kernel_size=3, padding=1)) def forward(self, stu_feature, tea_feature, inputs=None): N = stu_feature.shape[0] if self.align is not None: stu_feature = self.align(stu_feature) stu_feature = self.generation(stu_feature) if self.normalize: stu_feature = feature_norm(stu_feature) tea_feature = feature_norm(tea_feature) if self.loss_func == 'mse': loss = self.mse_loss(stu_feature, tea_feature) / N elif self.loss_func == 'ssim': ssim_loss = self.ssim_loss(stu_feature, tea_feature) loss = paddle.clip((1 - ssim_loss) / 2, 0, 1) else: raise ValueError return loss * self.loss_weight class SSIM(nn.Layer): def __init__(self, window_size=11, size_average=True): super(SSIM, self).__init__() self.window_size = window_size self.size_average = size_average self.channel = 1 self.window = self.create_window(window_size, self.channel) def gaussian(self, window_size, sigma): gauss = paddle.to_tensor([ math.exp(-(x - window_size // 2)**2 / float(2 * sigma**2)) for x in range(window_size) ]) return gauss / gauss.sum() def create_window(self, window_size, channel): _1D_window = self.gaussian(window_size, 1.5).unsqueeze(1) _2D_window = _1D_window.mm(_1D_window.t()).unsqueeze(0).unsqueeze(0) window = _2D_window.expand([channel, 1, window_size, window_size]) return window def _ssim(self, img1, img2, window, window_size, channel, size_average=True): mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) mu1_sq = mu1.pow(2) mu2_sq = mu2.pow(2) mu1_mu2 = mu1 * mu2 sigma1_sq = F.conv2d( img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq sigma2_sq = F.conv2d( img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq sigma12 = F.conv2d( img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2 C1 = 0.01**2 C2 = 0.03**2 ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ( 1e-12 + (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) if size_average: return ssim_map.mean() else: return ssim_map.mean([1, 2, 3]) def forward(self, img1, img2): channel = img1.shape[1] if channel == self.channel and self.window.dtype == img1.dtype: window = self.window else: window = self.create_window(self.window_size, channel) self.window = window self.channel = channel return self._ssim(img1, img2, window, self.window_size, channel, self.size_average) ================================================ FILE: ppdet/slim/distill_model.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
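
# A minimal usage sketch of the DistillModel wrapper defined below, assuming
# a slim config whose YAML names the teacher architecture and a
# `distill_loss`; the slim config path here is hypothetical, the student
# config exists in this repo:
#
#   from ppdet.core.workspace import load_config
#   cfg = load_config('configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml')
#   model = DistillModel(cfg, slim_cfg='configs/slim/distill/my_distill.yml')
#   outputs = model(batch)        # training mode: student losses plus
#                                 # 'distill_loss' and 'teacher_loss' entries
#   outputs['loss'].backward()    # model.parameters() is student-only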
from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn from ppdet.core.workspace import register, create, load_config from ppdet.utils.checkpoint import load_pretrain_weight from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'DistillModel', 'FGDDistillModel', 'CWDDistillModel', 'LDDistillModel', 'PPYOLOEDistillModel', ] @register class DistillModel(nn.Layer): """ Build common distill model. Args: cfg: The student config. slim_cfg: The teacher and distill config. """ def __init__(self, cfg, slim_cfg): super(DistillModel, self).__init__() self.arch = cfg.architecture self.stu_cfg = cfg self.student_model = create(self.stu_cfg.architecture) if 'pretrain_weights' in self.stu_cfg and self.stu_cfg.pretrain_weights: stu_pretrain = self.stu_cfg.pretrain_weights else: stu_pretrain = None slim_cfg = load_config(slim_cfg) self.tea_cfg = slim_cfg self.teacher_model = create(self.tea_cfg.architecture) if 'pretrain_weights' in self.tea_cfg and self.tea_cfg.pretrain_weights: tea_pretrain = self.tea_cfg.pretrain_weights else: tea_pretrain = None self.distill_cfg = slim_cfg # load pretrain weights self.is_inherit = False if stu_pretrain: if self.is_inherit and tea_pretrain: load_pretrain_weight(self.student_model, tea_pretrain) logger.debug( "Inheriting! loading teacher weights to student model!") load_pretrain_weight(self.student_model, stu_pretrain) logger.info("Student model has loaded pretrain weights!") if tea_pretrain: load_pretrain_weight(self.teacher_model, tea_pretrain) logger.info("Teacher model has loaded pretrain weights!") self.teacher_model.eval() for param in self.teacher_model.parameters(): param.trainable = False self.distill_loss = self.build_loss(self.distill_cfg) def build_loss(self, distill_cfg): if 'distill_loss' in distill_cfg and distill_cfg.distill_loss: return create(distill_cfg.distill_loss) else: return None def parameters(self): return self.student_model.parameters() def forward(self, inputs): if self.training: student_loss = self.student_model(inputs) with paddle.no_grad(): teacher_loss = self.teacher_model(inputs) loss = self.distill_loss(self.teacher_model, self.student_model) student_loss['distill_loss'] = loss student_loss['teacher_loss'] = teacher_loss['loss'] student_loss['loss'] += student_loss['distill_loss'] return student_loss else: return self.student_model(inputs) @register class FGDDistillModel(DistillModel): """ Build FGD distill model. Args: cfg: The student config. slim_cfg: The teacher and distill config. 
""" def __init__(self, cfg, slim_cfg): super(FGDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) assert self.arch in ['RetinaNet', 'PicoDet' ], 'Unsupported arch: {}'.format(self.arch) self.is_inherit = True def build_loss(self, distill_cfg): assert 'distill_loss_name' in distill_cfg and distill_cfg.distill_loss_name assert 'distill_loss' in distill_cfg and distill_cfg.distill_loss loss_func = dict() name_list = distill_cfg.distill_loss_name for name in name_list: loss_func[name] = create(distill_cfg.distill_loss) return loss_func def forward(self, inputs): if self.training: s_body_feats = self.student_model.backbone(inputs) s_neck_feats = self.student_model.neck(s_body_feats) with paddle.no_grad(): t_body_feats = self.teacher_model.backbone(inputs) t_neck_feats = self.teacher_model.neck(t_body_feats) loss_dict = {} for idx, k in enumerate(self.distill_loss): loss_dict[k] = self.distill_loss[k](s_neck_feats[idx], t_neck_feats[idx], inputs) if self.arch == "RetinaNet": loss = self.student_model.head(s_neck_feats, inputs) elif self.arch == "PicoDet": head_outs = self.student_model.head( s_neck_feats, self.student_model.export_post_process) loss_gfl = self.student_model.head.get_loss(head_outs, inputs) total_loss = paddle.add_n(list(loss_gfl.values())) loss = {} loss.update(loss_gfl) loss.update({'loss': total_loss}) else: raise ValueError(f"Unsupported model {self.arch}") for k in loss_dict: loss['loss'] += loss_dict[k] loss[k] = loss_dict[k] return loss else: body_feats = self.student_model.backbone(inputs) neck_feats = self.student_model.neck(body_feats) head_outs = self.student_model.head(neck_feats) if self.arch == "RetinaNet": bbox, bbox_num = self.student_model.head.post_process( head_outs, inputs['im_shape'], inputs['scale_factor']) return {'bbox': bbox, 'bbox_num': bbox_num} elif self.arch == "PicoDet": head_outs = self.student_model.head( neck_feats, self.student_model.export_post_process) scale_factor = inputs['scale_factor'] bboxes, bbox_num = self.student_model.head.post_process( head_outs, scale_factor, export_nms=self.student_model.export_nms) return {'bbox': bboxes, 'bbox_num': bbox_num} else: raise ValueError(f"Unsupported model {self.arch}") @register class CWDDistillModel(DistillModel): """ Build CWD distill model. Args: cfg: The student config. slim_cfg: The teacher and distill config. 
""" def __init__(self, cfg, slim_cfg): super(CWDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) assert self.arch in ['GFL', 'RetinaNet'], 'Unsupported arch: {}'.format( self.arch) def build_loss(self, distill_cfg): assert 'distill_loss_name' in distill_cfg and distill_cfg.distill_loss_name assert 'distill_loss' in distill_cfg and distill_cfg.distill_loss loss_func = dict() name_list = distill_cfg.distill_loss_name for name in name_list: loss_func[name] = create(distill_cfg.distill_loss) return loss_func def get_loss_retinanet(self, stu_fea_list, tea_fea_list, inputs): loss = self.student_model.head(stu_fea_list, inputs) loss_dict = {} for idx, k in enumerate(self.distill_loss): loss_dict[k] = self.distill_loss[k](stu_fea_list[idx], tea_fea_list[idx]) loss['loss'] += loss_dict[k] loss[k] = loss_dict[k] return loss def get_loss_gfl(self, stu_fea_list, tea_fea_list, inputs): loss = {} head_outs = self.student_model.head(stu_fea_list) loss_gfl = self.student_model.head.get_loss(head_outs, inputs) loss.update(loss_gfl) total_loss = paddle.add_n(list(loss.values())) loss.update({'loss': total_loss}) feat_loss = {} loss_dict = {} s_cls_feat, t_cls_feat = [], [] for s_neck_f, t_neck_f in zip(stu_fea_list, tea_fea_list): conv_cls_feat, _ = self.student_model.head.conv_feat(s_neck_f) cls_score = self.student_model.head.gfl_head_cls(conv_cls_feat) t_conv_cls_feat, _ = self.teacher_model.head.conv_feat(t_neck_f) t_cls_score = self.teacher_model.head.gfl_head_cls(t_conv_cls_feat) s_cls_feat.append(cls_score) t_cls_feat.append(t_cls_score) for idx, k in enumerate(self.distill_loss): loss_dict[k] = self.distill_loss[k](s_cls_feat[idx], t_cls_feat[idx]) feat_loss[f"neck_f_{idx}"] = self.distill_loss[k](stu_fea_list[idx], tea_fea_list[idx]) for k in feat_loss: loss['loss'] += feat_loss[k] loss[k] = feat_loss[k] for k in loss_dict: loss['loss'] += loss_dict[k] loss[k] = loss_dict[k] return loss def forward(self, inputs): if self.training: s_body_feats = self.student_model.backbone(inputs) s_neck_feats = self.student_model.neck(s_body_feats) with paddle.no_grad(): t_body_feats = self.teacher_model.backbone(inputs) t_neck_feats = self.teacher_model.neck(t_body_feats) if self.arch == "RetinaNet": loss = self.get_loss_retinanet(s_neck_feats, t_neck_feats, inputs) elif self.arch == "GFL": loss = self.get_loss_gfl(s_neck_feats, t_neck_feats, inputs) else: raise ValueError(f"unsupported arch {self.arch}") return loss else: body_feats = self.student_model.backbone(inputs) neck_feats = self.student_model.neck(body_feats) head_outs = self.student_model.head(neck_feats) if self.arch == "RetinaNet": bbox, bbox_num = self.student_model.head.post_process( head_outs, inputs['im_shape'], inputs['scale_factor']) return {'bbox': bbox, 'bbox_num': bbox_num} elif self.arch == "GFL": bbox_pred, bbox_num = head_outs output = {'bbox': bbox_pred, 'bbox_num': bbox_num} return output else: raise ValueError(f"unsupported arch {self.arch}") @register class LDDistillModel(DistillModel): """ Build LD distill model. Args: cfg: The student config. slim_cfg: The teacher and distill config. 
""" def __init__(self, cfg, slim_cfg): super(LDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) assert self.arch in ['GFL'], 'Unsupported arch: {}'.format(self.arch) def forward(self, inputs): if self.training: s_body_feats = self.student_model.backbone(inputs) s_neck_feats = self.student_model.neck(s_body_feats) s_head_outs = self.student_model.head(s_neck_feats) with paddle.no_grad(): t_body_feats = self.teacher_model.backbone(inputs) t_neck_feats = self.teacher_model.neck(t_body_feats) t_head_outs = self.teacher_model.head(t_neck_feats) soft_label_list = t_head_outs[0] soft_targets_list = t_head_outs[1] student_loss = self.student_model.head.get_loss( s_head_outs, inputs, soft_label_list, soft_targets_list) total_loss = paddle.add_n(list(student_loss.values())) student_loss['loss'] = total_loss return student_loss else: return self.student_model(inputs) @register class PPYOLOEDistillModel(DistillModel): """ Build PPYOLOE distill model, only used in PPYOLOE Args: cfg: The student config. slim_cfg: The teacher and distill config. """ def __init__(self, cfg, slim_cfg): super(PPYOLOEDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) assert self.arch in ['PPYOLOE'], 'Unsupported arch: {}'.format( self.arch) def forward(self, inputs, alpha=0.125): if self.training: with paddle.no_grad(): teacher_loss = self.teacher_model(inputs) if hasattr(self.teacher_model.yolo_head, "assigned_labels"): self.student_model.yolo_head.assigned_labels, self.student_model.yolo_head.assigned_bboxes, self.student_model.yolo_head.assigned_scores = \ self.teacher_model.yolo_head.assigned_labels, self.teacher_model.yolo_head.assigned_bboxes, self.teacher_model.yolo_head.assigned_scores delattr(self.teacher_model.yolo_head, "assigned_labels") delattr(self.teacher_model.yolo_head, "assigned_bboxes") delattr(self.teacher_model.yolo_head, "assigned_scores") student_loss = self.student_model(inputs) logits_loss, feat_loss = self.distill_loss(self.teacher_model, self.student_model) det_total_loss = student_loss['loss'] total_loss = alpha * (det_total_loss + logits_loss + feat_loss) student_loss['loss'] = total_loss student_loss['det_loss'] = det_total_loss student_loss['logits_loss'] = logits_loss student_loss['feat_loss'] = feat_loss return student_loss else: return self.student_model(inputs) ================================================ FILE: ppdet/slim/ofa.py ================================================ from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import load_config, merge_config, create from ppdet.utils.checkpoint import load_weight, load_pretrain_weight from ppdet.utils.logger import setup_logger from ppdet.core.workspace import register, serializable from paddle.utils import try_import logger = setup_logger(__name__) @register @serializable class OFA(object): def __init__(self, ofa_config): super(OFA, self).__init__() self.ofa_config = ofa_config def __call__(self, model, param_state_dict): paddleslim = try_import('paddleslim') from paddleslim.nas.ofa import OFA, RunConfig, utils from paddleslim.nas.ofa.convert_super import Convert, supernet task = self.ofa_config['task'] expand_ratio = self.ofa_config['expand_ratio'] skip_neck = self.ofa_config['skip_neck'] skip_head = self.ofa_config['skip_head'] run_config = self.ofa_config['RunConfig'] if 'skip_layers' in run_config: skip_layers = run_config['skip_layers'] else: skip_layers = [] # 
supernet config sp_config = supernet(expand_ratio=expand_ratio) # convert to supernet model = Convert(sp_config).convert(model) skip_names = [] if skip_neck: skip_names.append('neck.') if skip_head: skip_names.append('head.') for name, sublayer in model.named_sublayers(): for n in skip_names: if n in name: skip_layers.append(name) run_config['skip_layers'] = skip_layers run_config = RunConfig(**run_config) # build ofa model ofa_model = OFA(model, run_config=run_config) ofa_model.set_epoch(0) ofa_model.set_task(task) input_spec = [{ "image": paddle.ones( shape=[1, 3, 640, 640], dtype='float32'), "im_shape": paddle.full( [1, 2], 640, dtype='float32'), "scale_factor": paddle.ones( shape=[1, 2], dtype='float32') }] ofa_model._clear_search_space(input_spec=input_spec) ofa_model._build_ss = True check_ss = ofa_model._sample_config('expand_ratio', phase=None) # tokenize the search space ofa_model.tokenize() # check token map, search cands and search space logger.info('Token map is {}'.format(ofa_model.token_map)) logger.info('Search candidates is {}'.format(ofa_model.search_cands)) logger.info('The length of search_space is {}, search_space is {}'. format(len(ofa_model._ofa_layers), ofa_model._ofa_layers)) # set model state dict into ofa model utils.set_state_dict(ofa_model.model, param_state_dict) return ofa_model ================================================ FILE: ppdet/slim/prune.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import paddle from paddle.utils import try_import from ppdet.core.workspace import register, serializable from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) def print_prune_params(model): model_dict = model.state_dict() for key in model_dict.keys(): weight_name = model_dict[key].name logger.info('Parameter name: {}, shape: {}'.format( weight_name, model_dict[key].shape)) @register @serializable class Pruner(object): def __init__(self, criterion, pruned_params, pruned_ratios, print_params=False): super(Pruner, self).__init__() assert criterion in ['l1_norm', 'fpgm'], \ "unsupported prune criterion: {}".format(criterion) self.criterion = criterion self.pruned_params = pruned_params self.pruned_ratios = pruned_ratios self.print_params = print_params def __call__(self, model): # FIXME: adapt to network graph when Training and inference are # inconsistent, now only supports prune inference network graph. 
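        # Note: paddleslim's dygraph_flops returns a raw FLOP count for the
        # given input_spec; the divisions by (1000**3) below report GFLOPs.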
        model.eval()

        paddleslim = try_import('paddleslim')
        from paddleslim.analysis import dygraph_flops as flops
        input_spec = [{
            "image": paddle.ones(
                shape=[1, 3, 640, 640], dtype='float32'),
            "im_shape": paddle.full(
                [1, 2], 640, dtype='float32'),
            "scale_factor": paddle.ones(
                shape=[1, 2], dtype='float32')
        }]
        if self.print_params:
            print_prune_params(model)

        ori_flops = flops(model, input_spec) / (1000**3)
        logger.info("FLOPs before pruning: {}GFLOPs".format(ori_flops))
        if self.criterion == 'fpgm':
            pruner = paddleslim.dygraph.FPGMFilterPruner(model, input_spec)
        elif self.criterion == 'l1_norm':
            pruner = paddleslim.dygraph.L1NormFilterPruner(model, input_spec)

        logger.info("pruned params: {}".format(self.pruned_params))
        pruned_ratios = [float(n) for n in self.pruned_ratios]
        ratios = {}
        for i, param in enumerate(self.pruned_params):
            ratios[param] = pruned_ratios[i]
        pruner.prune_vars(ratios, [0])

        pruned_flops = flops(model, input_spec) / (1000**3)
        logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format(
            pruned_flops, (ori_flops - pruned_flops) / ori_flops))

        return model


@register
@serializable
class PrunerQAT(object):
    def __init__(self, criterion, pruned_params, pruned_ratios,
                 print_prune_params, quant_config, print_qat_model):
        super(PrunerQAT, self).__init__()
        assert criterion in ['l1_norm', 'fpgm'], \
            "unsupported prune criterion: {}".format(criterion)
        # Pruner hyperparameter
        self.criterion = criterion
        self.pruned_params = pruned_params
        self.pruned_ratios = pruned_ratios
        self.print_prune_params = print_prune_params
        # QAT hyperparameter
        self.quant_config = quant_config
        self.print_qat_model = print_qat_model

    def __call__(self, model):
        # FIXME: adapt to network graph when training and inference are
        # inconsistent, now only supports pruning the inference network graph.
        model.eval()

        paddleslim = try_import('paddleslim')
        from paddleslim.analysis import dygraph_flops as flops
        input_spec = [{
            "image": paddle.ones(
                shape=[1, 3, 640, 640], dtype='float32'),
            "im_shape": paddle.full(
                [1, 2], 640, dtype='float32'),
            "scale_factor": paddle.ones(
                shape=[1, 2], dtype='float32')
        }]
        if self.print_prune_params:
            print_prune_params(model)

        # divide by 1000**3 so the logged value really is GFLOPs,
        # consistent with Pruner above
        ori_flops = flops(model, input_spec) / (1000**3)
        logger.info("FLOPs before pruning: {}GFLOPs".format(ori_flops))
        if self.criterion == 'fpgm':
            pruner = paddleslim.dygraph.FPGMFilterPruner(model, input_spec)
        elif self.criterion == 'l1_norm':
            pruner = paddleslim.dygraph.L1NormFilterPruner(model, input_spec)

        logger.info("pruned params: {}".format(self.pruned_params))
        pruned_ratios = [float(n) for n in self.pruned_ratios]
        ratios = {}
        for i, param in enumerate(self.pruned_params):
            ratios[param] = pruned_ratios[i]
        pruner.prune_vars(ratios, [0])

        pruned_flops = flops(model, input_spec) / (1000**3)
        logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format(
            pruned_flops, (ori_flops - pruned_flops) / ori_flops))

        self.quanter = paddleslim.dygraph.quant.QAT(config=self.quant_config)
        self.quanter.quantize(model)

        if self.print_qat_model:
            logger.info("Quantized model:")
            logger.info(model)

        return model

    def save_quantized_model(self, layer, path, input_spec=None, **config):
        self.quanter.save_quantized_model(
            model=layer, path=path, input_spec=input_spec, **config)


================================================
FILE: ppdet/slim/quant.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from paddle.utils import try_import from ppdet.core.workspace import register, serializable from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @register @serializable class QAT(object): def __init__(self, quant_config, print_model): super(QAT, self).__init__() self.quant_config = quant_config self.print_model = print_model def __call__(self, model): paddleslim = try_import('paddleslim') self.quanter = paddleslim.dygraph.quant.QAT(config=self.quant_config) if self.print_model: logger.info("Model before quant:") logger.info(model) # For PP-YOLOE, convert model to deploy firstly. for layer in model.sublayers(): if hasattr(layer, 'convert_to_deploy'): layer.convert_to_deploy() self.quanter.quantize(model) if self.print_model: logger.info("Quantized model:") logger.info(model) return model def save_quantized_model(self, layer, path, input_spec=None, **config): self.quanter.save_quantized_model( model=layer, path=path, input_spec=input_spec, **config) @register @serializable class PTQ(object): def __init__(self, ptq_config, quant_batch_num=10, output_dir='output_inference', fuse=True, fuse_list=None): super(PTQ, self).__init__() self.ptq_config = ptq_config self.quant_batch_num = quant_batch_num self.output_dir = output_dir self.fuse = fuse self.fuse_list = fuse_list def __call__(self, model): paddleslim = try_import('paddleslim') self.ptq = paddleslim.PTQ(**self.ptq_config) model.eval() quant_model = self.ptq.quantize( model, fuse=self.fuse, fuse_list=self.fuse_list) return quant_model def save_quantized_model(self, quant_model, quantize_model_path, input_spec=None): self.ptq.save_quantized_model(quant_model, quantize_model_path, input_spec) ================================================ FILE: ppdet/slim/unstructured_prune.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
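
# The UnstructuredPruner defined below maps epoch-based settings onto
# paddleslim's iteration-based GMP schedule. As a worked example with
# hypothetical values stable_epochs=1, pruning_epochs=5, tunning_epochs=4
# and steps_per_epoch=1000, the configs passed to GMPUnstructuredPruner
# become stable_iterations=1000, pruning_iterations=5000 and
# tunning_iterations=4000, sparsifying gradually from initial_ratio up to
# ratio over the pruning phase.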
from __future__ import absolute_import from __future__ import division from __future__ import print_function from paddle.utils import try_import from ppdet.core.workspace import register, serializable from ppdet.utils.logger import setup_logger logger = setup_logger(__name__) @register @serializable class UnstructuredPruner(object): def __init__(self, stable_epochs, pruning_epochs, tunning_epochs, pruning_steps, ratio, initial_ratio, prune_params_type=None): self.stable_epochs = stable_epochs self.pruning_epochs = pruning_epochs self.tunning_epochs = tunning_epochs self.ratio = ratio self.prune_params_type = prune_params_type self.initial_ratio = initial_ratio self.pruning_steps = pruning_steps def __call__(self, model, steps_per_epoch, skip_params_func=None): paddleslim = try_import('paddleslim') from paddleslim import GMPUnstructuredPruner configs = { 'pruning_strategy': 'gmp', 'stable_iterations': self.stable_epochs * steps_per_epoch, 'pruning_iterations': self.pruning_epochs * steps_per_epoch, 'tunning_iterations': self.tunning_epochs * steps_per_epoch, 'resume_iteration': 0, 'pruning_steps': self.pruning_steps, 'initial_ratio': self.initial_ratio, } pruner = GMPUnstructuredPruner( model, ratio=self.ratio, skip_params_func=skip_params_func, prune_params_type=self.prune_params_type, local_sparsity=True, configs=configs) return pruner ================================================ FILE: ppdet/utils/__init__.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
================================================
FILE: ppdet/utils/cam_utils.py
================================================
import numpy as np
import cv2
import os
import sys
import glob
from ppdet.utils.logger import setup_logger
import copy
logger = setup_logger('ppdet_cam')

import paddle
from ppdet.engine import Trainer


def get_test_images(infer_dir, infer_img):
    """
    Get image path list in TEST mode
    """
    assert infer_img is not None or infer_dir is not None, \
        "--infer_img or --infer_dir should be set"
    assert infer_img is None or os.path.isfile(infer_img), \
        "{} is not a file".format(infer_img)
    assert infer_dir is None or os.path.isdir(infer_dir), \
        "{} is not a directory".format(infer_dir)

    # infer_img has a higher priority
    if infer_img and os.path.isfile(infer_img):
        return [infer_img]

    images = set()
    infer_dir = os.path.abspath(infer_dir)
    assert os.path.isdir(infer_dir), \
        "infer_dir {} is not a directory".format(infer_dir)
    exts = ['jpg', 'jpeg', 'png', 'bmp']
    exts += [ext.upper() for ext in exts]
    for ext in exts:
        images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))
    images = list(images)

    assert len(images) > 0, "no image found in {}".format(infer_dir)
    logger.info("Found {} inference images in total.".format(len(images)))

    return images


def compute_ious(boxes1, boxes2):
    """Compute the pairwise IoU matrix for two sets of boxes.

    Args:
        boxes1 (numpy ndarray with shape (N, 4)): bounding boxes in
            (xmin, ymin, xmax, ymax) format.
        boxes2 (numpy ndarray with shape (M, 4)): bounding boxes in
            (xmin, ymin, xmax, ymax) format.

    Returns:
        Pairwise IoU matrix with shape (N, M), where the value at row i,
        column j holds the IoU between the i-th box of boxes1 and the
        j-th box of boxes2.
    """
    # lu with shape (N,M,2); boxes1[:,None,:2] has shape (N,1,2),
    # boxes2[:,:2] has shape (M,2)
    lu = np.maximum(boxes1[:, None, :2], boxes2[:, :2])
    rd = np.minimum(boxes1[:, None, 2:], boxes2[:, 2:])  # rd same shape as lu
    intersection_wh = np.maximum(0.0, rd - lu)
    intersection_area = intersection_wh[:, :, 0] * intersection_wh[:, :, 1]  # (N,M)
    boxes1_wh = np.maximum(0.0, boxes1[:, 2:] - boxes1[:, :2])
    boxes1_area = boxes1_wh[:, 0] * boxes1_wh[:, 1]  # (N,)
    boxes2_wh = np.maximum(0.0, boxes2[:, 2:] - boxes2[:, :2])
    boxes2_area = boxes2_wh[:, 0] * boxes2_wh[:, 1]  # (M,)
    union_area = np.maximum(
        boxes1_area[:, None] + boxes2_area - intersection_area, 1e-8)  # (N,M)
    ious = np.clip(intersection_area / union_area, 0.0, 1.0)
    return ious


def grad_cam(feat, grad):
    """
    Args:
        feat: CxHxW
        grad: CxHxW

    Returns:
        cam: HxW
    """
    exp = (feat * grad.mean((1, 2), keepdims=True)).mean(axis=0)
    exp = np.maximum(-exp, 0)
    return exp


def resize_cam(explanation, resize_shape) -> np.ndarray:
    """
    Args:
        explanation: 2D explanation heatmap with shape (height, width).
        resize_shape: target size as (width, height).

    Returns:
        The heatmap normalized, resized and colorized as a uint8 RGB image.
    """
    assert len(explanation.shape) == 2, f"{explanation.shape}. " \
        f"Currently support 2D explanation results for visualization. " \
        "Reduce higher dimensions to 2D for visualization."
explanation = (explanation - explanation.min()) / ( explanation.max() - explanation.min()) explanation = cv2.resize(explanation, resize_shape) explanation = np.uint8(255 * explanation) explanation = cv2.applyColorMap(explanation, cv2.COLORMAP_JET) explanation = cv2.cvtColor(explanation, cv2.COLOR_BGR2RGB) return explanation class BBoxCAM: def __init__(self, FLAGS, cfg): self.FLAGS = FLAGS self.cfg = cfg # build model self.trainer = self.build_trainer(cfg) # num_class self.num_class = cfg.num_classes # set hook for extraction of featuremaps and grads self.set_hook(cfg) self.nms_idx_need_divid_numclass_arch = [ 'FasterRCNN', 'MaskRCNN', 'CascadeRCNN' ] """ In these networks, the bbox array shape before nms contain num_class, the nms_keep_idx of the bbox need to divide the num_class; """ # cam image output_dir try: os.makedirs(FLAGS.cam_out) except: print('Path already exists.') pass def build_trainer(self, cfg): # build trainer trainer = Trainer(cfg, mode='test') # load weights trainer.load_weights(cfg.weights) # set for get extra_data before nms trainer.model.use_extra_data = True # set for record the bbox index before nms if cfg.architecture in ['FasterRCNN', 'MaskRCNN']: trainer.model.bbox_post_process.nms.return_index = True elif cfg.architecture in ['YOLOv3', 'PPYOLOE', 'PPYOLOEWithAuxHead']: if trainer.model.post_process is not None: # anchor based YOLOs: YOLOv3,PP-YOLO trainer.model.post_process.nms.return_index = True else: # anchor free YOLOs: PP-YOLOE, PP-YOLOE+ trainer.model.yolo_head.nms.return_index = True elif cfg.architecture == 'BlazeFace' or cfg.architecture == 'SSD': trainer.model.post_process.nms.return_index = True elif cfg.architecture == 'RetinaNet': trainer.model.head.nms.return_index = True else: print(cfg.architecture + ' is not supported for cam temporarily!') sys.exit() # Todo: Unify the head/post_process name in each model return trainer def set_hook(self, cfg): # set hook for extraction of featuremaps and grads self.target_feats = {} self.target_layer_name = cfg.target_feature_layer_name # such as trainer.model.backbone, trainer.model.bbox_head.roi_extractor def hook(layer, input, output): self.target_feats[layer._layer_name_for_hook] = output try: exec('self.trainer.' + self.target_layer_name + '._layer_name_for_hook = self.target_layer_name') # self.trainer.target_layer_name._layer_name_for_hook = self.target_layer_name exec('self.trainer.' + self.target_layer_name + '.register_forward_post_hook(hook)') # self.trainer.target_layer_name.register_forward_post_hook(hook) except: print("Error! " "The target_layer_name--" + self.target_layer_name + " is not in model! 
" "Please check the spelling and " "the network's architecture!") sys.exit() def get_bboxes(self): # get inference images images = get_test_images(self.FLAGS.infer_dir, self.FLAGS.infer_img) # inference result = self.trainer.predict( images, draw_threshold=self.FLAGS.draw_threshold, output_dir=self.FLAGS.output_dir, save_results=self.FLAGS.save_results, visualize=False)[0] return result def get_bboxes_cams(self): # Get the bboxes prediction(after nms result) of the input inference_result = self.get_bboxes() # read input image # Todo: Support folder multi-images process from PIL import Image img = np.array(Image.open(self.cfg.infer_img)) # data for calaulate bbox grad_cam extra_data = inference_result['extra_data'] """ Example of Faster_RCNN based architecture: extra_data: {'scores': tensor with shape [num_of_bboxes_before_nms, num_classes], for example: [1000, 80] 'nms_keep_idx': tensor with shape [num_of_bboxes_after_nms, 1], for example: [300, 1] } Example of YOLOv3 based architecture: extra_data: {'scores': tensor with shape [1, num_classes, num_of_yolo_bboxes_before_nms], #for example: [1, 80, 8400] 'nms_keep_idx': tensor with shape [num_of_yolo_bboxes_after_nms, 1], # for example: [300, 1] } """ # array index of the predicted bbox before nms if self.cfg.architecture in self.nms_idx_need_divid_numclass_arch: # some network's bbox array shape before nms may be like [num_of_bboxes_before_nms, num_classes, 4], # we need to divide num_classes to get the before_nms_index; # currently, only include the rcnn architectures (fasterrcnn, maskrcnn, cascadercnn); before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy( ) // self.num_class # num_class else: before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy() # Calculate and visualize the heatmap of per predict bbox for index, target_bbox in enumerate(inference_result['bbox']): # target_bbox: [cls, score, x1, y1, x2, y2] # filter bboxes with low predicted scores if target_bbox[1] < self.FLAGS.draw_threshold: continue target_bbox_before_nms = int(before_nms_indexes[index]) if len(extra_data['scores'].shape) == 2: score_out = extra_data['scores'][target_bbox_before_nms] else: score_out = extra_data['scores'][0, :, target_bbox_before_nms] """ There are two kinds array shape of bbox score output : 1) [num_of_bboxes_before_nms, num_classes], for example: [1000, 80] 2) [num_of_image, num_classes, num_of_yolo_bboxes_before_nms], for example: [1, 80, 1000] """ # construct one_hot label and do backward to get the gradients predicted_label = paddle.argmax(score_out) label_onehot = paddle.nn.functional.one_hot( predicted_label, num_classes=len(score_out)) label_onehot = label_onehot.squeeze() target = paddle.sum(score_out * label_onehot) target.backward(retain_graph=True) if 'backbone' in self.target_layer_name or \ 'neck' in self.target_layer_name: # backbone/neck level feature if isinstance(self.target_feats[self.target_layer_name], list): # when the featuremap contains of multiple scales, # take the featuremap of the last scale # Todo: fuse the cam result from multisclae featuremaps if self.target_feats[self.target_layer_name][-1].shape[ -1] == 1: """ if the last level featuremap is 1x1 size, we take the second last one """ cam_grad = self.target_feats[self.target_layer_name][ -2].grad.squeeze().cpu().numpy() cam_feat = self.target_feats[self.target_layer_name][ -2].squeeze().cpu().numpy() else: cam_grad = self.target_feats[self.target_layer_name][ -1].grad.squeeze().cpu().numpy() cam_feat = self.target_feats[self.target_layer_name][ 
-1].squeeze().cpu().numpy() else: cam_grad = self.target_feats[ self.target_layer_name].grad.squeeze().cpu().numpy() cam_feat = self.target_feats[ self.target_layer_name].squeeze().cpu().numpy() else: # roi level feature cam_grad = self.target_feats[ self.target_layer_name].grad.squeeze().cpu().numpy()[ target_bbox_before_nms] cam_feat = self.target_feats[self.target_layer_name].squeeze( ).cpu().numpy()[target_bbox_before_nms] # grad_cam: exp = grad_cam(cam_feat, cam_grad) if 'backbone' in self.target_layer_name or \ 'neck' in self.target_layer_name: """ when use backbone/neck featuremap, we first do the cam on whole image, and then set the area outside the predic bbox to 0 """ # reshape the cam image to the input image size resized_exp = resize_cam(exp, (img.shape[1], img.shape[0])) mask = np.zeros((img.shape[0], img.shape[1], 3)) mask[int(target_bbox[3]):int(target_bbox[5]), int(target_bbox[ 2]):int(target_bbox[4]), :] = 1 resized_exp = resized_exp * mask # add the bbox cam back to the input image overlay_vis = np.uint8(resized_exp * 0.4 + img * 0.6) elif 'roi' in self.target_layer_name: # get the bbox part of the image bbox_img = copy.deepcopy(img[int(target_bbox[3]):int( target_bbox[5]), int(target_bbox[2]):int(target_bbox[ 4]), :]) # reshape the cam image to the bbox size resized_exp = resize_cam(exp, (bbox_img.shape[1], bbox_img.shape[0])) # add the bbox cam back to the bbox image bbox_overlay_vis = np.uint8(resized_exp * 0.4 + bbox_img * 0.6) # put the bbox_cam image to the original image overlay_vis = copy.deepcopy(img) overlay_vis[int(target_bbox[3]):int(target_bbox[5]), int( target_bbox[2]):int(target_bbox[4]), :] = bbox_overlay_vis else: print( 'Only supported cam for backbone/neck feature and roi feature, the others are not supported temporarily!' ) sys.exit() # put the bbox rectangle on image cv2.rectangle( overlay_vis, (int(target_bbox[2]), int(target_bbox[3])), (int(target_bbox[4]), int(target_bbox[5])), (0, 0, 255), 2) # save visualization result cam_image = Image.fromarray(overlay_vis) cam_image.save(self.FLAGS.cam_out + '/' + str(index) + '.jpg') # clear gradients after each bbox grad_cam target.clear_gradient() for n, v in self.trainer.model.named_sublayers(): v.clear_gradients() ================================================ FILE: ppdet/utils/check.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import sys import paddle import six import paddle.version as paddle_version from .logger import setup_logger logger = setup_logger(__name__) __all__ = [ 'check_gpu', 'check_npu', 'check_xpu', 'check_mlu', 'check_version', 'check_config' ] def check_mlu(use_mlu): """ Log error and exit when set use_mlu=true in paddlepaddle cpu/gpu/xpu/npu version. """ err = "Config use_mlu cannot be set as true while you are " \ "using paddlepaddle cpu/gpu/xpu/npu version ! 
\nPlease try: \n" \ "\t1. Install paddlepaddle-mlu to run model on MLU \n" \ "\t2. Set use_mlu as false in config file to run " \ "model on CPU/GPU/XPU/NPU" try: if use_mlu and not paddle.is_compiled_with_mlu(): logger.error(err) sys.exit(1) except Exception as e: pass def check_npu(use_npu): """ Log error and exit when set use_npu=true in paddlepaddle version without paddle-custom-npu installed. """ err = "Config use_npu cannot be set as true while you are " \ "using paddlepaddle version without paddle-custom-npu " \ "installed! \nPlease try: \n" \ "\t1. Install paddle-custom-npu to run model on NPU \n" \ "\t2. Set use_npu as false in config file to run " \ "model on other devices supported." try: if use_npu and not 'npu' in paddle.device.get_all_custom_device_type(): logger.error(err) sys.exit(1) except Exception as e: pass def check_xpu(use_xpu): """ Log error and exit when set use_xpu=true in paddlepaddle cpu/gpu/npu version. """ err = "Config use_xpu cannot be set as true while you are " \ "using paddlepaddle cpu/gpu/npu version ! \nPlease try: \n" \ "\t1. Install paddlepaddle-xpu to run model on XPU \n" \ "\t2. Set use_xpu as false in config file to run " \ "model on CPU/GPU/NPU" try: if use_xpu and not paddle.is_compiled_with_xpu(): logger.error(err) sys.exit(1) except Exception as e: pass def check_gpu(use_gpu): """ Log error and exit when set use_gpu=true in paddlepaddle cpu version. """ err = "Config use_gpu cannot be set as true while you are " \ "using paddlepaddle cpu version ! \nPlease try: \n" \ "\t1. Install paddlepaddle-gpu to run model on GPU \n" \ "\t2. Set use_gpu as false in config file to run " \ "model on CPU" try: if use_gpu and not paddle.is_compiled_with_cuda(): logger.error(err) sys.exit(1) except Exception as e: pass def check_version(version='2.2'): """ Log error and exit when the installed version of paddlepaddle is not satisfied. """ err = "PaddlePaddle version {} or higher is required, " \ "or a suitable develop version is satisfied as well. \n" \ "Please make sure the version is good with your code.".format(version) version_installed = [ paddle_version.major, paddle_version.minor, paddle_version.patch, paddle_version.rc ] if version_installed == ['0', '0', '0', '0']: return version_split = version.split('.') length = min(len(version_installed), len(version_split)) for i in six.moves.range(length): if version_installed[i] > version_split[i]: return if version_installed[i] < version_split[i]: raise Exception(err) def check_config(cfg): """ Check the correctness of the configuration file. Log error and exit when Config is not compliant. """ err = "'{}' not specified in config file. Please set it in config file." check_list = ['architecture', 'num_classes'] try: for var in check_list: if not var in cfg: logger.error(err.format(var)) sys.exit(1) except Exception as e: pass if 'log_iter' not in cfg: cfg.log_iter = 20 return cfg ================================================ FILE: ppdet/utils/checkpoint.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import os import json import numpy as np import paddle import paddle.nn as nn from .download import get_weights_path from .logger import setup_logger logger = setup_logger(__name__) def convert_to_dict(obj): if isinstance(obj, dict): return {k: convert_to_dict(v) for k, v in obj.items()} elif isinstance(obj, list): return [convert_to_dict(i) for i in obj] else: return obj def is_url(path): """ Whether path is URL. Args: path (string): URL string or not. """ return path.startswith('http://') \ or path.startswith('https://') \ or path.startswith('ppdet://') def _strip_postfix(path): path, ext = os.path.splitext(path) assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \ "Unknown postfix {} from weights".format(ext) return path def load_weight(model, weight, optimizer=None, ema=None, exchange=True): if is_url(weight): weight = get_weights_path(weight) path = _strip_postfix(weight) pdparam_path = path + '.pdparams' if not os.path.exists(pdparam_path): raise ValueError("Model pretrain path {} does not " "exists.".format(pdparam_path)) if ema is not None and os.path.exists(path + '.pdema'): if exchange: # Exchange model and ema_model to load logger.info('Exchange model and ema_model to load:') ema_state_dict = paddle.load(pdparam_path) logger.info('Loading ema_model weights from {}'.format(path + '.pdparams')) param_state_dict = paddle.load(path + '.pdema') logger.info('Loading model weights from {}'.format(path + '.pdema')) else: ema_state_dict = paddle.load(path + '.pdema') logger.info('Loading ema_model weights from {}'.format(path + '.pdema')) param_state_dict = paddle.load(pdparam_path) logger.info('Loading model weights from {}'.format(path + '.pdparams')) else: ema_state_dict = None param_state_dict = paddle.load(pdparam_path) if hasattr(model, 'modelTeacher') and hasattr(model, 'modelStudent'): print('Loading pretrain weights for Teacher-Student framework.') print('Loading pretrain weights for Student model.') student_model_dict = model.modelStudent.state_dict() student_param_state_dict = match_state_dict( student_model_dict, param_state_dict, mode='student') model.modelStudent.set_dict(student_param_state_dict) print('Loading pretrain weights for Teacher model.') teacher_model_dict = model.modelTeacher.state_dict() teacher_param_state_dict = match_state_dict( teacher_model_dict, param_state_dict, mode='teacher') model.modelTeacher.set_dict(teacher_param_state_dict) else: model_dict = model.state_dict() model_weight = {} incorrect_keys = 0 for key in model_dict.keys(): if key in param_state_dict.keys(): model_weight[key] = param_state_dict[key] else: logger.info('Unmatched key: {}'.format(key)) incorrect_keys += 1 assert incorrect_keys == 0, "Load weight {} incorrectly, \ {} keys unmatched, please check again.".format(weight, incorrect_keys) logger.info('Finish resuming model weights: {}'.format(pdparam_path)) model.set_dict(model_weight) last_epoch = 0 if optimizer is not None and os.path.exists(path + '.pdopt'): optim_state_dict = paddle.load(path + '.pdopt') # to solve resume bug, will it be fixed in paddle 2.0 for key in optimizer.state_dict().keys(): if not key in optim_state_dict.keys(): optim_state_dict[key] = optimizer.state_dict()[key] if 'last_epoch' in optim_state_dict: last_epoch = optim_state_dict.pop('last_epoch') 
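# The loop above backfills every optimizer key that is missing from the
# loaded .pdopt dict with its freshly initialized value, so the
# set_state_dict call below does not fail on a partial optimizer state
# (the "resume bug" mentioned above).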
optimizer.set_state_dict(optim_state_dict) if ema_state_dict is not None: ema.resume(ema_state_dict, optim_state_dict['LR_Scheduler']['last_epoch']) elif ema_state_dict is not None: ema.resume(ema_state_dict) return last_epoch def match_state_dict(model_state_dict, weight_state_dict, mode='default'): """ Match between the model state dict and pretrained weight state dict. Return the matched state dict. The method assumes that every name in the pretrained weight state dict is a sub-name of some name in the model state dict, once the prefix 'backbone.' is stripped from the pretrained weight keys. This gives the candidates for each model key, and we select the candidate with the longest matched length as the final match. For example, the model state dict has the name of 'backbone.res2.res2a.branch2a.conv.weight' and the pretrained weight as name of 'res2.res2a.branch2a.conv.weight' and 'branch2a.conv.weight'. We match the 'res2.res2a.branch2a.conv.weight' to the model key. """ model_keys = sorted(model_state_dict.keys()) weight_keys = sorted(weight_state_dict.keys()) def teacher_match(a, b): # skip student params if b.startswith('modelStudent'): return False return a == b or a.endswith("." + b) or b.endswith("." + a) def student_match(a, b): # skip teacher params if b.startswith('modelTeacher'): return False return a == b or a.endswith("." + b) or b.endswith("." + a) def match(a, b): if b.startswith('backbone.res5'): b = b[9:] return a == b or a.endswith("." + b) if mode == 'student': match_op = student_match elif mode == 'teacher': match_op = teacher_match else: match_op = match match_matrix = np.zeros([len(model_keys), len(weight_keys)]) for i, m_k in enumerate(model_keys): for j, w_k in enumerate(weight_keys): if match_op(m_k, w_k): match_matrix[i, j] = len(w_k) max_id = match_matrix.argmax(1) max_len = match_matrix.max(1) max_id[max_len == 0] = -1 load_id = set(max_id) load_id.discard(-1) not_load_weight_name = [] if weight_keys[0].startswith('modelStudent') or weight_keys[0].startswith( 'modelTeacher'): for match_idx in range(len(max_id)): if max_id[match_idx] == -1: not_load_weight_name.append(model_keys[match_idx]) if len(not_load_weight_name) > 0: logger.info('{} in model is not matched with pretrained weights, ' 'and it will be trained from scratch'.format( not_load_weight_name)) else: for idx in range(len(weight_keys)): if idx not in load_id: not_load_weight_name.append(weight_keys[idx]) if len(not_load_weight_name) > 0: logger.info('{} in pretrained weight is not used in the model, ' 'and it will not be loaded'.format( not_load_weight_name)) matched_keys = {} result_state_dict = {} for model_id, weight_id in enumerate(max_id): if weight_id == -1: continue model_key = model_keys[model_id] weight_key = weight_keys[weight_id] weight_value = weight_state_dict[weight_key] model_value_shape = list(model_state_dict[model_key].shape) if list(weight_value.shape) != model_value_shape: logger.info( 'The shape {} in pretrained weight {} does not match ' 'the shape {} in model {}. 
And the weight {} will not be ' 'loaded'.format(weight_value.shape, weight_key, model_value_shape, model_key, weight_key)) continue assert model_key not in result_state_dict result_state_dict[model_key] = weight_value if weight_key in matched_keys: raise ValueError('Ambiguity weight {} loaded, it matches at least ' '{} and {} in the model'.format( weight_key, model_key, matched_keys[ weight_key])) matched_keys[weight_key] = model_key return result_state_dict def load_pretrain_weight(model, pretrain_weight, ARSL_eval=False): if is_url(pretrain_weight): pretrain_weight = get_weights_path(pretrain_weight) path = _strip_postfix(pretrain_weight) if not (os.path.isdir(path) or os.path.isfile(path) or os.path.exists(path + '.pdparams')): raise ValueError("Model pretrain path `{}` does not exists. " "If you don't want to load pretrain model, " "please delete `pretrain_weights` field in " "config file.".format(path)) teacher_student_flag = False if not ARSL_eval: if hasattr(model, 'modelTeacher') and hasattr(model, 'modelStudent'): print('Loading pretrain weights for Teacher-Student framework.') print( 'Assert Teacher model has the same structure with Student model.' ) model_dict = model.modelStudent.state_dict() teacher_student_flag = True else: model_dict = model.state_dict() weights_path = path + '.pdparams' param_state_dict = paddle.load(weights_path) param_state_dict = match_state_dict(model_dict, param_state_dict) for k, v in param_state_dict.items(): if isinstance(v, np.ndarray): v = paddle.to_tensor(v) if model_dict[k].dtype != v.dtype: param_state_dict[k] = v.astype(model_dict[k].dtype) if teacher_student_flag: model.modelStudent.set_dict(param_state_dict) model.modelTeacher.set_dict(param_state_dict) else: model.set_dict(param_state_dict) logger.info('Finish loading model weights: {}'.format(weights_path)) else: weights_path = path + '.pdparams' param_state_dict = paddle.load(weights_path) student_model_dict = model.modelStudent.state_dict() student_param_state_dict = match_state_dict( student_model_dict, param_state_dict, mode='student') model.modelStudent.set_dict(student_param_state_dict) print('Loading pretrain weights for Teacher model.') teacher_model_dict = model.modelTeacher.state_dict() teacher_param_state_dict = match_state_dict( teacher_model_dict, param_state_dict, mode='teacher') model.modelTeacher.set_dict(teacher_param_state_dict) logger.info('Finish loading model weights: {}'.format(weights_path)) def save_model(model, optimizer, save_dir, save_name, last_epoch, ema_model=None): """ save model into disk. Args: model (dict): the model state_dict to save parameters. optimizer (paddle.optimizer.Optimizer): the Optimizer instance to save optimizer states. save_dir (str): the directory to be saved. save_name (str): the path to be saved. last_epoch (int): the epoch index. ema_model (dict|None): the ema_model state_dict to save parameters. 
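Example (a minimal sketch; the save_dir and epoch values are hypothetical):
    save_model(model, optimizer, save_dir='output/rtdetrv3_r50vd_6x_coco',
               save_name='model_final', last_epoch=71)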
""" if paddle.distributed.get_rank() != 0: return save_dir = os.path.normpath(save_dir) if not os.path.exists(save_dir): os.makedirs(save_dir) if save_name == "best_model": best_model_path = os.path.join(save_dir, 'best_model') if not os.path.exists(best_model_path): os.makedirs(best_model_path) save_path = os.path.join(save_dir, save_name) # save model if isinstance(model, nn.Layer): paddle.save(model.state_dict(), save_path + ".pdparams") best_model = model.state_dict() else: assert isinstance(model, dict), 'model is not a instance of nn.layer or dict' if ema_model is None: paddle.save(model, save_path + ".pdparams") best_model = model else: assert isinstance(ema_model, dict), ("ema_model is not a instance of dict, " "please call model.state_dict() to get.") # Exchange model and ema_model to save paddle.save(ema_model, save_path + ".pdparams") paddle.save(model, save_path + ".pdema") best_model = ema_model if save_name == 'best_model': best_model_path = os.path.join(best_model_path, 'model') paddle.save(best_model, best_model_path + ".pdparams") # save optimizer state_dict = optimizer.state_dict() state_dict['last_epoch'] = last_epoch paddle.save(state_dict, save_path + ".pdopt") logger.info("Save checkpoint: {}".format(save_dir)) def save_semi_model(teacher_model, student_model, optimizer, save_dir, save_name, last_epoch, last_iter): """ save teacher and student model into disk. Args: teacher_model (dict): the teacher_model state_dict to save parameters. student_model (dict): the student_model state_dict to save parameters. optimizer (paddle.optimizer.Optimizer): the Optimizer instance to save optimizer states. save_dir (str): the directory to be saved. save_name (str): the path to be saved. last_epoch (int): the epoch index. last_iter (int): the iter index. 
""" if paddle.distributed.get_rank() != 0: return assert isinstance(teacher_model, dict), ( "teacher_model is not a instance of dict, " "please call teacher_model.state_dict() to get.") assert isinstance(student_model, dict), ( "student_model is not a instance of dict, " "please call student_model.state_dict() to get.") if not os.path.exists(save_dir): os.makedirs(save_dir) save_path = os.path.join(save_dir, save_name) # save model paddle.save(teacher_model, save_path + str(last_epoch) + "epoch_t.pdparams") paddle.save(student_model, save_path + str(last_epoch) + "epoch_s.pdparams") # save optimizer state_dict = optimizer.state_dict() state_dict['last_epoch'] = last_epoch state_dict['last_iter'] = last_iter paddle.save(state_dict, save_path + str(last_epoch) + "epoch.pdopt") logger.info("Save checkpoint: {}".format(save_dir)) def save_model_info(model_info, save_path, prefix): """ save model info to the target path """ save_path = os.path.join(save_path, prefix) if not os.path.exists(save_path): os.makedirs(save_path) with open(os.path.join(save_path, f'{prefix}.info.json'), 'w') as f: json.dump(model_info, f) logger.info("Already save model info in {}".format(save_path)) def update_train_results(config, prefix, metric_info, done_flag=False, last_num=5, ema=False): if paddle.distributed.get_rank() != 0: return assert last_num >= 1 train_results_path = os.path.join(config["save_dir"], "train_result.json") save_model_tag = ["pdparams", "pdopt", "pdstates"] save_inference_tag = [ "inference_config", "pdmodel", "pdiparams", "pdiparams.info" ] if ema: save_model_tag.append("pdema") if os.path.exists(train_results_path): with open(train_results_path, "r") as fp: train_results = json.load(fp) else: train_results = {} train_results["model_name"] = config["pdx_model_name"] train_results["label_dict"] = "" train_results["visualdl_log"] = "" train_results["train_log"] = "train.log" train_results["config"] = "config.yaml" train_results["models"] = {} for i in range(1, last_num + 1): train_results["models"][f"last_{i}"] = {} train_results["models"]["best"] = {} train_results["done_flag"] = done_flag if prefix == "best_model": train_results["models"]["best"]["score"] = metric_info["metric"] for tag in save_model_tag: train_results["models"]["best"][tag] = os.path.join( prefix, f"{prefix}.{tag}") for tag in save_inference_tag: train_results["models"]["best"][tag] = os.path.join( prefix, "inference", f"inference.{tag}" if tag != "inference_config" else "inference.yml") else: for i in range(last_num - 1, 0, -1): train_results["models"][f"last_{i + 1}"] = train_results["models"][ f"last_{i}"].copy() train_results["models"][f"last_{1}"]["score"] = metric_info["metric"] for tag in save_model_tag: train_results["models"][f"last_{1}"][tag] = os.path.join( prefix, f"{prefix}.{tag}") for tag in save_inference_tag: train_results["models"][f"last_{1}"][tag] = os.path.join( prefix, "inference", f"inference.{tag}" if tag != "inference_config" else "inference.yml") with open(train_results_path, "w") as fp: json.dump(train_results, fp) ================================================ FILE: ppdet/utils/cli.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from argparse import ArgumentParser, RawDescriptionHelpFormatter import yaml import re from ppdet.core.workspace import get_registered_modules, dump_value __all__ = ['ColorTTY', 'ArgsParser'] class ColorTTY(object): def __init__(self): super(ColorTTY, self).__init__() self.colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan'] def __getattr__(self, attr): if attr in self.colors: color = self.colors.index(attr) + 31 def color_message(message): return "\033[{}m{}\033[0m".format(color, message) setattr(self, attr, color_message) return color_message def bold(self, message): return self.with_code('01', message) def with_code(self, code, message): return "\033[{}m{}\033[0m".format(code, message) class ArgsParser(ArgumentParser): def __init__(self): super(ArgsParser, self).__init__( formatter_class=RawDescriptionHelpFormatter) self.add_argument("-c", "--config", help="configuration file to use") self.add_argument( "-o", "--opt", nargs='*', help="set configuration options") def parse_args(self, argv=None): args = super(ArgsParser, self).parse_args(argv) assert args.config is not None, \ "Please specify --config=configure_file_path." args.opt = self._parse_opt(args.opt) return args def _parse_opt(self, opts): config = {} if not opts: return config for s in opts: s = s.strip() k, v = s.split('=', 1) if '.' not in k: config[k] = yaml.load(v, Loader=yaml.Loader) else: keys = k.split('.') if keys[0] not in config: config[keys[0]] = {} cur = config[keys[0]] for idx, key in enumerate(keys[1:]): if idx == len(keys) - 2: cur[key] = yaml.load(v, Loader=yaml.Loader) else: cur[key] = {} cur = cur[key] return config def merge_args(config, args, exclude_args=['config', 'opt', 'slim_config']): for k, v in vars(args).items(): if k not in exclude_args: config[k] = v return config def print_total_cfg(config): modules = get_registered_modules() color_tty = ColorTTY() green = '___{}___'.format(color_tty.colors.index('green') + 31) styled = {} for key in config.keys(): if not config[key]: # empty schema continue if key not in modules and not hasattr(config[key], '__dict__'): styled[key] = config[key] continue elif key in modules: module = modules[key] else: type_name = type(config[key]).__name__ if type_name in modules: module = modules[type_name].copy() module.update({ k: v for k, v in config[key].__dict__.items() if k in module.schema }) key += " ({})".format(type_name) default = module.find_default_keys() missing = module.find_missing_keys() mismatch = module.find_mismatch_keys() extra = module.find_extra_keys() dep_missing = [] for dep in module.inject: if isinstance(module[dep], str) and module[dep] != '': if module[dep] not in modules: # not a valid module dep_missing.append(dep) else: dep_mod = modules[module[dep]] # empty dict but mandatory if not dep_mod and dep_mod.mandatory(): dep_missing.append(dep) override = list( set(module.keys()) - set(default) - set(extra) - set(dep_missing)) replacement = {} for name in set(override + default + extra + mismatch + missing): new_name = name if name in missing: value = "<missing>" else: value = module[name] if name in extra: value = dump_value(value) + " <extra>" elif name in mismatch: value = dump_value(value) + " <mismatch>" elif name in dep_missing: value = dump_value(value) + " <dep missing>" elif name in override and value != '': mark = green new_name = mark + name replacement[new_name] = value styled[key] = replacement buffer = yaml.dump(styled, default_flow_style=False, default_style='') buffer = (re.sub(r"<missing>", "\033[31m<missing>\033[0m", buffer)) buffer = (re.sub(r"<extra>", "\033[33m<extra>\033[0m", buffer)) buffer = (re.sub(r"<dep missing>", "\033[31m<dep missing>\033[0m", buffer)) buffer = (re.sub(r"<mismatch>", "\033[31m<mismatch>\033[0m", buffer)) buffer = re.sub(r"___(\d+)___(.*?):", "\033[\\1m\\2\033[0m:", buffer) print(buffer) ================================================ FILE: ppdet/utils/colormap.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import numpy as np def colormap(rgb=False): """ Get colormap. The code of this function is copied from https://github.com/facebookresearch/Detectron/blob/main/detectron/utils/colormap.py """ color_list = np.array([ 0.000, 0.447, 0.741, 0.850, 0.325, 0.098, 0.929, 0.694, 0.125, 0.494, 0.184, 0.556, 0.466, 0.674, 0.188, 0.301, 0.745, 0.933, 0.635, 0.078, 0.184, 0.300, 0.300, 0.300, 0.600, 0.600, 0.600, 1.000, 0.000, 0.000, 1.000, 0.500, 0.000, 0.749, 0.749, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 1.000, 0.667, 0.000, 1.000, 0.333, 0.333, 0.000, 0.333, 0.667, 0.000, 0.333, 1.000, 0.000, 0.667, 0.333, 0.000, 0.667, 0.667, 0.000, 0.667, 1.000, 0.000, 1.000, 0.333, 0.000, 1.000, 0.667, 0.000, 1.000, 1.000, 0.000, 0.000, 0.333, 0.500, 0.000, 0.667, 0.500, 0.000, 1.000, 0.500, 0.333, 0.000, 0.500, 0.333, 0.333, 0.500, 0.333, 0.667, 0.500, 0.333, 1.000, 0.500, 0.667, 0.000, 0.500, 0.667, 0.333, 0.500, 0.667, 0.667, 0.500, 0.667, 1.000, 0.500, 1.000, 0.000, 0.500, 1.000, 0.333, 0.500, 1.000, 0.667, 0.500, 1.000, 1.000, 0.500, 0.000, 0.333, 1.000, 0.000, 0.667, 1.000, 0.000, 1.000, 1.000, 0.333, 0.000, 1.000, 0.333, 0.333, 1.000, 0.333, 0.667, 1.000, 0.333, 1.000, 1.000, 0.667, 0.000, 1.000, 0.667, 0.333, 1.000, 0.667, 0.667, 1.000, 0.667, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 0.333, 1.000, 1.000, 0.667, 1.000, 0.167, 0.000, 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.143, 0.143, 0.143, 0.286, 0.286, 0.286, 0.429, 0.429, 0.429, 0.571, 0.571, 0.571, 0.714, 0.714, 0.714, 0.857, 0.857, 0.857, 1.000, 1.000, 1.000 ]).astype(np.float32) color_list = color_list.reshape((-1, 3)) * 255 if not rgb: color_list = color_list[:, ::-1] return color_list.astype('int32') ================================================ FILE: ppdet/utils/compact.py
================================================ import PIL def imagedraw_textsize_c(draw, text, font=None): if int(PIL.__version__.split('.')[0]) < 10: tw, th = draw.textsize(text, font=font) else: left, top, right, bottom = draw.textbbox((0, 0), text, font=font) tw, th = right - left, bottom - top return tw, th ================================================ FILE: ppdet/utils/download.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import os.path as osp import sys import yaml import time import shutil import requests import tqdm import hashlib import base64 import binascii import tarfile import zipfile import errno from paddle.utils.download import _get_unique_endpoints from ppdet.core.workspace import BASE_KEY from .logger import setup_logger from .voc_utils import create_list logger = setup_logger(__name__) __all__ = [ 'get_weights_path', 'get_dataset_path', 'get_config_path', 'download_dataset', 'create_voc_list' ] WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/weights") DATASET_HOME = osp.expanduser("~/.cache/paddle/dataset") CONFIGS_HOME = osp.expanduser("~/.cache/paddle/configs") # dict of {dataset_name: (download_info, sub_dirs)} # download info: [(url, md5sum)] DATASETS = { 'coco': ([ ( 'http://images.cocodataset.org/zips/train2017.zip', 'cced6f7f71b7629ddf16f17bbcfab6b2', ), ( 'http://images.cocodataset.org/zips/val2017.zip', '442b8da7639aecaf257c1dceb8ba8c80', ), ( 'http://images.cocodataset.org/annotations/annotations_trainval2017.zip', 'f4bbac642086de4f52a3fdda2de5fa2c', ), ], ["annotations", "train2017", "val2017"]), 'voc': ([ ( 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar', '6cd6e144f989b92b3379bac3b3de84fd', ), ( 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar', 'c52e279531787c972589f7e41ab4ae64', ), ( 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar', 'b6e924de25625d8de591ea690078ad9f', ), ( 'https://paddledet.bj.bcebos.com/data/label_list.txt', '5ae5d62183cfb6f6d3ac109359d06a1b', ), ], ["VOCdevkit/VOC2012", "VOCdevkit/VOC2007"]), 'wider_face': ([ ( 'https://dataset.bj.bcebos.com/wider_face/WIDER_train.zip', '3fedf70df600953d25982bcd13d91ba2', ), ( 'https://dataset.bj.bcebos.com/wider_face/WIDER_val.zip', 'dfa7d7e790efa35df3788964cf0bbaea', ), ( 'https://dataset.bj.bcebos.com/wider_face/wider_face_split.zip', 'a4a898d6193db4b9ef3260a68bad0dc7', ), ], ["WIDER_train", "WIDER_val", "wider_face_split"]), 'fruit': ([( 'https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit.tar', 'baa8806617a54ccf3685fa7153388ae6', ), ], ['Annotations', 'JPEGImages']), 'roadsign_voc': ([( 'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_voc.tar', '8d629c0f880dd8b48de9aeff44bf1f3e', ), ], ['annotations', 'images']), 'roadsign_coco': ([( 
'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_coco.tar', '49ce5a9b5ad0d6266163cd01de4b018e', ), ], ['annotations', 'images']), 'spine_coco': ([( 'https://paddledet.bj.bcebos.com/data/spine.tar', '8a3a353c2c54a2284ad7d2780b65f6a6', ), ], ['annotations', 'images']), 'coco_ce': ([( 'https://paddledet.bj.bcebos.com/data/coco_ce.tar', 'eadd1b79bc2f069f2744b1dd4e0c0329', ), ], []), 'culane': ([('https://bj.bcebos.com/v1/paddledet/data/culane.tar', None, ), ], []) } DOWNLOAD_DATASETS_LIST = DATASETS.keys() DOWNLOAD_RETRY_LIMIT = 3 PPDET_WEIGHTS_DOWNLOAD_URL_PREFIX = 'https://paddledet.bj.bcebos.com/' # When running unit tests, there could be multiple processes that # trying to create DATA_HOME directory simultaneously, so we cannot # use a if condition to check for the existence of the directory; # instead, we use the filesystem as the synchronization mechanism by # catching returned errors. def must_mkdirs(path): try: os.makedirs(path) except OSError as exc: if exc.errno != errno.EEXIST: raise pass def parse_url(url): url = url.replace("ppdet://", PPDET_WEIGHTS_DOWNLOAD_URL_PREFIX) return url def get_weights_path(url): """Get weights path from WEIGHTS_HOME, if not exists, download it from url. """ url = parse_url(url) path, _ = get_path(url, WEIGHTS_HOME) return path def get_config_path(url): """Get weights path from CONFIGS_HOME, if not exists, download it from url. """ url = parse_url(url) path = map_path(url, CONFIGS_HOME, path_depth=2) if os.path.isfile(path): return path # config file not found, try download # 1. clear configs directory if osp.isdir(CONFIGS_HOME): shutil.rmtree(CONFIGS_HOME) # 2. get url try: from ppdet import __version__ as version except ImportError: version = None cfg_url = "ppdet://configs/{}/configs.tar".format(version) \ if version else "ppdet://configs/configs.tar" cfg_url = parse_url(cfg_url) # 3. download and decompress cfg_fullname = _download_dist(cfg_url, osp.dirname(CONFIGS_HOME)) _decompress_dist(cfg_fullname) # 4. check config file existing if os.path.isfile(path): return path else: logger.error("Get config {} failed after download, please contact us on " \ "https://github.com/PaddlePaddle/PaddleDetection/issues".format(path)) sys.exit(1) def get_dataset_path(path, annotation, image_dir): """ If path exists, return path. Otherwise, get dataset path from DATASET_HOME, if not exists, download it. """ if _dataset_exists(path, annotation, image_dir): return path data_name = os.path.split(path.strip().lower())[-1] if data_name not in DOWNLOAD_DATASETS_LIST: raise ValueError( "Dataset {} is not valid for reason above, please check again.". 
format(osp.realpath(path))) else: logger.warning( "Dataset {} is not valid for reason above, try searching {} or " "downloading dataset...".format(osp.realpath(path), DATASET_HOME)) for name, dataset in DATASETS.items(): if data_name == name: logger.debug("Parse dataset_dir {} as dataset " "{}".format(path, name)) data_dir = osp.join(DATASET_HOME, name) if name == "spine_coco": if _dataset_exists(data_dir, annotation, image_dir): return data_dir # For voc, only check dir VOCdevkit/VOC2012, VOCdevkit/VOC2007 if name in ['voc', 'fruit', 'roadsign_voc']: exists = True for sub_dir in dataset[1]: check_dir = osp.join(data_dir, sub_dir) if osp.exists(check_dir): logger.info("Found {}".format(check_dir)) else: exists = False if exists: return data_dir # voc exist is checked above, voc is not exist here check_exist = name != 'voc' and name != 'fruit' and name != 'roadsign_voc' for url, md5sum in dataset[0]: get_path(url, data_dir, md5sum, check_exist) # voc should create list after download if name == 'voc': create_voc_list(data_dir) return data_dir raise ValueError("Dataset automaticly downloading Error.") def create_voc_list(data_dir, devkit_subdir='VOCdevkit'): logger.debug("Create voc file list...") devkit_dir = osp.join(data_dir, devkit_subdir) years = ['2007', '2012'] # NOTE: since using auto download VOC # dataset, VOC default label list should be used, # do not generate label_list.txt here. For default # label, see ../data/source/voc.py create_list(devkit_dir, years, data_dir) logger.debug("Create voc file list finished") def map_path(url, root_dir, path_depth=1): # parse path after download to decompress under root_dir assert path_depth > 0, "path_depth should be a positive integer" dirname = url for _ in range(path_depth): dirname = osp.dirname(dirname) fpath = osp.relpath(url, dirname) zip_formats = ['.zip', '.tar', '.gz'] for zip_format in zip_formats: fpath = fpath.replace(zip_format, '') return osp.join(root_dir, fpath) def get_path(url, root_dir, md5sum=None, check_exist=True): """ Download from given url to root_dir. if file or directory specified by url is exists under root_dir, return the path directly, otherwise download from url and decompress it, return the path. 
url (str): download url root_dir (str): root dir for downloading, it should be WEIGHTS_HOME or DATASET_HOME md5sum (str): md5 sum of download package """ # parse path after download to decompress under root_dir fullpath = map_path(url, root_dir) # For same zip file, decompressed directory name different # from zip file name, rename by following map decompress_name_map = { "VOCtrainval_11-May-2012": "VOCdevkit/VOC2012", "VOCtrainval_06-Nov-2007": "VOCdevkit/VOC2007", "VOCtest_06-Nov-2007": "VOCdevkit/VOC2007", "annotations_trainval": "annotations" } for k, v in decompress_name_map.items(): if fullpath.find(k) >= 0: fullpath = osp.join(osp.split(fullpath)[0], v) if osp.exists(fullpath) and check_exist: if not osp.isfile(fullpath) or \ _check_exist_file_md5(fullpath, md5sum, url): logger.debug("Found {}".format(fullpath)) return fullpath, True else: os.remove(fullpath) fullname = _download_dist(url, root_dir, md5sum) # new weights format which postfix is 'pdparams' not # need to decompress if osp.splitext(fullname)[-1] not in ['.pdparams', '.yml', '.ttf']: _decompress_dist(fullname) return fullpath, False def download_dataset(path, dataset=None): if dataset not in DATASETS.keys(): logger.error("Unknown dataset {}, it should be " "{}".format(dataset, DATASETS.keys())) return dataset_info = DATASETS[dataset][0] for info in dataset_info: get_path(info[0], path, info[1], False) logger.debug("Download dataset {} finished.".format(dataset)) def _dataset_exists(path, annotation, image_dir): """ Check if user define dataset exists """ if not osp.exists(path): logger.warning("Config dataset_dir {} is not exits, " "dataset config is not valid".format(path)) return False if annotation: annotation_path = osp.join(path, annotation) if not osp.isfile(annotation_path): logger.warning("Config annotation {} is not a " "file, dataset config is not " "valid".format(annotation_path)) return False if image_dir: image_path = osp.join(path, image_dir) if not osp.isdir(image_path): logger.warning("Config image_dir {} is not a " "directory, dataset config is not " "valid".format(image_path)) return False return True def _download(url, path, md5sum=None): """ Download from url, save to path. url (str): download url path (str): download to given path """ must_mkdirs(path) fname = osp.split(url)[-1] fullname = osp.join(path, fname) retry_cnt = 0 while not (osp.exists(fullname) and _check_exist_file_md5(fullname, md5sum, url)): if retry_cnt < DOWNLOAD_RETRY_LIMIT: retry_cnt += 1 else: raise RuntimeError("Download from {} failed. 
" "Retry limit reached".format(url)) logger.info("Downloading {} from {}".format(fname, url)) # NOTE: windows path join may incur \, which is invalid in url if sys.platform == "win32": url = url.replace('\\', '/') req = requests.get(url, stream=True) if req.status_code != 200: raise RuntimeError("Downloading from {} failed with code " "{}!".format(url, req.status_code)) # For protecting download interupted, download to # tmp_fullname firstly, move tmp_fullname to fullname # after download finished tmp_fullname = fullname + "_tmp" total_size = req.headers.get('content-length') with open(tmp_fullname, 'wb') as f: if total_size: for chunk in tqdm.tqdm( req.iter_content(chunk_size=1024), total=(int(total_size) + 1023) // 1024, unit='KB'): f.write(chunk) else: for chunk in req.iter_content(chunk_size=1024): if chunk: f.write(chunk) shutil.move(tmp_fullname, fullname) return fullname def _download_dist(url, path, md5sum=None): env = os.environ if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env: # Mainly used to solve the problem of downloading data from # different machines in the case of multiple machines. # Different nodes will download data, and the same node # will only download data once. # Reference https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/utils/download.py#L108 rank_id_curr_node = int(os.environ.get("PADDLE_RANK_IN_NODE", 0)) num_trainers = int(env['PADDLE_TRAINERS_NUM']) if num_trainers <= 1: return _download(url, path, md5sum) else: fname = osp.split(url)[-1] fullname = osp.join(path, fname) lock_path = fullname + '.download.lock' must_mkdirs(path) if not osp.exists(fullname): with open(lock_path, 'w'): # touch os.utime(lock_path, None) if rank_id_curr_node == 0: _download(url, path, md5sum) os.remove(lock_path) else: while os.path.exists(lock_path): time.sleep(0.5) return fullname else: return _download(url, path, md5sum) def _check_exist_file_md5(filename, md5sum, url): # if md5sum is None, and file to check is weights file, # read md5um from url and check, else check md5sum directly return _md5check_from_url(filename, url) if md5sum is None \ and filename.endswith('pdparams') \ else _md5check(filename, md5sum) def _md5check_from_url(filename, url): # For weights in bcebos URLs, MD5 value is contained # in request header as 'content_md5' req = requests.get(url, stream=True) content_md5 = req.headers.get('content-md5') req.close() if not content_md5 or _md5check( filename, binascii.hexlify(base64.b64decode(content_md5.strip('"'))).decode( )): return True else: return False def _md5check(fullname, md5sum=None): if md5sum is None: return True logger.debug("File {} md5 checking...".format(fullname)) md5 = hashlib.md5() with open(fullname, 'rb') as f: for chunk in iter(lambda: f.read(4096), b""): md5.update(chunk) calc_md5sum = md5.hexdigest() if calc_md5sum != md5sum: logger.warning("File {} md5 check failed, {}(calc) != " "{}(base)".format(fullname, calc_md5sum, md5sum)) return False return True def _decompress(fname): """ Decompress for zip and tar file """ logger.info("Decompressing {}...".format(fname)) # For protecting decompressing interupted, # decompress to fpath_tmp directory firstly, if decompress # successed, move decompress files to fpath and delete # fpath_tmp and remove download compress file. 
fpath = osp.split(fname)[0] fpath_tmp = osp.join(fpath, 'tmp') if osp.isdir(fpath_tmp): shutil.rmtree(fpath_tmp) os.makedirs(fpath_tmp) if fname.find('tar') >= 0: with tarfile.open(fname) as tf: tf.extractall(path=fpath_tmp) elif fname.find('zip') >= 0: with zipfile.ZipFile(fname) as zf: zf.extractall(path=fpath_tmp) elif fname.find('.txt') >= 0: return else: raise TypeError("Unsupported compress file type {}".format(fname)) for f in os.listdir(fpath_tmp): src_dir = osp.join(fpath_tmp, f) dst_dir = osp.join(fpath, f) _move_and_merge_tree(src_dir, dst_dir) shutil.rmtree(fpath_tmp) os.remove(fname) def _decompress_dist(fname): env = os.environ if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env: trainer_id = int(env['PADDLE_TRAINER_ID']) num_trainers = int(env['PADDLE_TRAINERS_NUM']) if num_trainers <= 1: _decompress(fname) else: lock_path = fname + '.decompress.lock' from paddle.distributed import ParallelEnv unique_endpoints = _get_unique_endpoints(ParallelEnv() .trainer_endpoints[:]) # NOTE(dkp): _decompress_dist is always performed after # _download_dist, where sub-trainers wait for the download lock # file to be released by sleeping. If decompression is very fast # and finishes within the sleeping gap (e.g. for tiny datasets # such as coco_ce or spine_coco), the main trainer may finish # decompressing and release the lock file before sub-trainers # ever see it. So we only create the lock file in the main # trainer, and all sub-trainers first wait 1s for the main # trainer to create it; since 1s is twice the sleeping gap, this # waiting time keeps the whole trainer pipeline in order. # **change this if you have a more elegant method** if ParallelEnv().current_endpoint in unique_endpoints: with open(lock_path, 'w'): # touch os.utime(lock_path, None) _decompress(fname) os.remove(lock_path) else: time.sleep(1) while os.path.exists(lock_path): time.sleep(0.5) else: _decompress(fname) def _move_and_merge_tree(src, dst): """ Move the src directory to dst; if dst already exists, merge src into dst. """ if not osp.exists(dst): shutil.move(src, dst) elif osp.isfile(src): shutil.move(src, dst) else: for fp in os.listdir(src): src_fp = osp.join(src, fp) dst_fp = osp.join(dst, fp) if osp.isdir(src_fp): if osp.isdir(dst_fp): _move_and_merge_tree(src_fp, dst_fp) else: shutil.move(src_fp, dst_fp) elif osp.isfile(src_fp) and \ not osp.isfile(dst_fp): shutil.move(src_fp, dst_fp) ================================================ FILE: ppdet/utils/fuse_utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
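# Usage sketch: fuse every Conv2D + BatchNorm2D pair of a toy eval-mode model
# (the two-layer Sequential here is only an assumption for illustration) and
# check that the fused network is numerically equivalent to the original.
import paddle
import paddle.nn as nn
from ppdet.utils.fuse_utils import fuse_conv_bn

demo = nn.Sequential(nn.Conv2D(3, 8, 3, padding=1), nn.BatchNorm2D(8))
demo.eval()  # the fusion below only implements the eval-mode fold
x = paddle.randn([1, 3, 32, 32])
fused = fuse_conv_bn(demo)  # returns a fused deep copy; demo is untouched
# folding BN into the conv weights must not change the output
assert paddle.allclose(demo(x), fused(x), atol=1e-5).item()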
import copy import paddle import paddle.nn as nn __all__ = ['fuse_conv_bn'] def fuse_conv_bn(model): is_train = False if model.training: model.eval() is_train = True fuse_list = [] tmp_pair = [None, None] for name, layer in model.named_sublayers(): if isinstance(layer, nn.Conv2D): tmp_pair[0] = name if isinstance(layer, nn.BatchNorm2D): tmp_pair[1] = name if tmp_pair[0] and tmp_pair[1] and len(tmp_pair) == 2: fuse_list.append(tmp_pair) tmp_pair = [None, None] model = fuse_layers(model, fuse_list) if is_train: model.train() return model def find_parent_layer_and_sub_name(model, name): """ Given the model and the name of a layer, find the parent layer and the sub_name of the layer. For example, if name is 'block_1/convbn_1/conv_1', the parent layer is 'block_1/convbn_1' and the sub_name is `conv_1`. Args: model(paddle.nn.Layer): the model to be quantized. name(string): the name of a layer Returns: parent_layer, subname """ assert isinstance(model, nn.Layer), \ "The model must be the instance of paddle.nn.Layer." assert len(name) > 0, "The input (name) should not be empty." last_idx = 0 idx = 0 parent_layer = model while idx < len(name): if name[idx] == '.': sub_name = name[last_idx:idx] if hasattr(parent_layer, sub_name): parent_layer = getattr(parent_layer, sub_name) last_idx = idx + 1 idx += 1 sub_name = name[last_idx:idx] return parent_layer, sub_name class Identity(nn.Layer): '''a layer to replace bn or relu layers''' def __init__(self, *args, **kwargs): super(Identity, self).__init__() def forward(self, input): return input def fuse_layers(model, layers_to_fuse, inplace=False): ''' fuse layers in layers_to_fuse Args: model(nn.Layer): The model to be fused. layers_to_fuse(list): The layers' names to be fused. For example,"fuse_list = [["conv1", "bn1"], ["conv2", "bn2"]]". A TypeError would be raised if "fuse" was set as True but "fuse_list" was None. Default: None. inplace(bool): Whether apply fusing to the input model. Default: False. Return fused_model(paddle.nn.Layer): The fused model. ''' if not inplace: model = copy.deepcopy(model) for layers_list in layers_to_fuse: layer_list = [] for layer_name in layers_list: parent_layer, sub_name = find_parent_layer_and_sub_name(model, layer_name) layer_list.append(getattr(parent_layer, sub_name)) new_layers = _fuse_func(layer_list) for i, item in enumerate(layers_list): parent_layer, sub_name = find_parent_layer_and_sub_name(model, item) setattr(parent_layer, sub_name, new_layers[i]) return model def _fuse_func(layer_list): '''choose the fuser method and fuse layers''' types = tuple(type(m) for m in layer_list) fusion_method = types_to_fusion_method.get(types, None) new_layers = [None] * len(layer_list) fused_layer = fusion_method(*layer_list) for handle_id, pre_hook_fn in layer_list[0]._forward_pre_hooks.items(): fused_layer.register_forward_pre_hook(pre_hook_fn) del layer_list[0]._forward_pre_hooks[handle_id] for handle_id, hook_fn in layer_list[-1]._forward_post_hooks.items(): fused_layer.register_forward_post_hook(hook_fn) del layer_list[-1]._forward_post_hooks[handle_id] new_layers[0] = fused_layer for i in range(1, len(layer_list)): identity = Identity() identity.training = layer_list[0].training new_layers[i] = identity return new_layers def _fuse_conv_bn(conv, bn): '''fuse conv and bn for train or eval''' assert(conv.training == bn.training),\ "Conv and BN both must be in the same mode (train or eval)." 
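# Eval-mode fusion applies the closed form implemented in
# _fuse_conv_bn_weights below: with s = gamma / sqrt(running_var + eps),
#     W_fused = W * s   (s broadcast over the output-channel axis)
#     b_fused = (b - running_mean) * s + beta
# This is only valid while the BN statistics are frozen, which is why the
# train-mode branch raises NotImplementedError.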
if conv.training: assert bn._num_features == conv._out_channels, 'Output channel of Conv2d must match num_features of BatchNorm2d' raise NotImplementedError else: return _fuse_conv_bn_eval(conv, bn) def _fuse_conv_bn_eval(conv, bn): '''fuse conv and bn for eval''' assert (not (conv.training or bn.training)), "Fusion only for eval!" fused_conv = copy.deepcopy(conv) fused_weight, fused_bias = _fuse_conv_bn_weights( fused_conv.weight, fused_conv.bias, bn._mean, bn._variance, bn._epsilon, bn.weight, bn.bias) fused_conv.weight.set_value(fused_weight) if fused_conv.bias is None: fused_conv.bias = paddle.create_parameter( shape=[fused_conv._out_channels], is_bias=True, dtype=bn.bias.dtype) fused_conv.bias.set_value(fused_bias) return fused_conv def _fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b): '''fuse weights and bias of conv and bn''' if conv_b is None: conv_b = paddle.zeros_like(bn_rm) if bn_w is None: bn_w = paddle.ones_like(bn_rm) if bn_b is None: bn_b = paddle.zeros_like(bn_rm) bn_var_rsqrt = paddle.rsqrt(bn_rv + bn_eps) conv_w = conv_w * \ (bn_w * bn_var_rsqrt).reshape([-1] + [1] * (len(conv_w.shape) - 1)) conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b return conv_w, conv_b types_to_fusion_method = {(nn.Conv2D, nn.BatchNorm2D): _fuse_conv_bn, } ================================================ FILE: ppdet/utils/logger.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging import os import sys import paddle.distributed as dist __all__ = ['setup_logger'] logger_initialized = [] def setup_logger(name="ppdet", output=None, log_ranks="0"): """ Initialize logger and set its verbosity level to INFO. Args: output (str): a file name or a directory to save log. If None, will not save log file. If ends with ".txt" or ".log", assumed to be a file name. Otherwise, logs will be saved to `output/log.txt`. name (str): the root module name of this logger log_ranks (str): The ids of gpu to log which are separated by "," when more than 1, "0" by default. 
Returns: logging.Logger: a logger """ logger = logging.getLogger(name) if name in logger_initialized: return logger logger.setLevel(logging.INFO) logger.propagate = False formatter = logging.Formatter( "[%(asctime)s] %(name)s %(levelname)s: %(message)s", datefmt="%m/%d %H:%M:%S") if isinstance(log_ranks, str): log_ranks = [int(i) for i in log_ranks.split(',')] elif isinstance(log_ranks, int): log_ranks = [log_ranks] # stdout logging: master only local_rank = dist.get_rank() if local_rank in log_ranks: ch = logging.StreamHandler(stream=sys.stdout) ch.setLevel(logging.DEBUG) ch.setFormatter(formatter) logger.addHandler(ch) # file logging: all workers if output is not None: if output.endswith(".txt") or output.endswith(".log"): filename = output else: filename = os.path.join(output, "log.txt") if local_rank > 0: filename = filename + ".rank{}".format(local_rank) os.makedirs(os.path.dirname(filename), exist_ok=True) fh = logging.FileHandler(filename, mode='a') fh.setLevel(logging.DEBUG) fh.setFormatter(logging.Formatter()) logger.addHandler(fh) logger_initialized.append(name) return logger ================================================ FILE: ppdet/utils/profiler.py ================================================ # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import sys import paddle import paddle.profiler as profiler # A global variable to record the number of calling times for profiler # functions. It is used to specify the tracing range of training steps. _profiler_step_id = 0 # A global variable to avoid parsing from string every time. _profiler_options = None _prof = None class ProfilerOptions(object): ''' Use a string to initialize a ProfilerOptions. The string should be in the format: "key1=value1;key2=value2;key3=value3". For example: "profile_path=model.profile" "batch_range=[50, 60]; profile_path=model.profile" "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" ProfilerOptions supports the following key-value pairs: batch_range - an integer list, e.g. [100, 110]. state - a string, the optional values are 'CPU', 'GPU' or 'All'. sorted_key - a string, the optional values are 'calls', 'total', 'max', 'min' or 'ave'. tracer_option - a string, the optional values are 'Default', 'OpDetail', 'AllOpDetail'. profile_path - a string, the path to save the serialized profile data, which can be used to generate a timeline. exit_on_finished - a boolean. 
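timer_only - a boolean, when true only the model's throughput and time
    overhead are collected, without detailed per-op event tracing.
A combined example (keys as listed above):
    "batch_range=[10, 20]; timer_only=False; exit_on_finished=true"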
''' def __init__(self, options_str): assert isinstance(options_str, str) self._options = { 'batch_range': [10, 20], 'state': 'All', 'sorted_key': 'total', 'tracer_option': 'Default', 'profile_path': '/tmp/profile', 'exit_on_finished': True, 'timer_only': True } self._parse_from_string(options_str) def _parse_from_string(self, options_str): for kv in options_str.replace(' ', '').split(';'): key, value = kv.split('=') if key == 'batch_range': value_list = value.replace('[', '').replace(']', '').split(',') value_list = list(map(int, value_list)) if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ 1] > value_list[0]: self._options[key] = value_list elif key == 'exit_on_finished': self._options[key] = value.lower() in ("yes", "true", "t", "1") elif key in [ 'state', 'sorted_key', 'tracer_option', 'profile_path' ]: self._options[key] = value elif key == 'timer_only': self._options[key] = value def __getitem__(self, name): if self._options.get(name, None) is None: raise ValueError( "ProfilerOptions does not have an option named %s." % name) return self._options[name] def add_profiler_step(options_str=None): ''' Enable the operator-level timing using PaddlePaddle's profiler. The profiler uses a independent variable to count the profiler steps. One call of this function is treated as a profiler step. Args: profiler_options - a string to initialize the ProfilerOptions. Default is None, and the profiler is disabled. ''' if options_str is None: return global _prof global _profiler_step_id global _profiler_options if _profiler_options is None: _profiler_options = ProfilerOptions(options_str) # profile : https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/profiling_model.html#chakanxingnengshujudetongjibiaodan # timer_only = True only the model's throughput and time overhead are displayed # timer_only = False calling summary can print a statistical form that presents performance data from different perspectives. # timer_only = False the output Timeline information can be found in the profiler_log directory if _prof is None: _timer_only = str(_profiler_options['timer_only']) == str(True) _prof = profiler.Profiler( scheduler = (_profiler_options['batch_range'][0], _profiler_options['batch_range'][1]), on_trace_ready = profiler.export_chrome_tracing('./profiler_log'), timer_only = _timer_only) _prof.start() else: _prof.step() if _profiler_step_id == _profiler_options['batch_range'][1]: _prof.stop() _prof.summary( op_detail=True, thread_sep=False, time_unit='ms') _prof = None if _profiler_options['exit_on_finished']: sys.exit(0) _profiler_step_id += 1 ================================================ FILE: ppdet/utils/stats.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
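# Usage sketch: TrainingStats keeps one SmoothedValue per metric and reports
# the windowed median of each; the loss values fed below are made up purely
# for illustration.
from ppdet.utils.stats import TrainingStats

stats = TrainingStats(window_size=20)
for step in range(3):
    stats.update({'loss': 2.5 - 0.1 * step, 'loss_bbox': 0.8})
print(stats.log(extras={'epoch': 0}))
# -> epoch: 0 loss: 2.400000 loss_bbox: 0.800000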
import collections import numpy as np __all__ = ['SmoothedValue', 'TrainingStats'] class SmoothedValue(object): """Track a series of values and provide access to smoothed values over a window or the global series average. """ def __init__(self, window_size=20, fmt=None): if fmt is None: fmt = "{median:.4f} ({avg:.4f})" self.deque = collections.deque(maxlen=window_size) self.fmt = fmt self.total = 0. self.count = 0 def update(self, value, n=1): self.deque.append(value) self.count += n self.total += value * n @property def median(self): return np.median(self.deque) @property def avg(self): return np.mean(self.deque) @property def max(self): return np.max(self.deque) @property def value(self): return self.deque[-1] @property def global_avg(self): return self.total / self.count def __str__(self): return self.fmt.format( median=self.median, avg=self.avg, max=self.max, value=self.value) class TrainingStats(object): def __init__(self, window_size, delimiter=' '): self.meters = None self.window_size = window_size self.delimiter = delimiter def update(self, stats): if self.meters is None: self.meters = { k: SmoothedValue(self.window_size) for k in stats.keys() } for k, v in self.meters.items(): v.update(float(stats[k])) def get(self, extras=None): stats = collections.OrderedDict() if extras: for k, v in extras.items(): stats[k] = v for k, v in self.meters.items(): stats[k] = format(v.median, '.6f') return stats def log(self, extras=None): d = self.get(extras) strs = [] for k, v in d.items(): strs.append("{}: {}".format(k, str(v))) return self.delimiter.join(strs) ================================================ FILE: ppdet/utils/visualizer.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
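# Usage sketch: draw COCO-style detection results onto a PIL image. The bbox
# entry follows the json_results convention used across ppdet (xywh box plus
# image_id/category_id/score); the grey canvas and category name here are
# stand-ins for real data. Note draw_bbox fetches a label font from bcebos,
# so the first run needs network access.
from PIL import Image
from ppdet.utils.visualizer import visualize_results

img = Image.new('RGB', (320, 240), (128, 128, 128))
bbox_res = [{'image_id': 0, 'category_id': 1,
             'bbox': [40.0, 30.0, 120.0, 90.0], 'score': 0.92}]
vis = visualize_results(img, bbox_res, None, None, None, None,
                        im_id=0, catid2name={1: 'person'}, threshold=0.5)
vis.save('vis_demo.jpg')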
from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import os import numpy as np from PIL import Image, ImageDraw, ImageFont import cv2 import math from .colormap import colormap from ppdet.utils.logger import setup_logger from ppdet.utils.compact import imagedraw_textsize_c from ppdet.utils.download import get_path logger = setup_logger(__name__) __all__ = ['visualize_results'] def visualize_results(image, bbox_res, mask_res, segm_res, keypoint_res, pose3d_res, im_id, catid2name, threshold=0.5): """ Visualize bbox and mask results """ if bbox_res is not None: image = draw_bbox(image, im_id, catid2name, bbox_res, threshold) if mask_res is not None: image = draw_mask(image, im_id, mask_res, threshold) if segm_res is not None: image = draw_segm(image, im_id, catid2name, segm_res, threshold) if keypoint_res is not None: image = draw_pose(image, keypoint_res, threshold) if pose3d_res is not None: pose3d = np.array(pose3d_res[0]['pose3d']) * 1000 image = draw_pose3d(image, pose3d, visual_thread=threshold) return image def draw_mask(image, im_id, segms, threshold, alpha=0.7): """ Draw mask on image """ mask_color_id = 0 w_ratio = .4 color_list = colormap(rgb=True) img_array = np.array(image).astype('float32') for dt in np.array(segms): if im_id != dt['image_id']: continue segm, score = dt['segmentation'], dt['score'] if score < threshold: continue import pycocotools.mask as mask_util mask = mask_util.decode(segm) * 255 color_mask = color_list[mask_color_id % len(color_list), 0:3] mask_color_id += 1 for c in range(3): color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255 idx = np.nonzero(mask) img_array[idx[0], idx[1], :] *= 1.0 - alpha img_array[idx[0], idx[1], :] += alpha * color_mask return Image.fromarray(img_array.astype('uint8')) def draw_bbox(image, im_id, catid2name, bboxes, threshold): """ Draw bbox on image """ font_url = "https://paddledet.bj.bcebos.com/simfang.ttf" font_path, _ = get_path(font_url, "~/.cache/paddle/") font_size = 18 font = ImageFont.truetype(font_path, font_size, encoding="utf-8") draw = ImageDraw.Draw(image) catid2color = {} color_list = colormap(rgb=True)[:40] for dt in np.array(bboxes): if im_id != dt['image_id']: continue catid, bbox, score = dt['category_id'], dt['bbox'], dt['score'] if score < threshold: continue if catid not in catid2color: idx = np.random.randint(len(color_list)) catid2color[catid] = color_list[idx] color = tuple(catid2color[catid]) # draw bbox if len(bbox) == 4: # draw bbox xmin, ymin, w, h = bbox xmax = xmin + w ymax = ymin + h draw.line( [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), (xmin, ymin)], width=2, fill=color) elif len(bbox) == 8: x1, y1, x2, y2, x3, y3, x4, y4 = bbox draw.line( [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)], width=2, fill=color) xmin = min(x1, x2, x3, x4) ymin = min(y1, y2, y3, y4) else: logger.error('the shape of bbox must be [M, 4] or [M, 8]!') # draw label text = "{} {:.2f}".format(catid2name[catid], score) tw, th = imagedraw_textsize_c(draw, text, font=font) draw.rectangle( [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color) draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255), font=font) return image def save_result(save_path, results, catid2name, threshold): """ save result as txt """ img_id = int(results["im_id"]) with open(save_path, 'w') as f: if "bbox_res" in results: for dt in results["bbox_res"]: catid, bbox, score = dt['category_id'], dt['bbox'], dt['score'] if score < 
threshold: continue # each bbox result as a line # for rbox: classname score x1 y1 x2 y2 x3 y3 x4 y4 # for bbox: classname score x1 y1 w h bbox_pred = '{} {} '.format(catid2name[catid], score) + ' '.join( [str(e) for e in bbox]) f.write(bbox_pred + '\n') elif "keypoint_res" in results: for dt in results["keypoint_res"]: kpts = dt['keypoints'] scores = dt['score'] keypoint_pred = [img_id, scores, kpts] print(keypoint_pred, file=f) else: print("No valid results found, skip txt save") def draw_segm(image, im_id, catid2name, segms, threshold, alpha=0.7, draw_box=True): """ Draw segmentation on image """ mask_color_id = 0 w_ratio = .4 color_list = colormap(rgb=True) img_array = np.array(image).astype('float32') for dt in np.array(segms): if im_id != dt['image_id']: continue segm, score, catid = dt['segmentation'], dt['score'], dt['category_id'] if score < threshold: continue import pycocotools.mask as mask_util mask = mask_util.decode(segm) * 255 color_mask = color_list[mask_color_id % len(color_list), 0:3] mask_color_id += 1 for c in range(3): color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255 idx = np.nonzero(mask) img_array[idx[0], idx[1], :] *= 1.0 - alpha img_array[idx[0], idx[1], :] += alpha * color_mask if not draw_box: from scipy import ndimage # lazy import, mirroring pycocotools above center_y, center_x = ndimage.center_of_mass(mask) label_text = "{}".format(catid2name[catid]) vis_pos = (max(int(center_x) - 10, 0), int(center_y)) cv2.putText(img_array, label_text, vis_pos, cv2.FONT_HERSHEY_COMPLEX, 0.3, (255, 255, 255)) else: mask = mask_util.decode(segm) * 255 sum_x = np.sum(mask, axis=0) x = np.where(sum_x > 0.5)[0] sum_y = np.sum(mask, axis=1) y = np.where(sum_y > 0.5)[0] x0, x1, y0, y1 = x[0], x[-1], y[0], y[-1] cv2.rectangle(img_array, (x0, y0), (x1, y1), tuple(color_mask.astype('int32').tolist()), 1) bbox_text = '%s %.2f' % (catid2name[catid], score) t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0] cv2.rectangle(img_array, (x0, y0), (x0 + t_size[0], y0 - t_size[1] - 3), tuple(color_mask.astype('int32').tolist()), -1) cv2.putText( img_array, bbox_text, (x0, y0 - 2), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), 1, lineType=cv2.LINE_AA) return Image.fromarray(img_array.astype('uint8')) def draw_pose(image, results, visual_thread=0.6, save_name='pose.jpg', save_dir='output', returnimg=False, ids=None): try: import matplotlib.pyplot as plt import matplotlib plt.switch_backend('agg') except Exception as e: logger.error('Matplotlib not found, please install matplotlib.' 
'for example: `pip install matplotlib`.') raise e skeletons = np.array([item['keypoints'] for item in results]) kpt_nums = 17 if len(skeletons) > 0: kpt_nums = int(skeletons.shape[1] / 3) skeletons = skeletons.reshape(-1, kpt_nums, 3) if kpt_nums == 17: #plot coco keypoint EDGES = [(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 8), (7, 9), (8, 10), (5, 11), (6, 12), (11, 13), (12, 14), (13, 15), (14, 16), (11, 12)] else: #plot mpii keypoint EDGES = [(0, 1), (1, 2), (3, 4), (4, 5), (2, 6), (3, 6), (6, 7), (7, 8), (8, 9), (10, 11), (11, 12), (13, 14), (14, 15), (8, 12), (8, 13)] NUM_EDGES = len(EDGES) colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \ [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \ [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]] plt.figure() img = np.array(image).astype('float32') color_set = results['colors'] if 'colors' in results else None if 'bbox' in results and ids is None: bboxs = results['bbox'] for j, rect in enumerate(bboxs): xmin, ymin, xmax, ymax = rect color = colors[0] if color_set is None else colors[color_set[j] % len(colors)] cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 1) canvas = img.copy() for i in range(kpt_nums): for j in range(len(skeletons)): if skeletons[j][i, 2] < visual_thread: continue if ids is None: color = colors[i] if color_set is None else colors[color_set[j] % len(colors)] else: color = get_color(ids[j]) cv2.circle( canvas, tuple(skeletons[j][i, 0:2].astype('int32')), 2, color, thickness=-1) to_plot = cv2.addWeighted(img, 0.3, canvas, 0.7, 0) fig = matplotlib.pyplot.gcf() stickwidth = 2 for i in range(NUM_EDGES): for j in range(len(skeletons)): edge = EDGES[i] if skeletons[j][edge[0], 2] < visual_thread or skeletons[j][edge[1], 2] < visual_thread: continue cur_canvas = canvas.copy() X = [skeletons[j][edge[0], 1], skeletons[j][edge[1], 1]] Y = [skeletons[j][edge[0], 0], skeletons[j][edge[1], 0]] mX = np.mean(X) mY = np.mean(Y) length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5 angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1) if ids is None: color = colors[i] if color_set is None else colors[color_set[j] % len(colors)] else: color = get_color(ids[j]) cv2.fillConvexPoly(cur_canvas, polygon, color) canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) image = Image.fromarray(canvas.astype('uint8')) plt.close() return image def draw_pose3d(image, pose3d, pose2d=None, visual_thread=0.6, save_name='pose3d.jpg', returnimg=True): try: import matplotlib.pyplot as plt import matplotlib plt.switch_backend('agg') except Exception as e: logger.error('Matplotlib not found, please install matplotlib.' 
'for example: `pip install matplotlib`.') raise e if pose3d.shape[0] == 24: joints_connectivity_dict = [ [0, 1, 0], [1, 2, 0], [5, 4, 1], [4, 3, 1], [2, 3, 0], [2, 14, 1], [3, 14, 1], [14, 16, 1], [15, 16, 1], [15, 12, 1], [6, 7, 0], [7, 8, 0], [11, 10, 1], [10, 9, 1], [8, 12, 0], [9, 12, 1], [12, 19, 1], [19, 18, 1], [19, 20, 0], [19, 21, 1], [22, 20, 0], [23, 21, 1] ] elif pose3d.shape[0] == 14: joints_connectivity_dict = [ [0, 1, 0], [1, 2, 0], [5, 4, 1], [4, 3, 1], [2, 3, 0], [2, 12, 0], [3, 12, 1], [6, 7, 0], [7, 8, 0], [11, 10, 1], [10, 9, 1], [8, 12, 0], [9, 12, 1], [12, 13, 1] ] else: print( "undefined joints number: {}, cannot visualize because the joint connectivity is unknown". format(pose3d.shape[0])) return def draw3Dpose(pose3d, ax, lcolor="#3498db", rcolor="#e74c3c", add_labels=False): # pose3d = orthographic_projection(pose3d, cam) for i in joints_connectivity_dict: x, y, z = [ np.array([pose3d[i[0], j], pose3d[i[1], j]]) for j in range(3) ] ax.plot(-x, -z, -y, lw=2, c=lcolor if i[2] else rcolor) RADIUS = 1000 center_xy = 2 if pose3d.shape[0] == 14 else 14 x, y, z = pose3d[center_xy, 0], pose3d[center_xy, 1], pose3d[center_xy, 2] ax.set_xlim3d([-RADIUS + x, RADIUS + x]) ax.set_ylim3d([-RADIUS + y, RADIUS + y]) ax.set_zlim3d([-RADIUS + z, RADIUS + z]) ax.set_xlabel("x") ax.set_ylabel("y") ax.set_zlabel("z") def draw2Dpose(pose2d, ax, lcolor="#3498db", rcolor="#e74c3c", add_labels=False): for i in joints_connectivity_dict: if pose2d[i[0], 2] and pose2d[i[1], 2]: x, y = [ np.array([pose2d[i[0], j], pose2d[i[1], j]]) for j in range(2) ] ax.plot(x, y, 0, lw=2, c=lcolor if i[2] else rcolor) def draw_img_pose(pose3d, pose2d=None, frame=None, figsize=(12, 12), savepath=None): fig = plt.figure(figsize=figsize, dpi=80) # fig.clear() fig.tight_layout() ax = fig.add_subplot(221) if frame is not None: ax.imshow(frame, interpolation='nearest') if pose2d is not None: draw2Dpose(pose2d, ax) ax = fig.add_subplot(222, projection='3d') ax.view_init(45, 45) draw3Dpose(pose3d, ax) ax = fig.add_subplot(223, projection='3d') ax.view_init(0, 0) draw3Dpose(pose3d, ax) ax = fig.add_subplot(224, projection='3d') ax.view_init(0, 90) draw3Dpose(pose3d, ax) if savepath is not None: plt.savefig(savepath) plt.close() else: return fig def fig2data(fig): """ fig = plt.figure() image = fig2data(fig) @brief Convert a Matplotlib figure to a 3D numpy array with RGBA channels and return it @param fig a matplotlib figure @return a numpy 3D array of RGBA values """ # draw the renderer fig.canvas.draw() # Get the RGBA buffer from the figure w, h = fig.canvas.get_width_height() buf = np.frombuffer(fig.canvas.tostring_argb(), dtype=np.uint8).copy() buf.shape = (w, h, 4) # canvas.tostring_argb give pixmap in ARGB mode. Roll the ALPHA channel to have it in RGBA mode buf = np.roll(buf, 3, axis=2) image = Image.frombytes("RGBA", (w, h), buf.tobytes()) return image.convert("RGB") fig = draw_img_pose(pose3d, pose2d, frame=image) data = fig2data(fig) if returnimg is False: data.save(save_name) else: return data ================================================ FILE: ppdet/utils/voc_utils.py ================================================ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import os.path as osp import re import random __all__ = ['create_list'] def create_list(devkit_dir, years, output_dir): """ create following list: 1. trainval.txt 2. test.txt """ trainval_list = [] test_list = [] for year in years: trainval, test = _walk_voc_dir(devkit_dir, year, output_dir) trainval_list.extend(trainval) test_list.extend(test) random.shuffle(trainval_list) with open(osp.join(output_dir, 'trainval.txt'), 'w') as ftrainval: for item in trainval_list: ftrainval.write(item[0] + ' ' + item[1] + '\n') with open(osp.join(output_dir, 'test.txt'), 'w') as ftest: for item in test_list: ftest.write(item[0] + ' ' + item[1] + '\n') def _get_voc_dir(devkit_dir, year, type): return osp.join(devkit_dir, 'VOC' + year, type) def _walk_voc_dir(devkit_dir, year, output_dir): filelist_dir = _get_voc_dir(devkit_dir, year, 'ImageSets/Main') annotation_dir = _get_voc_dir(devkit_dir, year, 'Annotations') img_dir = _get_voc_dir(devkit_dir, year, 'JPEGImages') trainval_list = [] test_list = [] added = set() for _, _, files in os.walk(filelist_dir): for fname in files: img_ann_list = [] if re.match(r'[a-z]+_trainval\.txt', fname): img_ann_list = trainval_list elif re.match(r'[a-z]+_test\.txt', fname): img_ann_list = test_list else: continue fpath = osp.join(filelist_dir, fname) for line in open(fpath): name_prefix = line.strip().split()[0] if name_prefix in added: continue added.add(name_prefix) ann_path = osp.join( osp.relpath(annotation_dir, output_dir), name_prefix + '.xml') img_path = osp.join( osp.relpath(img_dir, output_dir), name_prefix + '.jpg') img_ann_list.append((img_path, ann_path)) return trainval_list, test_list ================================================ FILE: requirements.txt ================================================
numpy < 2.0
tqdm
typeguard
visualdl>=2.2.0
opencv-python <= 4.6.0
PyYAML
shapely
scipy
terminaltables
Cython
pycocotools
setuptools
Pillow

# for MOT evaluation and inference
lapx
motmetrics
sklearn==0.0

# for vehicleplate in deploy/pipeline/ppvehicle
pyclipper

# for culane data augmentation
imgaug>=0.4.0
================================================ FILE: scripts/build_wheel.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
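For reference, a minimal sketch of driving the create_list utility shown above; the VOCdevkit path, years, and output directory are hypothetical and must match your local layout:

from ppdet.utils.voc_utils import create_list

# Writes trainval.txt / test.txt with "image-path annotation-path" lines,
# shuffling the trainval split as create_list does internally.
create_list('dataset/voc/VOCdevkit', ['2007', '2012'], 'dataset/voc')
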
#================================================= # Utils #================================================= # directory config DIST_DIR="dist" BUILD_DIR="build" EGG_DIR="paddledet.egg-info" CFG_DIR="configs" TEST_DIR=".tests" DATA_DIR="dataset" # command line log config RED='\033[0;31m' BLUE='\033[0;34m' GREEN='\033[1;32m' BOLD='\033[1m' NONE='\033[0m' function python_version_check() { PY_MAIN_VERSION=`python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1}'` PY_SUB_VERSION=`python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $2}'` echo -e "find python version ${PY_MAIN_VERSION}.${PY_SUB_VERSION}" if [ $PY_MAIN_VERSION -ne "3" -o $PY_SUB_VERSION -lt "5" ]; then echo -e "${RED}FAIL:${NONE} please use Python >= 3.5 !" exit 1 fi } function init() { echo -e "${BLUE}[init]${NONE} removing building directory..." rm -rf $DIST_DIR $BUILD_DIR $EGG_DIR $TEST_DIR if [ `pip list | grep paddledet | wc -l` -gt 0 ]; then echo -e "${BLUE}[init]${NONE} uninstalling paddledet..." pip uninstall -y paddledet fi echo -e "${BLUE}[init]${NONE} ${GREEN}init success\n" } function build_and_install() { echo -e "${BLUE}[build]${NONE} building paddledet wheel..." python setup.py sdist bdist_wheel if [ $? -ne 0 ]; then echo -e "${RED}[FAIL]${NONE} build paddledet wheel failed !" exit 1 fi echo -e "${BLUE}[build]${NONE} ${GREEN}build paddledet wheel success\n" echo -e "${BLUE}[install]${NONE} installing paddledet..." cd $DIST_DIR find . -name "paddledet*.whl" | xargs pip install if [ $? -ne 0 ]; then cd .. echo -e "${RED}[FAIL]${NONE} install paddledet wheel failed !" exit 1 fi echo -e "${BLUE}[install]${NONE} ${GREEN}paddledet install success\n" cd .. } function unittest() { if [ -d $TEST_DIR ]; then rm -rf $TEST_DIR fi; echo -e "${BLUE}[unittest]${NONE} run unittests..." # NOTE: perform unittests under TEST_DIR to # make sure installed paddledet is used mkdir $TEST_DIR cp -r $CFG_DIR $TEST_DIR cp -r $DATA_DIR $TEST_DIR cd $TEST_DIR if [ $? != 0 ]; then exit 1 fi find "../ppdet" -wholename '*tests/test_*' -type f -print0 | \ xargs -0 -I{} -n1 -t bash -c 'python -u -s {}' # clean TEST_DIR cd .. rm -rf $TEST_DIR echo -e "${BLUE}[unittest]${NONE} ${GREEN}unittests success\n${NONE}" } function cleanup() { if [ -d $TEST_DIR ]; then rm -rf $TEST_DIR fi rm -rf $BUILD_DIR $EGG_DIR pip uninstall -y paddledet } function abort() { echo -e "${RED}[FAIL]${NONE} build wheel and unittest failed ! please check your code" 1>&2 cur_dir=`basename "$PWD"` if [ "$cur_dir" == "$TEST_DIR" -o "$cur_dir" == "$DIST_DIR" ]; then cd .. 
fi rm -rf $BUILD_DIR $EGG_DIR $DIST_DIR $TEST_DIR pip uninstall -y paddledet } python_version_check trap 'abort' 0 set -e init build_and_install unittest cleanup # get Paddle version PADDLE_VERSION=`python -c "import paddle; print(paddle.version.full_version)"` PADDLE_COMMIT=`python -c "import paddle; print(paddle.version.commit)"` PADDLE_COMMIT=`git rev-parse --short $PADDLE_COMMIT` # get PaddleDetection branch PPDET_BRANCH=`git rev-parse --abbrev-ref HEAD` PPDET_COMMIT=`git rev-parse --short HEAD` # get Python version PYTHON_VERSION=`python -c "import platform; print(platform.python_version())"` echo -e "\n${GREEN}paddledet wheel compiled and checked success !${NONE} ${BLUE}Python version:${NONE} $PYTHON_VERSION ${BLUE}Paddle version:${NONE} $PADDLE_VERSION ($PADDLE_COMMIT) ${BLUE}PaddleDetection branch:${NONE} $PPDET_BRANCH ($PPDET_COMMIT)\n" echo -e "${GREEN}wheel saved under${NONE} ${RED}${BOLD}./dist" trap : 0 ================================================ FILE: scripts/eval.sh ================================================ ../../../../py37_meta_pd-2.4_cu11_comer/bin/python3.7 tools/eval.py \ --config configs/artdetrv3_final/cortdetr_noisegroupx3_o2m_r18vd_120e_coco.yml \ -o weights=outputs/ ================================================ FILE: scripts/kill.sh ================================================ ps -ef | grep train.py | awk '{print $2}' | xargs kill -9 kill -9 $(lsof -t /dev/nvidia*) ================================================ FILE: scripts/train.sh ================================================ PY37=/root/paddlejob/workspace/env_run/ws/py37_meta_pd-2.4_cu11_comer/bin/python3.7 # PY37=../anaconda3/envs/py37_meta_pd-2.3.0_cu11/bin/python3.7 export CUDA_VISIBLE_DEVICES=0,1,2,3 nohup $PY37 -m paddle.distributed.launch --gpus=0,1,2,3 \ tools/train.py \ -c configs/artdetrv3/rtdetrv3_final_r18vd_6x_coco.yml --eval\ -r output/rtdetrv3_final_r18vd_6x_coco/1 \ -o save_dir=output/rtdetrv3_final_r18vd_6x_coco \ &> output/train_rtdetrv3_final_r18vd_6x_coco.log& ================================================ FILE: tools/anchor_cluster.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
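The clusterer below assigns each ground-truth (w, h) pair to the anchor with the highest corner-aligned IoU. A self-contained numpy sketch of that similarity, with toy values that are not part of the tool:

import numpy as np

def wh_iou(whs, centers):
    # Corner-aligned IoU: boxes are assumed to share a corner, so the
    # intersection is the product of the elementwise minimum of (w, h).
    inter = np.minimum(whs[:, None], centers[None]).prod(2)
    return inter / (whs[:, None].prod(2) + centers[None].prod(2) - inter)

whs = np.array([[10., 20.], [30., 30.]])
centers = np.array([[12., 18.], [28., 32.]])
print(wh_iou(whs, centers).argmax(1))  # index of the best-matching anchor per box
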
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) from ppdet.utils.logger import setup_logger logger = setup_logger('ppdet.anchor_cluster') from scipy.cluster.vq import kmeans import numpy as np from tqdm import tqdm from ppdet.utils.cli import ArgsParser from ppdet.utils.check import check_gpu, check_version, check_config from ppdet.core.workspace import load_config, merge_config class BaseAnchorCluster(object): def __init__(self, n, cache_path, cache, verbose=True): """ Base Anchor Cluster Args: n (int): number of clusters cache_path (str): cache directory path cache (bool): whether using cache verbose (bool): whether print results """ super(BaseAnchorCluster, self).__init__() self.n = n self.cache_path = cache_path self.cache = cache self.verbose = verbose def print_result(self, centers): raise NotImplementedError('%s.print_result is not available' % self.__class__.__name__) def get_whs(self): whs_cache_path = os.path.join(self.cache_path, 'whs.npy') shapes_cache_path = os.path.join(self.cache_path, 'shapes.npy') if self.cache and os.path.exists(whs_cache_path) and os.path.exists( shapes_cache_path): self.whs = np.load(whs_cache_path) self.shapes = np.load(shapes_cache_path) return self.whs, self.shapes whs = np.zeros((0, 2)) shapes = np.zeros((0, 2)) self.dataset.parse_dataset() roidbs = self.dataset.roidbs for rec in tqdm(roidbs): h, w = rec['h'], rec['w'] bbox = rec['gt_bbox'] wh = bbox[:, 2:4] - bbox[:, 0:2] + 1 wh = wh / np.array([[w, h]]) shape = np.ones_like(wh) * np.array([[w, h]]) whs = np.vstack((whs, wh)) shapes = np.vstack((shapes, shape)) if self.cache: os.makedirs(self.cache_path, exist_ok=True) np.save(whs_cache_path, whs) np.save(shapes_cache_path, shapes) self.whs = whs self.shapes = shapes return self.whs, self.shapes def calc_anchors(self): raise NotImplementedError('%s.calc_anchors is not available' % self.__class__.__name__) def __call__(self): self.get_whs() centers = self.calc_anchors() if self.verbose: self.print_result(centers) return centers class YOLOv2AnchorCluster(BaseAnchorCluster): def __init__(self, n, dataset, size, cache_path, cache, iters=1000, verbose=True): super(YOLOv2AnchorCluster, self).__init__( n, cache_path, cache, verbose=verbose) """ YOLOv2 Anchor Cluster The code is based on https://github.com/AlexeyAB/darknet/blob/master/scripts/gen_anchors.py Args: n (int): number of clusters dataset (DataSet): DataSet instance, VOC or COCO size (list): [w, h] cache_path (str): cache directory path cache (bool): whether using cache iters (int): kmeans algorithm iters verbose (bool): whether print results """ self.dataset = dataset self.size = size self.iters = iters def print_result(self, centers): logger.info('%d anchor cluster result: [w, h]' % self.n) for w, h in centers: logger.info('[%d, %d]' % (round(w), round(h))) def metric(self, whs, centers): wh1 = whs[:, None] wh2 = centers[None] inter = np.minimum(wh1, wh2).prod(2) return inter / (wh1.prod(2) + wh2.prod(2) - inter) def kmeans_expectation(self, whs, centers, assignments): dist = self.metric(whs, centers) new_assignments = dist.argmax(1) converged = (new_assignments == assignments).all() return converged, new_assignments def kmeans_maximizations(self, whs, centers, assignments): new_centers = np.zeros_like(centers) for i in range(centers.shape[0]): mask = (assignments == 
i) if mask.sum(): new_centers[i, :] = whs[mask].mean(0) return new_centers def calc_anchors(self): self.whs = self.whs * np.array([self.size]) # random select k centers whs, n, iters = self.whs, self.n, self.iters logger.info('Running kmeans for %d anchors on %d points...' % (n, len(whs))) idx = np.random.choice(whs.shape[0], size=n, replace=False) centers = whs[idx] assignments = np.full(whs.shape[0:1], -1) # init to -1 so the first E step cannot spuriously report convergence # kmeans if n == 1: return self.kmeans_maximizations(whs, centers, assignments) pbar = tqdm(range(iters), desc='Cluster anchors with k-means algorithm') for _ in pbar: # E step converged, assignments = self.kmeans_expectation(whs, centers, assignments) if converged: logger.info('kmeans algorithm has converged') break # M step centers = self.kmeans_maximizations(whs, centers, assignments) ious = self.metric(whs, centers) pbar.desc = 'avg_iou: %.4f' % (ious.max(1).mean()) centers = sorted(centers, key=lambda x: x[0] * x[1]) return centers def main(): parser = ArgsParser() parser.add_argument( '--n', '-n', default=9, type=int, help='num of clusters') parser.add_argument( '--iters', '-i', default=1000, type=int, help='num of iterations for kmeans') parser.add_argument( '--verbose', '-v', default=True, type=bool, help='whether print result') parser.add_argument( '--size', '-s', default=None, type=str, help='image size: w,h, using comma as delimiter') parser.add_argument( '--method', '-m', default='v2', type=str, help='cluster method, v2 is only supported now') parser.add_argument( '--cache_path', default='cache', type=str, help='cache path') parser.add_argument( '--cache', action='store_true', help='whether use cache') FLAGS = parser.parse_args() cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) check_config(cfg) # check if set use_gpu=True in paddlepaddle cpu version if 'use_gpu' not in cfg: cfg.use_gpu = False check_gpu(cfg.use_gpu) # check if paddlepaddle version is satisfied check_version('develop') # get dataset dataset = cfg['TrainDataset'] if FLAGS.size: if ',' in FLAGS.size: size = list(map(int, FLAGS.size.split(','))) assert len(size) == 2, "the format of size is incorrect" else: size = int(FLAGS.size) size = [size, size] elif 'inputs_def' in cfg['TestReader'] and 'image_shape' in cfg[ 'TestReader']['inputs_def']: size = cfg['TestReader']['inputs_def']['image_shape'][1:] else: raise ValueError('size is not specified') if FLAGS.method == 'v2': cluster = YOLOv2AnchorCluster(FLAGS.n, dataset, size, FLAGS.cache_path, FLAGS.cache, FLAGS.iters, FLAGS.verbose) else: raise ValueError('cluster method: %s is not supported' % FLAGS.method) anchors = cluster() if __name__ == "__main__": main() ================================================ FILE: tools/box_distribution.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
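A hedged sketch of running the k-means clusterer above without the CLI; the config path is hypothetical, and tools/ must be importable from your working directory:

from ppdet.core.workspace import load_config
from tools.anchor_cluster import YOLOv2AnchorCluster  # assumes tools/ is on sys.path

cfg = load_config('configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml')  # any config with a TrainDataset
cluster = YOLOv2AnchorCluster(
    n=9, dataset=cfg['TrainDataset'], size=[640, 640],
    cache_path='cache', cache=False)
anchors = cluster()  # [w, h] centers, sorted by area
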
import matplotlib.pyplot as plt import json import numpy as np import argparse from pycocotools.coco import COCO from tqdm import tqdm def median(data): data.sort() mid = len(data) // 2 median = (data[mid] + data[~mid]) / 2 return median def draw_distribution(width, height, out_path): w_bins = int((max(width) - min(width)) // 10) h_bins = int((max(height) - min(height)) // 10) plt.figure() plt.subplot(221) plt.hist(width, bins=w_bins, color='green') plt.xlabel('Width rate *1000') plt.ylabel('number') plt.title('Distribution of Width') plt.subplot(222) plt.hist(height, bins=h_bins, color='blue') plt.xlabel('Height rate *1000') plt.title('Distribution of Height') plt.savefig(out_path) print(f'Distribution saved as {out_path}') plt.show() def get_ratio_infos(jsonfile, out_img, eval_size, small_stride): coco = COCO(annotation_file=jsonfile) allannjson = json.load(open(jsonfile, 'r')) be_im_id = allannjson['annotations'][0]['image_id'] be_im_w = [] be_im_h = [] ratio_w = [] ratio_h = [] im_wid, im_hei = [], [] for ann in tqdm(allannjson['annotations']): if ann['iscrowd']: continue x0, y0, w, h = ann['bbox'][:] if be_im_id == ann['image_id']: be_im_w.append(w) be_im_h.append(h) else: im_w = coco.imgs[be_im_id]['width'] im_h = coco.imgs[be_im_id]['height'] im_wid.append(im_w) im_hei.append(im_h) im_m_w = np.mean(be_im_w) im_m_h = np.mean(be_im_h) dis_w = im_m_w / im_w dis_h = im_m_h / im_h ratio_w.append(dis_w) ratio_h.append(dis_h) be_im_id = ann['image_id'] be_im_w = [w] be_im_h = [h] im_w = coco.imgs[be_im_id]['width'] im_h = coco.imgs[be_im_id]['height'] im_wid.append(im_w) im_hei.append(im_h) all_im_m_w = np.mean(im_wid) all_im_m_h = np.mean(im_hei) im_m_w = np.mean(be_im_w) im_m_h = np.mean(be_im_h) dis_w = im_m_w / im_w dis_h = im_m_h / im_h ratio_w.append(dis_w) ratio_h.append(dis_h) mid_w = median(ratio_w) mid_h = median(ratio_h) reg_ratio = [] ratio_all = ratio_h + ratio_w for r in ratio_all: if r < 0.2: reg_ratio.append(r) elif r < 0.4: reg_ratio.append(r / 2) else: reg_ratio.append(r / 4) reg_ratio = sorted(reg_ratio) max_ratio = reg_ratio[int(0.95 * len(reg_ratio))] reg_max = round(max_ratio * eval_size / small_stride) ratio_w = [i * 1000 for i in ratio_w] ratio_h = [i * 1000 for i in ratio_h] print(f'Suggested reg_range[1] is {reg_max+1}') print(f'Mean of all img_w is {all_im_m_w}') print(f'Mean of all img_h is {all_im_m_h}') print(f'Median of ratio_w is {mid_w}') print(f'Median of ratio_h is {mid_h}') print('all_img with box: ', len(ratio_h)) print('all_ann: ', len(allannjson['annotations'])) draw_distribution(ratio_w, ratio_h, out_img) def main(): parser = argparse.ArgumentParser() parser.add_argument( '--json_path', type=str, default=None, help="Dataset json path.") parser.add_argument('--eval_size', type=int, default=640, help="eval size.") parser.add_argument( '--small_stride', type=int, default=8, help="smallest stride.") parser.add_argument( '--out_img', type=str, default='box_distribution.jpg', help="Name of distribution img.") args = parser.parse_args() get_ratio_infos(args.json_path, args.out_img, args.eval_size, args.small_stride) if __name__ == "__main__": main() ================================================ FILE: tools/cam_ppdet.py ================================================ from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore 
warning log import warnings warnings.filterwarnings('ignore') from ppdet.utils.cli import ArgsParser, merge_args from ppdet.core.workspace import load_config, merge_config from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_version, check_config from ppdet.utils.cam_utils import BBoxCAM import paddle def parse_args(): parser = ArgsParser() parser.add_argument( "--infer_img", type=str, default='demo/000000014439.jpg', # hxw: 404x640 help="Image path, has higher priority over --infer_dir") parser.add_argument("--weights", type=str, default='output/faster_rcnn_r50_vd_fpn_2x_coco_paddlejob/best_model.pdparams' ) parser.add_argument("--cam_out", type=str, default='cam_faster_rcnn' ) parser.add_argument("--use_gpu", type=bool, default=True) parser.add_argument( "--infer_dir", type=str, default=None, help="Directory for images to perform inference on.") parser.add_argument( "--output_dir", type=str, default="output", help="Directory for storing the output visualization files.") parser.add_argument( "--draw_threshold", type=float, default=0.8, help="Threshold to reserve the result for visualization.") parser.add_argument( "--save_results", type=bool, default=False, help="Whether to save inference results to output_dir.") parser.add_argument( "--target_feature_layer_name", type=str, default='model.backbone', # define the featuremap to show grad cam, such as model.backbone, model.bbox_head.roi_extractor help="The feature map to compute Grad-CAM on, such as model.backbone or model.bbox_head.roi_extractor.") args = parser.parse_args() return args def run(FLAGS, cfg): assert cfg.architecture in ['FasterRCNN', 'MaskRCNN', 'YOLOv3', 'PPYOLOE', 'PPYOLOEWithAuxHead', 'BlazeFace', 'SSD', 'RetinaNet'], \ 'CAM visualization is only supported for the architectures listed above for now, ' \ 'other architectures are not supported yet!' bbox_cam = BBoxCAM(FLAGS, cfg) bbox_cam.get_bboxes_cams() print('finish') def main(): FLAGS = parse_args() cfg = load_config(FLAGS.config) merge_args(cfg, FLAGS) merge_config(FLAGS.opt) # disable npu in config by default if 'use_npu' not in cfg: cfg.use_npu = False # disable xpu in config by default if 'use_xpu' not in cfg: cfg.use_xpu = False if cfg.use_gpu: place = paddle.set_device('gpu') elif cfg.use_npu: place = paddle.set_device('npu') elif cfg.use_xpu: place = paddle.set_device('xpu') else: place = paddle.set_device('cpu') check_config(cfg) check_gpu(cfg.use_gpu) check_npu(cfg.use_npu) check_xpu(cfg.use_xpu) check_version() run(FLAGS, cfg) if __name__ == '__main__': main() ================================================ FILE: tools/eval.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
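tools/eval.py below, like cam_ppdet.py above and the other entry points in this directory, repeats the same device-selection ladder. Distilled as a small sketch (not repo code; assumes cfg behaves like a dict):

import paddle

def select_device(cfg):
    # Precedence used across these tools: gpu > npu > xpu > mlu > cpu.
    for flag, dev in (('use_gpu', 'gpu'), ('use_npu', 'npu'),
                      ('use_xpu', 'xpu'), ('use_mlu', 'mlu')):
        if cfg.get(flag, False):
            return paddle.set_device(dev)
    return paddle.set_device('cpu')

place = select_device({'use_gpu': False})  # falls through to CPU
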
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore warning log import warnings warnings.filterwarnings('ignore') import paddle from ppdet.core.workspace import create, load_config, merge_config from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config from ppdet.utils.cli import ArgsParser, merge_args from ppdet.engine import Trainer, Trainer_ARSL, init_parallel_env from ppdet.metrics.coco_utils import json_eval_results from ppdet.slim import build_slim_model from ppdet.utils.logger import setup_logger logger = setup_logger('eval') def parse_args(): parser = ArgsParser() parser.add_argument( "--output_eval", default=None, type=str, help="Evaluation directory, default is current directory.") parser.add_argument( '--json_eval', action='store_true', default=False, help='Whether to re eval with already exists bbox.json or mask.json') parser.add_argument( "--slim_config", default=None, type=str, help="Configuration file of slim method.") # TODO: bias should be unified parser.add_argument( "--bias", action="store_true", help="whether add bias or not while getting w and h") parser.add_argument( "--classwise", action="store_true", help="whether per-category AP and draw P-R Curve or not.") parser.add_argument( '--save_prediction_only', action='store_true', default=False, help='Whether to save the evaluation results only') parser.add_argument( "--amp", action='store_true', default=False, help="Enable auto mixed precision eval.") # for smalldet slice_infer parser.add_argument( "--slice_infer", action='store_true', help="Whether to slice the image and merge the inference results for small object detection." ) parser.add_argument( '--slice_size', nargs='+', type=int, default=[640, 640], help="Height of the sliced image.") parser.add_argument( "--overlap_ratio", nargs='+', type=float, default=[0.25, 0.25], help="Overlap height ratio of the sliced image.") parser.add_argument( "--combine_method", type=str, default='nms', help="Combine method of the sliced images' detection results, choose in ['nms', 'nmm', 'concat']." ) parser.add_argument( "--match_threshold", type=float, default=0.6, help="Combine method matching threshold.") parser.add_argument( "--match_metric", type=str, default='ios', help="Combine method matching metric, choose in ['iou', 'ios'].") args = parser.parse_args() return args def run(FLAGS, cfg): if FLAGS.json_eval: logger.info( "In json_eval mode, PaddleDetection will evaluate json files in " "output_eval directly. 
And proposal.json, bbox.json and mask.json " "will be detected by default.") json_eval_results( cfg.metric, json_directory=FLAGS.output_eval, dataset=create('EvalDataset')()) return # init parallel environment if nranks > 1 init_parallel_env() ssod_method = cfg.get('ssod_method', None) if ssod_method == 'ARSL': # build ARSL_trainer trainer = Trainer_ARSL(cfg, mode='eval') # load ARSL_weights trainer.load_weights(cfg.weights, ARSL_eval=True) else: # build trainer trainer = Trainer(cfg, mode='eval') #load weights trainer.load_weights(cfg.weights) # training if FLAGS.slice_infer: trainer.evaluate_slice( slice_size=FLAGS.slice_size, overlap_ratio=FLAGS.overlap_ratio, combine_method=FLAGS.combine_method, match_threshold=FLAGS.match_threshold, match_metric=FLAGS.match_metric) else: trainer.evaluate() def main(): FLAGS = parse_args() cfg = load_config(FLAGS.config) merge_args(cfg, FLAGS) merge_config(FLAGS.opt) # disable npu in config by default if 'use_npu' not in cfg: cfg.use_npu = False # disable xpu in config by default if 'use_xpu' not in cfg: cfg.use_xpu = False if 'use_gpu' not in cfg: cfg.use_gpu = False # disable mlu in config by default if 'use_mlu' not in cfg: cfg.use_mlu = False if cfg.use_gpu: place = paddle.set_device('gpu') elif cfg.use_npu: place = paddle.set_device('npu') elif cfg.use_xpu: place = paddle.set_device('xpu') elif cfg.use_mlu: place = paddle.set_device('mlu') else: place = paddle.set_device('cpu') if FLAGS.slim_config: cfg = build_slim_model(cfg, FLAGS.slim_config, mode='eval') check_config(cfg) check_gpu(cfg.use_gpu) check_npu(cfg.use_npu) check_xpu(cfg.use_xpu) check_mlu(cfg.use_mlu) check_version() run(FLAGS, cfg) if __name__ == '__main__': main() ================================================ FILE: tools/eval_mot.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
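The default path of tools/eval.py above (no json_eval, no ARSL, no slice inference) reduces to a few engine calls. A minimal sketch with a hypothetical config path:

import paddle
from ppdet.core.workspace import load_config
from ppdet.engine import Trainer

cfg = load_config('configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml')
paddle.set_device('gpu' if cfg.get('use_gpu', False) else 'cpu')
trainer = Trainer(cfg, mode='eval')
trainer.load_weights(cfg.weights)  # cfg.weights should point at trained .pdparams
trainer.evaluate()
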
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore warning log import warnings warnings.filterwarnings('ignore') import paddle from ppdet.core.workspace import load_config, merge_config from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config from ppdet.utils.cli import ArgsParser from ppdet.engine import Tracker def parse_args(): parser = ArgsParser() parser.add_argument( "--det_results_dir", type=str, default='', help="Directory name for detection results.") parser.add_argument( '--output_dir', type=str, default='output', help='Directory name for output tracking results.') parser.add_argument( '--save_images', action='store_true', help='Save tracking results (image).') parser.add_argument( '--save_videos', action='store_true', help='Save tracking results (video).') parser.add_argument( '--show_image', action='store_true', help='Show tracking results (image).') parser.add_argument( '--scaled', type=bool, default=False, help="Whether coords after detector outputs are scaled, False in JDE YOLOv3 " "True in general detector.") args = parser.parse_args() return args def run(FLAGS, cfg): dataset_dir = cfg['EvalMOTDataset'].dataset_dir data_root = cfg['EvalMOTDataset'].data_root data_root = '{}/{}'.format(dataset_dir, data_root) seqs = os.listdir(data_root) seqs.sort() # build Tracker tracker = Tracker(cfg, mode='eval') # load weights if cfg.architecture in ['DeepSORT', 'ByteTrack']: tracker.load_weights_sde(cfg.det_weights, cfg.reid_weights) else: tracker.load_weights_jde(cfg.weights) # inference tracker.mot_evaluate( data_root=data_root, seqs=seqs, data_type=cfg.metric.lower(), model_type=cfg.architecture, output_dir=FLAGS.output_dir, save_images=FLAGS.save_images, save_videos=FLAGS.save_videos, show_image=FLAGS.show_image, scaled=FLAGS.scaled, det_results_dir=FLAGS.det_results_dir) def main(): FLAGS = parse_args() cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) # disable npu in config by default if 'use_npu' not in cfg: cfg.use_npu = False # disable xpu in config by default if 'use_xpu' not in cfg: cfg.use_xpu = False if 'use_gpu' not in cfg: cfg.use_gpu = False # disable mlu in config by default if 'use_mlu' not in cfg: cfg.use_mlu = False if cfg.use_gpu: place = paddle.set_device('gpu') elif cfg.use_npu: place = paddle.set_device('npu') elif cfg.use_xpu: place = paddle.set_device('xpu') elif cfg.use_mlu: place = paddle.set_device('mlu') else: place = paddle.set_device('cpu') check_config(cfg) check_gpu(cfg.use_gpu) check_npu(cfg.use_npu) check_xpu(cfg.use_xpu) check_mlu(cfg.use_mlu) check_version() run(FLAGS, cfg) if __name__ == '__main__': main() ================================================ FILE: tools/export_model.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore warning log import warnings warnings.filterwarnings('ignore') import paddle from ppdet.core.workspace import load_config, merge_config from ppdet.utils.check import check_gpu, check_version, check_config from ppdet.utils.cli import ArgsParser from ppdet.engine import Trainer from ppdet.engine.trainer_ssod import Trainer_ARSL from ppdet.slim import build_slim_model from ppdet.utils.logger import setup_logger logger = setup_logger('export_model') def parse_args(): parser = ArgsParser() parser.add_argument( "--output_dir", type=str, default="output_inference", help="Directory for storing the output model files.") parser.add_argument( "--export_serving_model", type=bool, default=False, help="Whether to export serving model or not.") parser.add_argument( "--slim_config", default=None, type=str, help="Configuration file of slim method.") parser.add_argument("--for_fd", action='store_true') args = parser.parse_args() return args def run(FLAGS, cfg): ssod_method = cfg.get('ssod_method', None) if ssod_method is not None and ssod_method == 'ARSL': trainer = Trainer_ARSL(cfg, mode='test') trainer.load_weights(cfg.weights, ARSL_eval=True) # build detector else: trainer = Trainer(cfg, mode='test') # load weights if cfg.architecture in ['DeepSORT', 'ByteTrack']: trainer.load_weights_sde(cfg.det_weights, cfg.reid_weights) else: trainer.load_weights(cfg.weights) # export model trainer.export(FLAGS.output_dir, for_fd=FLAGS.for_fd) if FLAGS.export_serving_model: assert not FLAGS.for_fd from paddle_serving_client.io import inference_model_to_serving model_name = os.path.splitext(os.path.split(cfg.filename)[-1])[0] inference_model_to_serving( dirname="{}/{}".format(FLAGS.output_dir, model_name), serving_server="{}/{}/serving_server".format(FLAGS.output_dir, model_name), serving_client="{}/{}/serving_client".format(FLAGS.output_dir, model_name), model_filename="model.pdmodel", params_filename="model.pdiparams") def main(): paddle.set_device("cpu") FLAGS = parse_args() cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) if FLAGS.slim_config: cfg = build_slim_model(cfg, FLAGS.slim_config, mode='test') # FIXME: Temporarily solve the priority problem of FLAGS.opt merge_config(FLAGS.opt) check_config(cfg) if 'use_gpu' not in cfg: cfg.use_gpu = False check_gpu(cfg.use_gpu) check_version() run(FLAGS, cfg) if __name__ == '__main__': main() ================================================ FILE: tools/gen_semi_coco.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
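For the common case (no SSOD, no tracking weights, no serving export), tools/export_model.py above amounts to the following; the config path is hypothetical:

import paddle
from ppdet.core.workspace import load_config
from ppdet.engine import Trainer

paddle.set_device('cpu')  # export_model.py pins export to CPU
cfg = load_config('configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml')
trainer = Trainer(cfg, mode='test')
trainer.load_weights(cfg.weights)
trainer.export('output_inference')
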
import os import json import argparse import numpy as np def save_json(path, images, annotations, categories): new_json = { 'images': images, 'annotations': annotations, 'categories': categories, } with open(path, 'w') as f: json.dump(new_json, f) print('{} saved, with {} images and {} annotations.'.format( path, len(images), len(annotations))) def gen_semi_data(data_dir, json_file, percent=10.0, seed=1, seed_offset=0, txt_file=None): json_name = json_file.split('/')[-1].split('.')[0] json_file = os.path.join(data_dir, json_file) anno = json.load(open(json_file, 'r')) categories = anno['categories'] all_images = anno['images'] all_anns = anno['annotations'] print( 'Totally {} images and {} annotations, about {} gts per image.'.format( len(all_images), len(all_anns), len(all_anns) / len(all_images))) if txt_file: print('Using percent {} and seed {}.'.format(percent, seed)) txt_file = os.path.join(data_dir, txt_file) sup_idx = json.load(open(txt_file, 'r'))[str(percent)][str(seed)] # max(sup_idx) = 117262 # 10%, sup_idx is not image_id else: np.random.seed(seed + seed_offset) sup_len = int(percent / 100.0 * len(all_images)) sup_idx = np.random.choice( range(len(all_images)), size=sup_len, replace=False) labeled_images, labeled_anns = [], [] labeled_im_ids = [] unlabeled_images, unlabeled_anns = [], [] for i in range(len(all_images)): if i in sup_idx: labeled_im_ids.append(all_images[i]['id']) labeled_images.append(all_images[i]) else: unlabeled_images.append(all_images[i]) for an in all_anns: im_id = an['image_id'] if im_id in labeled_im_ids: labeled_anns.append(an) else: continue save_path = '{}/{}'.format(data_dir, 'semi_annotations') if not os.path.exists(save_path): os.mkdir(save_path) sup_name = '{}.{}@{}.json'.format(json_name, seed, int(percent)) sup_path = os.path.join(save_path, sup_name) save_json(sup_path, labeled_images, labeled_anns, categories) unsup_name = '{}.{}@{}-unlabeled.json'.format(json_name, seed, int(percent)) unsup_path = os.path.join(save_path, unsup_name) save_json(unsup_path, unlabeled_images, unlabeled_anns, categories) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--data_dir', type=str, default='./dataset/coco') parser.add_argument( '--json_file', type=str, default='annotations/instances_train2017.json') parser.add_argument('--percent', type=float, default=10.0) parser.add_argument('--seed', type=int, default=1) parser.add_argument('--seed_offset', type=int, default=0) parser.add_argument('--txt_file', type=str, default='COCO_supervision.txt') args = parser.parse_args() print(args) gen_semi_data(args.data_dir, args.json_file, args.percent, args.seed, args.seed_offset, args.txt_file) ================================================ FILE: tools/infer.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
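A hedged example of calling gen_semi_data above directly for a 10% random split, bypassing COCO_supervision.txt; assumes tools/ is importable:

from tools.gen_semi_coco import gen_semi_data

gen_semi_data(
    data_dir='./dataset/coco',
    json_file='annotations/instances_train2017.json',
    percent=10.0, seed=1,
    txt_file=None)  # txt_file=None -> random split via np.random
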
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore warning log import warnings warnings.filterwarnings('ignore') import glob import ast import paddle from ppdet.core.workspace import create, load_config, merge_config from ppdet.engine import Trainer, Trainer_ARSL from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config from ppdet.utils.cli import ArgsParser, merge_args from ppdet.slim import build_slim_model from ppdet.utils.logger import setup_logger logger = setup_logger('train') def parse_args(): parser = ArgsParser() parser.add_argument( "--infer_dir", type=str, default=None, help="Directory for images to perform inference on.") parser.add_argument( "--infer_list", type=str, default=None, help="The file path containing path of image to be infered. Valid only when --infer_dir is given." ) parser.add_argument( "--infer_img", type=str, default=None, help="Image path, has higher priority over --infer_dir") parser.add_argument( "--output_dir", type=str, default="output", help="Directory for storing the output visualization files.") parser.add_argument( "--draw_threshold", type=float, default=0.5, help="Threshold to reserve the result for visualization.") parser.add_argument( "--save_threshold", type=float, default=0.5, help="Threshold to reserve the result for saving.") parser.add_argument( "--slim_config", default=None, type=str, help="Configuration file of slim method.") parser.add_argument( "--use_vdl", type=bool, default=False, help="Whether to record the data to VisualDL.") parser.add_argument( "--do_eval", type=ast.literal_eval, default=False, help="Whether to eval after infer.") parser.add_argument( '--vdl_log_dir', type=str, default="vdl_log_dir/image", help='VisualDL logging directory for image.') parser.add_argument( "--save_results", type=bool, default=False, help="Whether to save inference results to output_dir.") parser.add_argument( "--slice_infer", action='store_true', help="Whether to slice the image and merge the inference results for small object detection." ) parser.add_argument( '--slice_size', nargs='+', type=int, default=[640, 640], help="Height of the sliced image.") parser.add_argument( "--overlap_ratio", nargs='+', type=float, default=[0.25, 0.25], help="Overlap height ratio of the sliced image.") parser.add_argument( "--combine_method", type=str, default='nms', help="Combine method of the sliced images' detection results, choose in ['nms', 'nmm', 'concat']." 
) parser.add_argument( "--match_threshold", type=float, default=0.6, help="Combine method matching threshold.") parser.add_argument( "--match_metric", type=str, default='ios', help="Combine method matching metric, choose in ['iou', 'ios'].") parser.add_argument( "--visualize", type=ast.literal_eval, default=True, help="Whether to save visualize results to output_dir.") parser.add_argument( "--rtn_im_file", type=bool, default=False, help="Whether to return image file path in Dataloader.") args = parser.parse_args() return args def get_test_images(infer_dir, infer_img, infer_list=None): """ Get image path list in TEST mode """ assert infer_img is not None or infer_dir is not None, \ "--infer_img or --infer_dir should be set" assert infer_img is None or os.path.isfile(infer_img), \ "{} is not a file".format(infer_img) assert infer_dir is None or os.path.isdir(infer_dir), \ "{} is not a directory".format(infer_dir) # infer_img has a higher priority if infer_img and os.path.isfile(infer_img): return [infer_img] images = set() infer_dir = os.path.abspath(infer_dir) assert os.path.isdir(infer_dir), \ "infer_dir {} is not a directory".format(infer_dir) if infer_list: assert os.path.isfile( infer_list), f"infer_list {infer_list} is not a valid file path." with open(infer_list, 'r') as f: lines = f.readlines() for line in lines: images.update([os.path.join(infer_dir, line.strip())]) else: exts = ['jpg', 'jpeg', 'png', 'bmp'] exts += [ext.upper() for ext in exts] for ext in exts: images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) images = list(images) assert len(images) > 0, "no image found in {}".format(infer_dir) logger.info("Found {} inference images in total.".format(len(images))) return images def run(FLAGS, cfg): if FLAGS.rtn_im_file: cfg['TestReader']['sample_transforms'][0]['Decode'][ 'rtn_im_file'] = FLAGS.rtn_im_file ssod_method = cfg.get('ssod_method', None) if ssod_method == 'ARSL': trainer = Trainer_ARSL(cfg, mode='test') trainer.load_weights(cfg.weights, ARSL_eval=True) else: trainer = Trainer(cfg, mode='test') trainer.load_weights(cfg.weights) # get inference images if FLAGS.do_eval: dataset = create('TestDataset')() images = dataset.get_images() else: images = get_test_images(FLAGS.infer_dir, FLAGS.infer_img, FLAGS.infer_list) # inference if FLAGS.slice_infer: trainer.slice_predict( images, slice_size=FLAGS.slice_size, overlap_ratio=FLAGS.overlap_ratio, combine_method=FLAGS.combine_method, match_threshold=FLAGS.match_threshold, match_metric=FLAGS.match_metric, draw_threshold=FLAGS.draw_threshold, output_dir=FLAGS.output_dir, save_results=FLAGS.save_results, visualize=FLAGS.visualize) else: trainer.predict( images, draw_threshold=FLAGS.draw_threshold, output_dir=FLAGS.output_dir, save_results=FLAGS.save_results, visualize=FLAGS.visualize, save_threshold=FLAGS.save_threshold, do_eval=FLAGS.do_eval) def main(): FLAGS = parse_args() cfg = load_config(FLAGS.config) merge_args(cfg, FLAGS) merge_config(FLAGS.opt) # disable npu in config by default if 'use_npu' not in cfg: cfg.use_npu = False # disable xpu in config by default if 'use_xpu' not in cfg: cfg.use_xpu = False if 'use_gpu' not in cfg: cfg.use_gpu = False # disable mlu in config by default if 'use_mlu' not in cfg: cfg.use_mlu = False if cfg.use_gpu: place = paddle.set_device('gpu') elif cfg.use_npu: place = paddle.set_device('npu') elif cfg.use_xpu: place = paddle.set_device('xpu') elif cfg.use_mlu: place = paddle.set_device('mlu') else: place = paddle.set_device('cpu') if FLAGS.slim_config: cfg = build_slim_model(cfg, 
FLAGS.slim_config, mode='test') check_config(cfg) check_gpu(cfg.use_gpu) check_npu(cfg.use_npu) check_xpu(cfg.use_xpu) check_mlu(cfg.use_mlu) check_version() run(FLAGS, cfg) if __name__ == '__main__': main() ================================================ FILE: tools/infer_culane.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore warning log import warnings warnings.filterwarnings('ignore') import glob import ast import paddle from ppdet.core.workspace import load_config, merge_config from ppdet.engine import Trainer from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config from ppdet.utils.cli import ArgsParser, merge_args from ppdet.slim import build_slim_model from ppdet.utils.logger import setup_logger logger = setup_logger('train') def parse_args(): parser = ArgsParser() parser.add_argument( "--infer_dir", type=str, default=None, help="Directory for images to perform inference on.") parser.add_argument( "--infer_img", type=str, default=None, help="Image path, has higher priority over --infer_dir") parser.add_argument( "--output_dir", type=str, default="output", help="Directory for storing the output visualization files.") parser.add_argument( "--save_results", type=bool, default=False, help="Whether to save inference results to output_dir.") parser.add_argument( "--visualize", type=ast.literal_eval, default=True, help="Whether to save visualize results to output_dir.") args = parser.parse_args() return args def get_test_images(infer_dir, infer_img): """ Get image path list in TEST mode """ assert infer_img is not None or infer_dir is not None, \ "--infer_img or --infer_dir should be set" assert infer_img is None or os.path.isfile(infer_img), \ "{} is not a file".format(infer_img) assert infer_dir is None or os.path.isdir(infer_dir), \ "{} is not a directory".format(infer_dir) # infer_img has a higher priority if infer_img and os.path.isfile(infer_img): return [infer_img] images = set() infer_dir = os.path.abspath(infer_dir) assert os.path.isdir(infer_dir), \ "infer_dir {} is not a directory".format(infer_dir) exts = ['jpg', 'jpeg', 'png', 'bmp'] exts += [ext.upper() for ext in exts] for ext in exts: images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) images = list(images) assert len(images) > 0, "no image found in {}".format(infer_dir) logger.info("Found {} inference images in total.".format(len(images))) return images def run(FLAGS, cfg): # build trainer trainer = Trainer(cfg, mode='test') # load weights trainer.load_weights(cfg.weights) # get inference images images = get_test_images(FLAGS.infer_dir, FLAGS.infer_img) trainer.predict_culane( images, 
output_dir=FLAGS.output_dir, save_results=FLAGS.save_results, visualize=FLAGS.visualize) def main(): FLAGS = parse_args() cfg = load_config(FLAGS.config) merge_args(cfg, FLAGS) merge_config(FLAGS.opt) # disable npu in config by default if 'use_npu' not in cfg: cfg.use_npu = False # disable xpu in config by default if 'use_xpu' not in cfg: cfg.use_xpu = False if 'use_gpu' not in cfg: cfg.use_gpu = False # disable mlu in config by default if 'use_mlu' not in cfg: cfg.use_mlu = False if cfg.use_gpu: place = paddle.set_device('gpu') elif cfg.use_npu: place = paddle.set_device('npu') elif cfg.use_xpu: place = paddle.set_device('xpu') elif cfg.use_mlu: place = paddle.set_device('mlu') else: place = paddle.set_device('cpu') check_config(cfg) check_gpu(cfg.use_gpu) check_npu(cfg.use_npu) check_xpu(cfg.use_xpu) check_mlu(cfg.use_mlu) check_version() run(FLAGS, cfg) if __name__ == '__main__': main() ================================================ FILE: tools/infer_mot.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore warning log import warnings warnings.filterwarnings('ignore') import paddle from ppdet.core.workspace import load_config, merge_config from ppdet.engine import Tracker from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config from ppdet.utils.cli import ArgsParser def parse_args(): parser = ArgsParser() parser.add_argument( '--video_file', type=str, default=None, help='Video name for tracking.') parser.add_argument( '--frame_rate', type=int, default=-1, help='Video frame rate for tracking.') parser.add_argument( "--image_dir", type=str, default=None, help="Directory for images to perform inference on.") parser.add_argument( "--det_results_dir", type=str, default='', help="Directory name for detection results.") parser.add_argument( '--output_dir', type=str, default='output', help='Directory name for output tracking results.') parser.add_argument( '--save_images', action='store_true', help='Save tracking results (image).') parser.add_argument( '--save_videos', action='store_true', help='Save tracking results (video).') parser.add_argument( '--show_image', action='store_true', help='Show tracking results (image).') parser.add_argument( '--scaled', type=bool, default=False, help="Whether coords after detector outputs are scaled, False in JDE YOLOv3 " "True in general detector.") parser.add_argument( "--draw_threshold", type=float, default=0.5, help="Threshold to reserve the result for visualization.") args = parser.parse_args() return args def run(FLAGS, cfg): # build Tracker tracker = Tracker(cfg, mode='test') # load weights if cfg.architecture in ['DeepSORT', 
'ByteTrack']: tracker.load_weights_sde(cfg.det_weights, cfg.reid_weights) else: tracker.load_weights_jde(cfg.weights) # inference tracker.mot_predict_seq( video_file=FLAGS.video_file, frame_rate=FLAGS.frame_rate, image_dir=FLAGS.image_dir, data_type=cfg.metric.lower(), model_type=cfg.architecture, output_dir=FLAGS.output_dir, save_images=FLAGS.save_images, save_videos=FLAGS.save_videos, show_image=FLAGS.show_image, scaled=FLAGS.scaled, det_results_dir=FLAGS.det_results_dir, draw_threshold=FLAGS.draw_threshold) def main(): FLAGS = parse_args() cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) # disable npu in config by default if 'use_npu' not in cfg: cfg.use_npu = False # disable xpu in config by default if 'use_xpu' not in cfg: cfg.use_xpu = False if 'use_gpu' not in cfg: cfg.use_gpu = False # disable mlu in config by default if 'use_mlu' not in cfg: cfg.use_mlu = False if cfg.use_gpu: place = paddle.set_device('gpu') elif cfg.use_npu: place = paddle.set_device('npu') elif cfg.use_xpu: place = paddle.set_device('xpu') elif cfg.use_mlu: place = paddle.set_device('mlu') else: place = paddle.set_device('cpu') check_config(cfg) check_gpu(cfg.use_gpu) check_npu(cfg.use_npu) check_xpu(cfg.use_xpu) check_mlu(cfg.use_mlu) check_version() run(FLAGS, cfg) if __name__ == '__main__': main() ================================================ FILE: tools/post_quant.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
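# Editor's note (usage sketch): this tool builds an eval-mode Trainer, loads
# trained weights, then exports a quantized inference model through
# Trainer.post_quant(FLAGS.output_dir). An illustrative invocation follows;
# the slim config path is hypothetical, since this repo's configs/ tree may
# not ship a PTQ yaml:
#   python tools/post_quant.py -c configs/rtdetrv3/rtdetrv3_r50vd_6x_coco.yml \
#       --slim_config <path/to/ptq_slim_config>.yml --output_dir output_inference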
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore warning log import warnings warnings.filterwarnings('ignore') import paddle from ppdet.core.workspace import load_config, merge_config from ppdet.utils.check import check_gpu, check_version, check_config from ppdet.utils.cli import ArgsParser from ppdet.engine import Trainer from ppdet.slim import build_slim_model from ppdet.utils.logger import setup_logger logger = setup_logger('post_quant') def parse_args(): parser = ArgsParser() parser.add_argument( "--output_dir", type=str, default="output_inference", help="Directory for storing the output model files.") parser.add_argument( "--slim_config", default=None, type=str, help="Configuration file of slim method.") args = parser.parse_args() return args def run(FLAGS, cfg): # build detector trainer = Trainer(cfg, mode='eval') # load weights if cfg.architecture in ['DeepSORT']: if cfg.det_weights != 'None': trainer.load_weights_sde(cfg.det_weights, cfg.reid_weights) else: trainer.load_weights_sde(None, cfg.reid_weights) else: trainer.load_weights(cfg.weights) # post quant model trainer.post_quant(FLAGS.output_dir) def main(): FLAGS = parse_args() cfg = load_config(FLAGS.config) # TODO: to be refined in the future if 'norm_type' in cfg and cfg['norm_type'] == 'sync_bn': FLAGS.opt['norm_type'] = 'bn' merge_config(FLAGS.opt) if FLAGS.slim_config: cfg = build_slim_model(cfg, FLAGS.slim_config, mode='test') # FIXME: Temporarily solve the priority problem of FLAGS.opt merge_config(FLAGS.opt) check_config(cfg) if 'use_gpu' not in cfg: cfg.use_gpu = False check_gpu(cfg.use_gpu) check_version() run(FLAGS, cfg) if __name__ == '__main__': main() ================================================ FILE: tools/slice_image.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
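# Editor's note (usage sketch): this tool slices a COCO-format dataset into
# overlapping chips via the external `sahi` package
# (https://github.com/obss/sahi). Illustrative invocation; the dataset paths
# are placeholders, not files shipped with this repo:
#   python tools/slice_image.py --image_dir <images_dir> \
#       --json_path <coco_annotations>.json --output_dir output \
#       --slice_size 640 --overlap_ratio 0.25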
import argparse
from tqdm import tqdm


def slice_data(image_dir, dataset_json_path, output_dir, slice_size,
               overlap_ratio):
    try:
        from sahi.scripts.slice_coco import slice
    except Exception as e:
        raise RuntimeError(
            'Unable to use sahi to slice images, please install sahi, '
            'for example: `pip install sahi`, see https://github.com/obss/sahi'
        ) from e
    tqdm.write(
        f"slicing for slice_size={slice_size}, overlap_ratio={overlap_ratio}")
    slice(
        image_dir=image_dir,
        dataset_json_path=dataset_json_path,
        output_dir=output_dir,
        slice_size=slice_size,
        overlap_ratio=overlap_ratio)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image_dir', type=str, default=None, help="The image folder path.")
    parser.add_argument(
        '--json_path', type=str, default=None, help="Dataset json path.")
    parser.add_argument(
        '--output_dir', type=str, default=None, help="Output dir.")
    parser.add_argument(
        '--slice_size', type=int, default=500, help="Sliced sub-image size.")
    parser.add_argument(
        '--overlap_ratio',
        type=float,
        default=0.25,
        help="Overlap ratio between adjacent slices.")
    args = parser.parse_args()

    slice_data(args.image_dir, args.json_path, args.output_dir,
               args.slice_size, args.overlap_ratio)


if __name__ == "__main__":
    main()


================================================
FILE: tools/sniper_params_stats.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import json
import logging
import numpy as np

from ppdet.utils.logger import setup_logger
logger = setup_logger('sniper_params_stats')


def get_default_params(architecture):
    """get_default_params"""
    if architecture == "FasterRCNN":
        anchor_range = np.array([64., 512.])  # for frcnn-fpn
        # anchor_range = np.array([16., 373.])  # for yolov3
        # anchor_range = np.array([32., 373.])  # for yolov3
        default_crop_size = 1536  # mod 32 for frcnn-fpn
        default_max_bbox_size = 352
    elif architecture == "YOLOv3":
        anchor_range = np.array([32., 373.])  # for yolov3
        default_crop_size = 800  # mod 32 for yolov3
        default_max_bbox_size = 352
    else:
        raise NotImplementedError

    return anchor_range, default_crop_size, default_max_bbox_size


def get_box_ratios(anno_file):
    """
    get_box_ratios
    :param anno_file: coco annotation file
    :return: box_ratios: (box_long_size / pic_long_size)
    """
    with open(anno_file) as f:
        coco_dict = json.load(f)
    image_list = coco_dict['images']
    anno_list = coco_dict['annotations']

    image_id2hw = {}
    for im_dict in image_list:
        im_id = im_dict['id']
        h, w = im_dict['height'], im_dict['width']
        image_id2hw[im_id] = (h, w)

    box_ratios = []
    for a_dict in anno_list:
        im_id = a_dict['image_id']
        im_h, im_w = image_id2hw[im_id]
        bbox = a_dict['bbox']
        x1, y1, w, h = bbox
        pic_long = max(im_h, im_w)
        box_long = max(w, h)
        box_ratios.append(box_long / pic_long)

    return np.array(box_ratios)


def get_target_size_and_valid_box_ratios(anchor_range, box_ratio_p2,
                                         box_ratio_p98):
    """get_target_size_and_valid_box_ratios"""
    anchor_better_low, anchor_better_high = anchor_range  # (60., 512.)
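    # Descriptive note: the anchor range and the observed box-size ratios are
    # compared on a log10 scale below; the ratio range is cut into
    # ceil(box_ratio_log_range / anchor_log_range) windows so that each window
    # of box sizes can be rescaled to land inside the favorable anchor range.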
anchor_center = np.sqrt(anchor_better_high * anchor_better_low) anchor_log_range = np.log10(anchor_better_high) - np.log10(anchor_better_low) box_ratio_log_range = np.log10(box_ratio_p98) - np.log10(box_ratio_p2) logger.info("anchor_log_range:{}, box_ratio_log_range:{}".format(anchor_log_range, box_ratio_log_range)) box_cut_num = int(np.ceil(box_ratio_log_range / anchor_log_range)) box_ratio_log_window = box_ratio_log_range / box_cut_num logger.info("box_cut_num:{}, box_ratio_log_window:{}".format(box_cut_num, box_ratio_log_window)) image_target_sizes = [] valid_ratios = [] for i in range(box_cut_num): # # method1: align center # box_ratio_log_center = np.log10(p2) + 0.5 * box_ratio_log_window + i * box_ratio_log_window # box_ratio_center = np.power(10, box_ratio_log_center) # scale = anchor_center / box_ratio_center # method2: align left low box_ratio_low = np.power(10, np.log10(box_ratio_p2) + i * box_ratio_log_window) image_target_size = anchor_better_low / box_ratio_low image_target_sizes.append(int(image_target_size)) valid_ratio = anchor_range / image_target_size valid_ratios.append(valid_ratio.tolist()) logger.info("Box cut {}".format(i)) logger.info("box_ratio_low: {}".format(box_ratio_low)) logger.info("image_target_size: {}".format(image_target_size)) logger.info("valid_ratio: {}".format(valid_ratio)) return image_target_sizes, valid_ratios def get_valid_ranges(valid_ratios): """ get_valid_box_ratios_range :param valid_ratios: :return: """ valid_ranges = [] if len(valid_ratios) == 1: valid_ranges.append([-1, -1]) else: for i, vratio in enumerate(valid_ratios): if i == 0: valid_ranges.append([-1, vratio[1]]) elif i == len(valid_ratios) - 1: valid_ranges.append([vratio[0], -1]) else: valid_ranges.append(vratio) return valid_ranges def get_percentile(a_array, low_percent, high_percent): """ get_percentile :param low_percent: :param high_percent: :return: """ array_p0 = min(a_array) array_p100 = max(a_array) array_plow = np.percentile(a_array, low_percent) array_phigh = np.percentile(a_array, high_percent) logger.info( "array_percentile(0): {},array_percentile low({}): {}, " "array_percentile high({}): {}, array_percentile 100: {}".format( array_p0, low_percent, array_plow, high_percent, array_phigh, array_p100)) return array_plow, array_phigh def sniper_anno_stats(architecture, anno_file): """ sniper_anno_stats :param anno_file: :return: """ anchor_range, default_crop_size, default_max_bbox_size = get_default_params(architecture) box_ratios = get_box_ratios(anno_file) box_ratio_p8, box_ratio_p92 = get_percentile(box_ratios, 8, 92) image_target_sizes, valid_box_ratios = get_target_size_and_valid_box_ratios(anchor_range, box_ratio_p8, box_ratio_p92) valid_ranges = get_valid_ranges(valid_box_ratios) crop_size = min(default_crop_size, min([item for item in image_target_sizes])) crop_size = int(np.ceil(crop_size / 32.) * 32.) 
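    # Descriptive note: this stride leaves at most default_max_bbox_size
    # pixels of overlap between neighboring chips, so boxes up to that size
    # are never split by a chip border.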
crop_stride = max(min(default_max_bbox_size, crop_size), crop_size - default_max_bbox_size) logger.info("Result".center(100, '-')) logger.info("image_target_sizes: {}".format(image_target_sizes)) logger.info("valid_box_ratio_ranges: {}".format(valid_ranges)) logger.info("chip_target_size: {}, chip_target_stride: {}".format(crop_size, crop_stride)) return { "image_target_sizes": image_target_sizes, "valid_box_ratio_ranges": valid_ranges, "chip_target_size": crop_size, "chip_target_stride": crop_stride } if __name__=="__main__": architecture, anno_file = sys.argv[1], sys.argv[2] sniper_anno_stats(architecture, anno_file) ================================================ FILE: tools/train.py ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys # add python path of PaddleDetection to sys.path parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) sys.path.insert(0, parent_path) # ignore warning log import warnings warnings.filterwarnings('ignore') import cv2 cv2.setNumThreads(0) cv2.ocl.setUseOpenCL(False) import paddle from ppdet.core.workspace import load_config, merge_config from ppdet.engine import Trainer, TrainerCot, init_parallel_env, set_random_seed, init_fleet_env from ppdet.engine.trainer_ssod import Trainer_DenseTeacher, Trainer_ARSL, Trainer_Semi_RTDETR from ppdet.slim import build_slim_model from ppdet.utils.cli import ArgsParser, merge_args import ppdet.utils.check as check from ppdet.utils.logger import setup_logger logger = setup_logger('train') def parse_args(): parser = ArgsParser() parser.add_argument( "--eval", action='store_true', default=False, help="Whether to perform evaluation in train") parser.add_argument( "-r", "--resume", default=None, help="weights path for resume") parser.add_argument( "--slim_config", default=None, type=str, help="Configuration file of slim method.") parser.add_argument( "--enable_ce", type=bool, default=False, help="If set True, enable continuous evaluation job." 
"This flag is only used for internal test.") parser.add_argument( "--amp", action='store_true', default=False, help="Enable auto mixed precision training.") parser.add_argument( "--fleet", action='store_true', default=False, help="Use fleet or not") parser.add_argument( "--use_vdl", type=bool, default=False, help="whether to record the data to VisualDL.") parser.add_argument( '--vdl_log_dir', type=str, default="vdl_log_dir/scalar", help='VisualDL logging directory for scalar.') parser.add_argument( "--use_wandb", type=bool, default=False, help="whether to record the data to wandb.") parser.add_argument( '--save_prediction_only', action='store_true', default=False, help='Whether to save the evaluation results only') parser.add_argument( '--profiler_options', type=str, default=None, help="The option of profiler, which should be in " "format \"key1=value1;key2=value2;key3=value3\"." "please see ppdet/utils/profiler.py for detail.") parser.add_argument( '--save_proposals', action='store_true', default=False, help='Whether to save the train proposals') parser.add_argument( '--proposals_path', type=str, default="sniper/proposals.json", help='Train proposals directory') parser.add_argument( "--to_static", action='store_true', default=False, help="Enable dy2st to train.") args = parser.parse_args() return args def run(FLAGS, cfg): # init fleet environment if cfg.fleet: init_fleet_env(cfg.get('find_unused_parameters', False)) else: # init parallel environment if nranks > 1 init_parallel_env() if FLAGS.enable_ce: set_random_seed(0) # build trainer ssod_method = cfg.get('ssod_method', None) if ssod_method is not None: if ssod_method == 'DenseTeacher': trainer = Trainer_DenseTeacher(cfg, mode='train') elif ssod_method == 'ARSL': trainer = Trainer_ARSL(cfg, mode='train') elif ssod_method == 'Semi_RTDETR': trainer = Trainer_Semi_RTDETR(cfg, mode='train') else: raise ValueError( "Semi-Supervised Object Detection only no support this method.") elif cfg.get('use_cot', False): trainer = TrainerCot(cfg, mode='train') else: trainer = Trainer(cfg, mode='train') # load weights if FLAGS.resume is not None: trainer.resume_weights(FLAGS.resume) elif 'pretrain_student_weights' in cfg and 'pretrain_teacher_weights' in cfg \ and cfg.pretrain_teacher_weights and cfg.pretrain_student_weights: trainer.load_semi_weights(cfg.pretrain_teacher_weights, cfg.pretrain_student_weights) elif 'pretrain_weights' in cfg and cfg.pretrain_weights: trainer.load_weights(cfg.pretrain_weights) # training trainer.train(FLAGS.eval) def main(): FLAGS = parse_args() cfg = load_config(FLAGS.config) merge_args(cfg, FLAGS) merge_config(FLAGS.opt) # disable npu in config by default if 'use_npu' not in cfg: cfg.use_npu = False # disable xpu in config by default if 'use_xpu' not in cfg: cfg.use_xpu = False if 'use_gpu' not in cfg: cfg.use_gpu = False # disable mlu in config by default if 'use_mlu' not in cfg: cfg.use_mlu = False if cfg.use_gpu: place = paddle.set_device('gpu') elif cfg.use_npu: place = paddle.set_device('npu') elif cfg.use_xpu: place = paddle.set_device('xpu') elif cfg.use_mlu: place = paddle.set_device('mlu') else: place = paddle.set_device('cpu') if FLAGS.slim_config: cfg = build_slim_model(cfg, FLAGS.slim_config) # FIXME: Temporarily solve the priority problem of FLAGS.opt merge_config(FLAGS.opt) check.check_config(cfg) check.check_gpu(cfg.use_gpu) check.check_npu(cfg.use_npu) check.check_xpu(cfg.use_xpu) check.check_mlu(cfg.use_mlu) check.check_version() run(FLAGS, cfg) if __name__ == "__main__": main() 
================================================
FILE: tools/x2coco.py
================================================
#!/usr/bin/env python
# coding: utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import glob
import json
import os
import os.path as osp
import shutil
import xml.etree.ElementTree as ET

import numpy as np
# Import PIL.Image explicitly: get_bbox() below uses PIL.Image.fromarray,
# which previously only worked through PIL.ImageDraw's transitive import.
import PIL.Image
import PIL.ImageDraw
from tqdm import tqdm
import cv2

label_to_num = {}
categories_list = []
labels_list = []


class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(MyEncoder, self).default(obj)


def images_labelme(data, num):
    image = {}
    image['height'] = data['imageHeight']
    image['width'] = data['imageWidth']
    image['id'] = num + 1
    if '\\' in data['imagePath']:
        image['file_name'] = data['imagePath'].split('\\')[-1]
    else:
        image['file_name'] = data['imagePath'].split('/')[-1]
    return image


def images_cityscape(data, num, img_file):
    image = {}
    image['height'] = data['imgHeight']
    image['width'] = data['imgWidth']
    image['id'] = num + 1
    image['file_name'] = img_file
    return image


def categories(label, labels_list):
    category = {}
    category['supercategory'] = 'component'
    category['id'] = len(labels_list) + 1
    category['name'] = label
    return category


def annotations_rectangle(points, label, image_num, object_num, label_to_num):
    annotation = {}
    seg_points = np.asarray(points).copy()
    seg_points[1, :] = np.asarray(points)[2, :]
    seg_points[2, :] = np.asarray(points)[1, :]
    annotation['segmentation'] = [list(seg_points.flatten())]
    annotation['iscrowd'] = 0
    annotation['image_id'] = image_num + 1
    annotation['bbox'] = list(
        map(float, [
            points[0][0], points[0][1], points[1][0] - points[0][0],
            points[1][1] - points[0][1]
        ]))
    annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3]
    annotation['category_id'] = label_to_num[label]
    annotation['id'] = object_num + 1
    return annotation


def annotations_polygon(height, width, points, label, image_num, object_num,
                        label_to_num):
    annotation = {}
    annotation['segmentation'] = [list(np.asarray(points).flatten())]
    annotation['iscrowd'] = 0
    annotation['image_id'] = image_num + 1
    annotation['bbox'] = list(map(float, get_bbox(height, width, points)))
    annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3]
    annotation['category_id'] = label_to_num[label]
    annotation['id'] = object_num + 1
    return annotation


def get_bbox(height, width, points):
    polygons = points
    mask = np.zeros([height, width], dtype=np.uint8)
    mask = PIL.Image.fromarray(mask)
    xy = list(map(tuple, polygons))
    PIL.ImageDraw.Draw(mask).polygon(xy=xy, outline=1, fill=1)
    mask = np.array(mask, dtype=bool)
    index = np.argwhere(mask == 1)
    rows = index[:, 0]
    cols = index[:, 1]
    left_top_r = np.min(rows)
    left_top_c = np.min(cols)
    right_bottom_r = np.max(rows)
    right_bottom_c = np.max(cols)
    return [
        left_top_c, left_top_r, right_bottom_c - left_top_c,
right_bottom_r - left_top_r ] def deal_json(ds_type, img_path, json_path): data_coco = {} images_list = [] annotations_list = [] image_num = -1 object_num = -1 for img_file in os.listdir(img_path): img_label = os.path.splitext(img_file)[0] if img_file.split('.')[ -1] not in ['bmp', 'jpg', 'jpeg', 'png', 'JPEG', 'JPG', 'PNG']: continue label_file = osp.join(json_path, img_label + '.json') print('Generating dataset from:', label_file) image_num = image_num + 1 with open(label_file) as f: data = json.load(f) if ds_type == 'labelme': images_list.append(images_labelme(data, image_num)) elif ds_type == 'cityscape': images_list.append(images_cityscape(data, image_num, img_file)) if ds_type == 'labelme': for shapes in data['shapes']: object_num = object_num + 1 label = shapes['label'] if label not in labels_list: categories_list.append(categories(label, labels_list)) labels_list.append(label) label_to_num[label] = len(labels_list) p_type = shapes['shape_type'] if p_type == 'polygon': points = shapes['points'] annotations_list.append( annotations_polygon(data['imageHeight'], data[ 'imageWidth'], points, label, image_num, object_num, label_to_num)) if p_type == 'rectangle': (x1, y1), (x2, y2) = shapes['points'] x1, x2 = sorted([x1, x2]) y1, y2 = sorted([y1, y2]) points = [[x1, y1], [x2, y2], [x1, y2], [x2, y1]] annotations_list.append( annotations_rectangle(points, label, image_num, object_num, label_to_num)) elif ds_type == 'cityscape': for shapes in data['objects']: object_num = object_num + 1 label = shapes['label'] if label not in labels_list: categories_list.append(categories(label, labels_list)) labels_list.append(label) label_to_num[label] = len(labels_list) points = shapes['polygon'] annotations_list.append( annotations_polygon(data['imgHeight'], data[ 'imgWidth'], points, label, image_num, object_num, label_to_num)) data_coco['images'] = images_list data_coco['categories'] = categories_list data_coco['annotations'] = annotations_list return data_coco def voc_get_label_anno(ann_dir_path, ann_ids_path, labels_path): with open(labels_path, 'r') as f: labels_str = f.read().split() labels_ids = list(range(1, len(labels_str) + 1)) with open(ann_ids_path, 'r') as f: ann_ids = [lin.strip().split(' ')[-1] for lin in f.readlines()] ann_paths = [] for aid in ann_ids: if aid.endswith('xml'): ann_path = os.path.join(ann_dir_path, aid) else: ann_path = os.path.join(ann_dir_path, aid + '.xml') ann_paths.append(ann_path) return dict(zip(labels_str, labels_ids)), ann_paths def voc_get_image_info(annotation_root, im_id): filename = annotation_root.findtext('filename') assert filename is not None img_name = os.path.basename(filename) size = annotation_root.find('size') width = float(size.findtext('width')) height = float(size.findtext('height')) image_info = { 'file_name': filename, 'height': height, 'width': width, 'id': im_id } return image_info def voc_get_coco_annotation(obj, label2id): label = obj.findtext('name') assert label in label2id, "label is not in label2id." category_id = label2id[label] bndbox = obj.find('bndbox') xmin = float(bndbox.findtext('xmin')) ymin = float(bndbox.findtext('ymin')) xmax = float(bndbox.findtext('xmax')) ymax = float(bndbox.findtext('ymax')) assert xmax > xmin and ymax > ymin, "Box size error." 
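    # Convert the corner-format box (xmin, ymin, xmax, ymax) parsed above to
    # COCO's (x, y, width, height) layout.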
    o_width = xmax - xmin
    o_height = ymax - ymin
    anno = {
        'area': o_width * o_height,
        'iscrowd': 0,
        'bbox': [xmin, ymin, o_width, o_height],
        'category_id': category_id,
        'ignore': 0,
    }
    return anno


def voc_xmls_to_cocojson(annotation_paths, label2id, output_dir, output_file):
    output_json_dict = {
        "images": [],
        "type": "instances",
        "annotations": [],
        "categories": []
    }
    bnd_id = 1  # bounding box start id
    im_id = 0
    print('Start converting!')
    for a_path in tqdm(annotation_paths):
        # Read annotation xml
        ann_tree = ET.parse(a_path)
        ann_root = ann_tree.getroot()

        img_info = voc_get_image_info(ann_root, im_id)
        output_json_dict['images'].append(img_info)

        for obj in ann_root.findall('object'):
            ann = voc_get_coco_annotation(obj=obj, label2id=label2id)
            ann.update({'image_id': im_id, 'id': bnd_id})
            output_json_dict['annotations'].append(ann)
            bnd_id = bnd_id + 1
        im_id += 1

    for label, label_id in label2id.items():
        category_info = {
            'supercategory': 'none',
            'id': label_id,
            'name': label
        }
        output_json_dict['categories'].append(category_info)

    output_file = os.path.join(output_dir, output_file)
    with open(output_file, 'w') as f:
        output_json = json.dumps(output_json_dict)
        f.write(output_json)


def widerface_to_cocojson(root_path):
    train_gt_txt = os.path.join(root_path, "wider_face_split",
                                "wider_face_train_bbx_gt.txt")
    val_gt_txt = os.path.join(root_path, "wider_face_split",
                              "wider_face_val_bbx_gt.txt")
    train_img_dir = os.path.join(root_path, "WIDER_train", "images")
    val_img_dir = os.path.join(root_path, "WIDER_val", "images")
    # Check that the expected ground-truth files and image folders exist
    # (asserting the joined path strings themselves would never fail).
    assert os.path.exists(train_gt_txt)
    assert os.path.exists(val_gt_txt)
    assert os.path.exists(train_img_dir)
    assert os.path.exists(val_img_dir)
    save_path = os.path.join(root_path, "widerface_train.json")
    widerface_convert(train_gt_txt, train_img_dir, save_path)
    print("Wider Face train dataset converted successfully, the json path: {}".
          format(save_path))
    save_path = os.path.join(root_path, "widerface_val.json")
    widerface_convert(val_gt_txt, val_img_dir, save_path)
    print("Wider Face val dataset converted successfully, the json path: {}".
          format(save_path))


def widerface_convert(gt_txt, img_dir, save_path):
    output_json_dict = {
        "images": [],
        "type": "instances",
        "annotations": [],
        "categories": [{
            'supercategory': 'none',
            'id': 0,
            'name': "human_face"
        }]
    }
    bnd_id = 1  # bounding box start id
    im_id = 0
    print('Start converting!')
    with open(gt_txt) as fd:
        lines = fd.readlines()
    i = 0
    while i < len(lines):
        image_name = lines[i].strip()
        bbox_num = int(lines[i + 1].strip())
        i += 2
        img_info = get_widerface_image_info(img_dir, image_name, im_id)
        if img_info:
            output_json_dict["images"].append(img_info)
            for j in range(i, i + bbox_num):
                anno = get_widerface_ann_info(lines[j])
                anno.update({'image_id': im_id, 'id': bnd_id})
                output_json_dict['annotations'].append(anno)
                bnd_id += 1
        else:
            print("The image does not exist: {}".format(
                os.path.join(img_dir, image_name)))
        bbox_num = 1 if bbox_num == 0 else bbox_num
        i += bbox_num
        im_id += 1
    with open(save_path, 'w') as f:
        output_json = json.dumps(output_json_dict)
        f.write(output_json)


def get_widerface_image_info(img_root, img_relative_path, img_id):
    image_info = {}
    save_path = os.path.join(img_root, img_relative_path)
    if os.path.exists(save_path):
        img = cv2.imread(save_path)
        image_info["file_name"] = os.path.join(
            os.path.basename(os.path.dirname(img_root)),
            os.path.basename(img_root), img_relative_path)
        image_info["height"] = img.shape[0]
        image_info["width"] = img.shape[1]
        image_info["id"] = img_id
    return image_info


def get_widerface_ann_info(info):
    info = [int(x) for x in info.strip().split()]
    anno = {
        'area': info[2] * info[3],
        'iscrowd': 0,
        'bbox': [info[0], info[1], info[2], info[3]],
        'category_id': 0,
        'ignore': 0,
        'blur': info[4],
        'expression': info[5],
        'illumination': info[6],
        'invalid': info[7],
        'occlusion': info[8],
        'pose': info[9]
    }
    return anno


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset_type',
        help='the type of dataset, can be `voc`, `widerface`, `labelme` or `cityscape`')
    parser.add_argument('--json_input_dir', help='input annotated directory')
    parser.add_argument('--image_input_dir', help='image directory')
    parser.add_argument(
        '--output_dir', help='output dataset directory', default='./')
    parser.add_argument(
        '--train_proportion',
        help='the proportion of train dataset',
        type=float,
        default=1.0)
    parser.add_argument(
        '--val_proportion',
        help='the proportion of validation dataset',
        type=float,
        default=0.0)
    parser.add_argument(
        '--test_proportion',
        help='the proportion of test dataset',
        type=float,
        default=0.0)
    parser.add_argument(
        '--voc_anno_dir',
        help='In Voc format dataset, path to annotation files directory.',
        type=str,
        default=None)
    parser.add_argument(
        '--voc_anno_list',
        help='In Voc format dataset, path to annotation files ids list.',
        type=str,
        default=None)
    parser.add_argument(
        '--voc_label_list',
        help='In Voc format dataset, path to label list. The content of each '
        'line is a category.',
        type=str,
        default=None)
    parser.add_argument(
        '--voc_out_name',
        type=str,
        default='voc.json',
        help='In Voc format dataset, path to output json file')
    parser.add_argument(
        '--widerface_root_dir',
        help='The root_path for wider face dataset, which contains '
        '`wider_face_split`, `WIDER_train` and `WIDER_val`. '
        'The json files will be saved in this path.',
        type=str,
        default=None)
    args = parser.parse_args()
    try:
        assert args.dataset_type in ['voc', 'labelme', 'cityscape', 'widerface']
    except AssertionError as e:
        print(
            'Only the voc, labelme, cityscape and widerface dataset types are supported!')
        os._exit(0)
    if args.dataset_type == 'voc':
        assert args.voc_anno_dir and args.voc_anno_list and args.voc_label_list
        label2id, ann_paths = voc_get_label_anno(
            args.voc_anno_dir, args.voc_anno_list, args.voc_label_list)
        voc_xmls_to_cocojson(
            annotation_paths=ann_paths,
            label2id=label2id,
            output_dir=args.output_dir,
            output_file=args.voc_out_name)
    elif args.dataset_type == "widerface":
        assert args.widerface_root_dir
        widerface_to_cocojson(args.widerface_root_dir)
    else:
        try:
            assert os.path.exists(args.json_input_dir)
        except AssertionError as e:
            print('The json folder does not exist!')
            os._exit(0)
        try:
            assert os.path.exists(args.image_input_dir)
        except AssertionError as e:
            print('The image folder does not exist!')
            os._exit(0)
        try:
            assert abs(args.train_proportion + args.val_proportion \
                    + args.test_proportion - 1.0) < 1e-5
        except AssertionError as e:
            print(
                'The sum of the proportions of the training, validation and '
                'test datasets must be 1!')
            os._exit(0)

        # Allocate the dataset.
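        # Images are copied into train/val/test subfolders in directory-listing
        # order according to the requested proportions; the matching per-image
        # JSON annotations are then merged into COCO files per split below.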
total_num = len(glob.glob(osp.join(args.json_input_dir, '*.json'))) if args.train_proportion != 0: train_num = int(total_num * args.train_proportion) out_dir = args.output_dir + '/train' if not os.path.exists(out_dir): os.makedirs(out_dir) else: train_num = 0 if args.val_proportion == 0.0: val_num = 0 test_num = total_num - train_num out_dir = args.output_dir + '/test' if args.test_proportion != 0.0 and not os.path.exists(out_dir): os.makedirs(out_dir) else: val_num = int(total_num * args.val_proportion) test_num = total_num - train_num - val_num val_out_dir = args.output_dir + '/val' if not os.path.exists(val_out_dir): os.makedirs(val_out_dir) test_out_dir = args.output_dir + '/test' if args.test_proportion != 0.0 and not os.path.exists(test_out_dir): os.makedirs(test_out_dir) count = 1 for img_name in os.listdir(args.image_input_dir): if count <= train_num: if osp.exists(args.output_dir + '/train/'): shutil.copyfile( osp.join(args.image_input_dir, img_name), osp.join(args.output_dir + '/train/', img_name)) else: if count <= train_num + val_num: if osp.exists(args.output_dir + '/val/'): shutil.copyfile( osp.join(args.image_input_dir, img_name), osp.join(args.output_dir + '/val/', img_name)) else: if osp.exists(args.output_dir + '/test/'): shutil.copyfile( osp.join(args.image_input_dir, img_name), osp.join(args.output_dir + '/test/', img_name)) count = count + 1 # Deal with the json files. if not os.path.exists(args.output_dir + '/annotations'): os.makedirs(args.output_dir + '/annotations') if args.train_proportion != 0: train_data_coco = deal_json(args.dataset_type, args.output_dir + '/train', args.json_input_dir) train_json_path = osp.join(args.output_dir + '/annotations', 'instance_train.json') json.dump( train_data_coco, open(train_json_path, 'w'), indent=4, cls=MyEncoder) if args.val_proportion != 0: val_data_coco = deal_json(args.dataset_type, args.output_dir + '/val', args.json_input_dir) val_json_path = osp.join(args.output_dir + '/annotations', 'instance_val.json') json.dump( val_data_coco, open(val_json_path, 'w'), indent=4, cls=MyEncoder) if args.test_proportion != 0: test_data_coco = deal_json(args.dataset_type, args.output_dir + '/test', args.json_input_dir) test_json_path = osp.join(args.output_dir + '/annotations', 'instance_test.json') json.dump( test_data_coco, open(test_json_path, 'w'), indent=4, cls=MyEncoder) if __name__ == '__main__': main()
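Editor's note: a closing usage sketch for the VOC branch above, calling the file's own helpers directly. The paths are illustrative; the annotation-id list and label list follow the formats produced by dataset/voc/create_list.py and dataset/voc/label_list.txt in this repo.

    # Equivalent to:
    #   python tools/x2coco.py --dataset_type voc --voc_anno_dir <...> \
    #       --voc_anno_list <...> --voc_label_list <...> --voc_out_name voc_trainval.json
    label2id, ann_paths = voc_get_label_anno(
        ann_dir_path='dataset/voc/VOCdevkit/VOC2012/Annotations',
        ann_ids_path='dataset/voc/trainval.txt',
        labels_path='dataset/voc/label_list.txt')
    voc_xmls_to_cocojson(
        annotation_paths=ann_paths,
        label2id=label2id,
        output_dir='dataset/voc',
        output_file='voc_trainval.json')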